import requests
import json
from openpyxl import Workbook
import time
import hashlib
import os
import datetime
# Feed endpoint; the pagination cursor (max_behot_time) is appended at request time.
start_url = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='
url = 'https://www.toutiao.com'
headers={
'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
cookies = {'tt_webid':'6649949084894053895'} # copy this cookie from a browser session to avoid Toutiao's anti-crawler block
max_behot_time = '0' # URL parameter: pagination cursor for the feed
title = [] # collected news headlines
source_url = [] # collected news links (may be relative paths)
s_url = [] # collected full (absolute) news links
source = [] # collected publisher (official-account) names
media_url = {} # maps publisher name -> full publisher page URL
def get_as_cp(timestamp=None):
    """Compute the 'as' and 'cp' anti-crawler query parameters.

    Re-implements the signing routine from Toutiao's home_4abea46.js:
    both values interleave characters of the hex-encoded Unix time with
    characters of the uppercase MD5 digest of that same time.

    Args:
        timestamp: Unix time in seconds; defaults to the current time.
            (Optional parameter added for determinism/testability;
            omit it in normal use.)

    Returns:
        dict with keys 'as' and 'cp' (each a 15-char string), or the
        fixed fallback pair mirrored from the original JS when the hex
        timestamp is not exactly 8 characters long (never the case for
        present-day clock values).
    """
    now = int(round(time.time() if timestamp is None else timestamp))
    hex_time = hex(now).upper()[2:]  # hex timestamp without the '0x' prefix
    digest = hashlib.md5(str(now).encode('utf-8')).hexdigest().upper()
    if len(hex_time) != 8:
        # Fallback constants used by the original JS when the length is off.
        return {'as': '479BB4B7254C150', 'cp': '7E0AC8874BB0985'}
    head, tail = digest[:5], digest[-5:]
    # Interleave digest chars with timestamp chars exactly as the JS does.
    s = ''.join(head[i] + hex_time[i] for i in range(5))
    r = ''.join(hex_time[j + 3] + tail[j] for j in range(5))
    return {
        'as': 'A1' + s + hex_time[-3:],
        'cp': hex_time[0:3] + r + 'E1',
    }
def getdata(url, headers, cookies):
    """Fetch *url* and return its decoded JSON body.

    Args:
        url: Fully built feed URL (with as/cp/max_behot_time parameters).
        headers: HTTP headers (user-agent) to send with the request.
        cookies: Cookie dict (tt_webid) to avoid the anti-crawler block.

    Returns:
        The parsed JSON response as a Python object.
    """
    # timeout keeps the crawler from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    print(url)
    return response.json()
def savedata(title, s_url, source, media_url):
    """Write the scraped news items to a timestamped .xlsx under ./result.

    Args:
        title: List of news headlines.
        s_url: List of full article URLs (parallel to *title*).
        source: List of publisher account names (parallel to *title*).
        media_url: Dict mapping publisher name -> publisher page URL.
    """
    wb = Workbook()
    result_dir = os.path.join(os.getcwd(), 'result')
    if not os.path.isdir(result_dir):  # create the output folder on first run
        os.makedirs(result_dir)
    # BUG FIX: original format was '%Y-%m-%d-%H-%m', repeating the month (%m)
    # where the minute (%M) was intended, so same-day files could collide.
    stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
    filename = os.path.join(result_dir, 'result-' + stamp + '.xlsx')
    ws = wb.active
    ws.title = 'data'  # rename the active worksheet
    # Header row.
    ws['A1'] = '標(biāo)題'
    ws['B1'] = '新聞鏈接'
    ws['C1'] = '頭條號'
    ws['D1'] = '頭條號鏈接'
    # Data rows start at row 2; the three lists are parallel by index.
    for row in range(2, len(title) + 2):
        _ = ws.cell(column=1, row=row, value=title[row - 2])
        _ = ws.cell(column=2, row=row, value=s_url[row - 2])
        _ = ws.cell(column=3, row=row, value=source[row - 2])
        _ = ws.cell(column=4, row=row, value=media_url[source[row - 2]])
    wb.save(filename=filename)
def main(max_behot_time, title, source_url, s_url, source, media_url):
    """Crawl three pages of the hot-news feed and collect titles/links.

    Mutates the passed-in lists and dict in place and prints progress.
    Each pass acts like one browser "refresh": normally 10 items, but
    sometimes fewer, so the final count need not be a multiple of 10.
    """
    for _ in range(3):
        ascp = get_as_cp()  # signed as/cp parameters for this request
        feed_url = (start_url + max_behot_time
                    + '&max_behot_time_tmp=' + max_behot_time
                    + '&tadrequire=true&as=' + ascp['as']
                    + '&cp=' + ascp['cp'])
        demo = getdata(feed_url, headers, cookies)
        print(demo)
        # time.sleep(1)
        for item in demo['data']:
            if item['title'] in title:
                continue  # skip duplicates seen on an earlier refresh
            title.append(item['title'])
            source_url.append(item['source_url'])
            source.append(item['source'])
            if item['source'] not in media_url:
                media_url[item['source']] = url + item['media_url']
        print(max_behot_time)
        # Pagination cursor for the next request.
        max_behot_time = str(demo['next']['max_behot_time'])
    # Resolve relative links against the site root and report everything.
    for headline, link, account in zip(title, source_url, source):
        print('標(biāo)題:', headline)
        if 'https' in link:
            print('新聞鏈接:', link)
            s_url.append(link)
        else:
            s_url.append(url + link)
            print('新聞鏈接:', url + link)
        print('頭條號:', account)
    print(len(title))  # number of news items collected
if __name__ == '__main__':
    # Crawl the feed into the module-level collections, then dump them to .xlsx.
    main(max_behot_time, title, source_url, s_url, source, media_url)
    savedata(title, s_url, source, media_url)