python 简单爬取今日头条热点新闻(一)

阿甘ch1wn8cyc3 2019-06-19

展开全文

今日头条如今在自媒体领域算是比较强大的存在，今天就带大家利用python爬去今日头条的热点新闻，理论上是可以做到无限爬取的；

在浏览器中打开今日头条的链接，选中左侧的热点，在浏览器开发者模式network下很快能找到一个‘?category=new_hot...’字样的文件，查看该文件发现新闻内容的数据全部存储在data里面，且能发现数据类型为json；如下图：

这样一来就简单了，只要找到这个文件的requests url即可通过python requests来爬取网页了；

查看请求的url，如下图：

发现链接为：https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1B5AC16548E0FA&cp=5C647E601F9AEE1&_signature=F09fYAAASzBjiSc9oUU9MxdPX3

其中有9个参数，对比如下表：

其中max_behot_time在获取的json数据中获得，具体数据见如下截图：

在网上找了下大神对as和cp算法的分析，发现两个参数在js文件：home_4abea46.js中有，具体算法如下代码：

!function(t) {

var e = {};

e.getHoney = function() {

var t = Math.floor((new Date).getTime() / 1e3)

, e = t.toString(16).toUpperCase()

, i = md5(t).toString().toUpperCase();

if (8 != e.length)

return {

as: "479BB4B7254C150",

cp: "7E0AC8874BB0985"

};

for (var n = i.slice(0, 5), a = i.slice(-5), s = "", o = 0; 5 > o; o++)

s += n[o] + e[o];

for (var r = "", c = 0; 5 > c; c++)

r += e[c + 3] + a[c];

return {

as: "A1" + s + e.slice(-3),

cp: e.slice(0, 3) + r + "E1"

}

,

t.ascp = e

}(window, document),

　python获取as和cp值的代码如下：(代码参考blog：https://www.cnblogs.com/xuchunlin/p/7097391.html)

def get_as_cp(): # 该函数主要是为了获取as和cp参数，程序参考今日头条中的加密js文件：home_4abea46.js

zz = {}

now = round(time.time())

print(now) # 获取当前计算机时间

e = hex(int(now)).upper()[2:] #hex()转换一个整数对象为16进制的字符串表示

print('e:', e)

a = hashlib.md5() #hashlib.md5().hexdigest()创建hash对象并返回16进制结果

print('a:', a)

a.update(str(int(now)).encode('utf-8'))

i = a.hexdigest().upper()

print('i:', i)

if len(e)!=8:

zz = {'as':'479BB4B7254C150',

'cp':'7E0AC8874BB0985'}

return zz

n = i[:5]

a = i[-5:]

r = ''

s = ''

for i in range(5):

s= s+n[i]+e[i]

for j in range(5):

r = r+e[j+3]+a[j]

zz ={

'as':'A1'+s+e[-3:],

'cp':e[0:3]+r+'E1'

}

print('zz:', zz)

return zz

　　这样完整的链接就构成了，另外提一点就是：_signature参数去掉也是可以获取到json数据的，因此这样请求的链接就完成了；下面附上完整代码：

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

import requests

import json

from openpyxl import Workbook

import time

import hashlib

import os

import datetime

start_url = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='

url = 'https://www.toutiao.com'

headers={

'user-agent':

'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

}

cookies = {'tt_webid':'6649949084894053895'} # 此处cookies可从浏览器中查找，为了避免被头条禁止爬虫

max_behot_time = '0' # 链接参数

title = [] # 存储新闻标题

source_url = [] # 存储新闻的链接

s_url = [] # 存储新闻的完整链接

source = [] # 存储发布新闻的公众号

media_url = {} # 存储公众号的完整链接

def get_as_cp(): # 该函数主要是为了获取as和cp参数，程序参考今日头条中的加密js文件：home_4abea46.js

zz = {}

now = round(time.time())

print(now) # 获取当前计算机时间

e = hex(int(now)).upper()[2:] #hex()转换一个整数对象为16进制的字符串表示

print('e:', e)

a = hashlib.md5() #hashlib.md5().hexdigest()创建hash对象并返回16进制结果

print('a:', a)

a.update(str(int(now)).encode('utf-8'))

i = a.hexdigest().upper()

print('i:', i)

if len(e)!=8:

zz = {'as':'479BB4B7254C150',

'cp':'7E0AC8874BB0985'}

return zz

n = i[:5]

a = i[-5:]

r = ''

s = ''

for i in range(5):

s= s+n[i]+e[i]

for j in range(5):

r = r+e[j+3]+a[j]

zz ={

'as':'A1'+s+e[-3:],

'cp':e[0:3]+r+'E1'

}

print('zz:', zz)

return zz

def getdata(url, headers, cookies): # 解析网页函数

r = requests.get(url, headers=headers, cookies=cookies)

print(url)

data = json.loads(r.text)

return data

def savedata(title, s_url, source, media_url): # 存储数据到文件

# 存储数据到xlxs文件

wb = Workbook()

if not os.path.isdir(os.getcwd()+'/result'): # 判断文件夹是否存在

os.makedirs(os.getcwd()+'/result') # 新建存储文件夹

filename = os.getcwd()+'/result/result-'+datetime.datetime.now().strftime('%Y-%m-%d-%H-%m')+'.xlsx' # 新建存储结果的excel文件

ws = wb.active

ws.title = 'data' # 更改工作表的标题

ws['A1'] = '标题' # 对表格加入标题

ws['B1'] = '新闻链接'

ws['C1'] = '头条号'

ws['D1'] = '头条号链接'

for row in range(2, len(title)+2): # 将数据写入表格

_= ws.cell(column=1, row=row, value=title[row-2])

_= ws.cell(column=2, row=row, value=s_url[row-2])

_= ws.cell(column=3, row=row, value=source[row-2])

_= ws.cell(column=4, row=row, value=media_url[source[row-2]])

wb.save(filename=filename) # 保存文件

def main(max_behot_time, title, source_url, s_url, source, media_url): # 主函数

for i in range(3):

# 此处的数字类似于你刷新新闻的次数，正常情况下刷新一次会出现10条新闻，但夜存在少于10条的情况；所以最后的结果并不一定是10的倍数

ascp = get_as_cp() # 获取as和cp参数的函数

demo = getdata(start_url+max_behot_time+'&max_behot_time_tmp='+max_behot_time+'&tadrequire=true&as='+ascp['as']+'&cp='+ascp['cp'], headers, cookies)

print(demo)

# time.sleep(1)

for j in range(len(demo['data'])):

# print(demo['data'][j]['title'])

if demo['data'][j]['title'] not in title:

title.append(demo['data'][j]['title']) # 获取新闻标题

source_url.append(demo['data'][j]['source_url']) # 获取新闻链接

source.append(demo['data'][j]['source']) # 获取发布新闻的公众号

if demo['data'][j]['source'] not in media_url:

media_url[demo['data'][j]['source']] = url+demo['data'][j]['media_url'] # 获取公众号链接

print(max_behot_time)

max_behot_time = str(demo['next']['max_behot_time']) # 获取下一个链接的max_behot_time参数的值