【原】古典书籍爬取

郭祺迦 2018-09-24

展开全文

import urllib.request

from bs4 import BeautifulSoup

import time

def main():
    """Crawl the shicimingju.com book index and save every book locally.

    Builds the request for the index page, downloads its HTML and hands it to
    ``soup_html``, which performs the per-book downloads.
    """
    url = 'http://www.shicimingju.com/book/'
    # Build the request object (adds the browser User-Agent header).
    request = get_request(url)
    # Download and decode the index page.
    html = get_response(request)
    # Parse the index and download each listed book.
    soup_html(html)

def get_request(url):
    """Build a ``urllib.request.Request`` for *url* with a browser User-Agent.

    Args:
        url: Absolute URL to request.

    Returns:
        A ``urllib.request.Request`` carrying a desktop Chrome ``User-Agent``
        header. Nothing is sent over the network here.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    return urllib.request.Request(url=url, headers=headers)

def get_response(req):
    """Open *req* and return the response body decoded as UTF-8 text.

    Args:
        req: A URL string or ``urllib.request.Request`` accepted by
            ``urllib.request.urlopen``.

    Returns:
        The full response body as ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf8')

def soup_html(html):
    """Level 1: download every book listed on the index page to a text file.

    Args:
        html: HTML text of the book index page.

    Every link matched by ``.bookmark-list ul li h2 a`` is treated as one
    book. Its chapters are fetched through ``get_txt`` and appended to a UTF-8
    file named ``<link text>.txt`` (a relative path, so it lands in the
    current working directory). For each paragraph element of a chapter the
    chapter title, a newline and the paragraph text are written.
    """
    soup = BeautifulSoup(html, 'lxml')
    for book_link in soup.select('.bookmark-list ul li h2 a'):
        book_url = 'http://www.shicimingju.com' + book_link['href']
        filename = book_link.text + '.txt'
        # The context manager closes the file even if a download fails midway.
        with open(filename, 'a', encoding='utf8') as fp:
            # get_txt downloads the book page itself, so it is not fetched here.
            for chapter in get_txt(book_url):
                if not chapter:
                    # Skip entries that carry no chapter data (e.g. None).
                    continue
                chapter_title, paragraphs = chapter
                print("正在下载%s" % chapter_title)
                for paragraph in paragraphs:
                    fp.write(chapter_title + '\n' + paragraph.get_text())
                # Pause between chapters, as the original script did.
                time.sleep(2)
                print("下载结束%s" % chapter_title)

def get_txt(href):
    """Level 2: fetch a book's chapter list and download each chapter.

    Args:
        href: Absolute URL of the book's chapter-list page.

    Returns:
        A list with one ``get_data`` result per link matched by
        ``.book-mulu ul li a``, in page order. Chapters for which ``get_data``
        returns ``None`` are left out so callers can index every entry.
    """
    text = get_response(get_request(href))
    soup = BeautifulSoup(text, 'lxml')
    chapters = []
    for link in soup.select('.book-mulu ul li a'):
        # Chapter links on the page are site-relative; prefix the host.
        chapter = get_data('http://www.shicimingju.com' + link['href'])
        if chapter is not None:
            chapters.append(chapter)
    return chapters

def get_data(href):
    """Level 3: download one chapter page and extract its title and body.

    Args:
        href: Absolute URL of the chapter page.

    Returns:
        A ``(title, elements)`` tuple. ``title`` is the text of the first
        ``.www-shadow-card h1`` element. ``elements`` is the list of
        ``.chapter_content p`` elements, or, when there are none, the list of
        ``.chapter_content`` elements (empty if neither selector matches).

    Raises:
        IndexError: If the page has no ``.www-shadow-card h1`` element.
    """
    page = get_response(get_request(href))
    soup = BeautifulSoup(page, 'lxml')
    # Chapter heading shown at the top of the page.
    title = soup.select('.www-shadow-card h1')[0].text
    # Prefer individual paragraphs; fall back to the whole content container
    # for pages whose text is not wrapped in <p> tags. Each selector runs once.
    content = soup.select('.chapter_content p') or soup.select('.chapter_content')
    return title, content

# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()

转藏分享

QQ空间 QQ好友 新浪微博 微信

献花（0） +1

来自：郭祺迦 > 《实践案例》

举报/认领

0条评论

发表

请遵守用户评论公约

类似文章 更多

郭祺迦

关注 对话

TA的最新馆藏

智联招聘
美团网
唯品会爬取商品信息
8684公交面向过程爬取
python与json的互换
正则、bs4、xpath的区别

喜欢该文的人也喜欢更多

热门阅读换一换