import time
import urllib.request

from bs4 import BeautifulSoup

# Site root; relative hrefs scraped from the pages are joined onto this.
BASE_URL = 'http://www.shicimingju.com'


def main():
    """Entry point: fetch the book index page and download every book."""
    url = 'http://www.shicimingju.com/book/'
    # Build the request object
    request = get_request(url)
    # Get the response body
    html = get_response(request)
    # Parse and download
    soup_html(html)


def get_request(url):
    """Build a urllib Request for *url* with a browser-like User-Agent.

    The site rejects the default urllib UA, so a desktop Chrome UA is sent.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.99 Safari/537.36'
    }
    return urllib.request.Request(url=url, headers=headers)


def get_response(req):
    """Send *req* and return the response body decoded as UTF-8."""
    response = urllib.request.urlopen(req)
    return response.read().decode('utf8')


# Layer 1: the book index page -> one text file per book.
def soup_html(html):
    """Parse the book index page and download each book into '<title>.txt'."""
    soup = BeautifulSoup(html, 'lxml')
    for book_link in soup.select('.bookmark-list ul li h2 a'):
        book_title = book_link.text
        href = BASE_URL + book_link['href']
        filename = book_title + '.txt'
        # Fix: context manager guarantees the file is closed (the original
        # leaked the handle); the original's extra fetch of `href` here was
        # unused (get_txt fetches it again) and has been removed.
        with open(filename, 'a', encoding='utf8') as fp:
            # Layer 2 returns (chapter title, content tags) pairs.
            datas = get_txt(href)
            for chapter_title, paragraphs in datas:
                print("正在下载%s" % chapter_title)
                for tag in paragraphs:
                    fp.write(chapter_title + '\n' + tag.get_text())
                time.sleep(2)  # throttle: be polite to the server
                print("下载结束%s" % chapter_title)


# Layer 2: a book's table of contents -> list of chapter data.
def get_txt(href):
    """Fetch a book's chapter list; return a list of (title, tags) pairs."""
    request = get_request(href)
    text = get_response(request)
    soup = BeautifulSoup(text, 'lxml')
    chapters = []
    for chapter_link in soup.select('.book-mulu ul li a'):
        chapter_href = BASE_URL + chapter_link['href']
        data = get_data(chapter_href)
        # Fix: skip chapters whose content selectors matched nothing — the
        # original appended None and crashed downstream on `i[0]`.
        if data is not None:
            chapters.append(data)
    return chapters


# Layer 3: a single chapter page -> (chapter title, content tags).
def get_data(href):
    """Fetch one chapter page; return (title, content tags) or None.

    Returns None when the page has no recognizable content block.
    """
    request = get_request(href)
    response = get_response(request)
    soup = BeautifulSoup(response, 'lxml')
    # Chapter heading lives in the card header.
    title = soup.select('.www-shadow-card h1')[0].text
    # Prefer per-paragraph tags; fall back to the whole content container.
    paragraphs = soup.select('.chapter_content p')
    if paragraphs:
        return title, paragraphs
    container = soup.select('.chapter_content')
    if container:
        return title, container
    # Neither selector matched: no extractable content on this page.
    return None


if __name__ == '__main__':
    main()
|
|