分享

Python爬虫:爬取电影榜单

 xxcc140 2020-07-31

猫眼电影榜单的爬取

# Scrape the Maoyan top-100 film board and export it to an Excel workbook.
import re  # regular expressions, used to pull fields out of each entry
import urllib.error
import urllib.request  # build the request and fetch the pages

import xlwt  # write the .xls workbook
from bs4 import BeautifulSoup


def main():
    """Scrape the board and save it as ``猫眼电影.xls`` in the working directory."""
    # NOTE(review): the published snippet had an empty host
    # ('https:///board/4?offset='), which urlopen cannot resolve. The host is
    # assumed to be maoyan.com from the article title -- confirm.
    baseurl = 'https://maoyan.com/board/4?offset='
    rows = getdata(baseurl)
    savepath = '猫眼电影.xls'
    savedate(rows, savepath)


# Patterns are applied to the string form of one <dd> entry. BeautifulSoup
# serialises attributes with double quotes, so the patterns use them too.
rank = re.compile(r'<i class="board-index board-index-.*">(.*?)</i>')
title = re.compile(r'<img alt="(.*?)" class="board-img"')
actor = re.compile(r'<p class="star">(.*?)</p>', re.S)  # value spans several lines
time = re.compile(r'<p class="releasetime">(.*?)</p>')
rate1 = re.compile(r'<i class="integer">(.*?)</i>')
rate2 = re.compile(r'<i class="fraction">(.*?)</i>')


def _first(pattern, text, default=''):
    """Return the first captured group of *pattern* in *text*, or *default*."""
    match = pattern.search(text)
    return match.group(1) if match else default


def _parse_item(item):
    """Turn the markup of one <dd> entry into [rank, title, actors, release, score]."""
    # The actor paragraph is padded with spaces and newlines in the markup.
    actors = _first(actor, item).replace(' ', '').replace('\n', '')
    # The score is split across two tags (integer part and fraction part).
    # Both are read from this entry only, so every film gets its own score.
    score = _first(rate1, item) + _first(rate2, item)
    return [
        _first(rank, item),
        _first(title, item),
        actors,
        _first(time, item),
        score,
    ]


def getdata(baseurl, pages=10):
    """Fetch the board pages and return one record per film.

    Args:
        baseurl: URL prefix; the page offset (0, 10, 20, ...) is appended to it.
        pages: Number of consecutive pages to fetch (10 entries per offset step).

    Returns:
        A list of ``[rank, title, actors, release, score]`` string lists, in
        page order. A field that cannot be found in an entry is ``''``.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363'}
    records = []
    for page in range(pages):
        url = baseurl + str(page * 10)
        request = urllib.request.Request(url=url, headers=headers)
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('dd'):
            records.append(_parse_item(str(item)))
    return records


def savedate(list, savepath):
    """Write the scraped records to an Excel workbook.

    Args:
        list: Sequence of records, each with at least five fields. The
            parameter name shadows the builtin but is kept so existing
            keyword callers keep working.
        savepath: Destination path of the ``.xls`` file.
    """
    book = xlwt.Workbook(encoding='utf-8')  # create the workbook
    sheet = book.add_sheet('猫眼电影', cell_overwrite_ok=True)  # create the sheet
    col = ('排名', '片名', '演员', '发布时间', '评分')
    for j, heading in enumerate(col):
        sheet.write(0, j, heading)  # header row
    # Iterate over the rows actually scraped instead of assuming exactly 100.
    for i, data in enumerate(list, start=1):
        for j in range(len(col)):
            sheet.write(i, j, data[j])
    book.save(savepath)  # save to disk


if __name__ == '__main__':
    main()

喜欢的小伙伴点赞关注转发一下~。

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0)

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多