
A Python crawler that scrapes all movies from Tencent Video (v.qq.com)

 free_light 2014-04-30

1. [Code] A Python crawler that scrapes all movies from Tencent Video

# -*- coding: utf-8 -*-
# by awakenjoys. my site: www.dianying.at
import re
import urllib2
from bs4 import BeautifulSoup
import time
import pymongo

NUM    = 0      # global: running count of movies found
m_type = u''    # global: current movie category
m_site = u'qq'  # global: source-site tag stored with each record

# Fetch the raw HTML for the given URL
def gethtml(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    html = response.read()
    return html

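# Optional hardening (my addition, not part of the original script): a sketch
# of a fetch helper with a browser-like User-Agent, a timeout, and a small
# retry loop, since a crawl of this size routinely hits transient network
# errors. The helper name and UA string are hypothetical choices.
def gethtml_safe(url, retries=3):
    headers = {'User-Agent': 'Mozilla/5.0'}
    for attempt in range(retries):
        try:
            req = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(req, timeout=10).read()
        except urllib2.URLError:
            if attempt == retries - 1:
                raise
            time.sleep(1)
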
# Extract the movie categories from the category list page
def gettags(html):
    global m_type
    soup = BeautifulSoup(html)      # parse the page so the tag list can be isolated
    #<ul class="clearfix _group" gname="mi_type" gtype="1">
    tags_all = soup.find_all('ul', {'class' : 'clearfix _group', 'gname' : 'mi_type'})

    #<a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="动作" tvalue="0">动作</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    p = re.compile(re_tags, re.DOTALL)

    tags = p.findall(str(tags_all[0]))
    tags_url = {}   # initialized up front so the function always returns a dict
    if tags:
        for tag in tags:
            tag_url = tag[0].decode('utf-8')
            m_type = tag[1].decode('utf-8')
            tags_url[m_type] = tag_url
    else:
        print "Not found"
    return tags_url

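# gettags() returns a mapping of category name -> first-page listing URL, e.g.
# (illustrative values):
# { u'动作': u'http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html', ... }
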
# Get the number of result pages for one category
def get_pages(tag_url):
    tag_html = gethtml(tag_url)
    soup = BeautifulSoup(tag_html)      # parse out the pagination block
    #<div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class' : 'mod_pagenav', 'id' : 'pager'})

    #<a class="c_txt6" href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html" title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        # the last <span> in the pager is the "next page" link, so the
        # second-to-last entry carries the highest page number
        return pages[-2]
    else:
        return 1

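# Note on the list URLs (inferred from the code, not documented by qq.com): in
# e.g. 1_2_-1_-1_1_0_24_20_0_-1_0.html the underscore-separated fields encode
# the category filters, and the field just before "_20_0_-1_0.html" (here 24)
# is the zero-based page index, which is why the __main__ block can rebuild
# per-page URLs by simple string substitution.
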
# Walk the list blocks on one result page and hand each to getmovie()
def getmovielist(html):
    soup = BeautifulSoup(html)

    #<ul class="mod_list_pic_130">
    divs = soup.find_all('ul', {'class' : 'mod_list_pic_130'})
    for div_html in divs:
        div_html = str(div_html).replace('\n', '')
        getmovie(div_html)

# Extract (url, title) pairs from a list block and store them in MongoDB
def getmovie(html):
    global NUM
    global m_type
    global m_site

    # The long HTML sample that used to sit here showed a tudou.com album
    # block; the regex below actually targets the qq.com list markup:
    #<li><a class="mod_poster_130" href="..." target="_blank" title="..."><img ...></li>
    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    p = re.compile(re_movie, re.DOTALL)
    movies = p.findall(html)
    if movies:
        # MongoClient supersedes the long-deprecated pymongo.Connection used
        # in the original; opened per call here, as the original did
        conn = pymongo.MongoClient('localhost', 27017)
        movie_db = conn.dianying
        playlinks = movie_db.playlinks
        for movie in movies:
            NUM += 1    # counted exactly once per movie (the original incremented twice)
            print "%s : %d" % ("=" * 70, NUM)
            values = dict(
                movie_title = movie[1],
                movie_url   = movie[0],
                movie_site  = m_site,
                movie_type  = m_type
                )
            print values
            playlinks.insert(values)
            print "_" * 70

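# Each record written to dianying.playlinks looks like (illustrative values):
# {u'movie_title': u'...', u'movie_url': u'http://v.qq.com/...',
#  u'movie_site': u'qq', u'movie_type': u'动作'}
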
# Fetch extra detail links for a single movie page. Note: the markup matched
# here (album_cover blocks, tudou.com-style links) is not from the qq.com
# list pages, and nothing in the __main__ block calls this function.
def getmovieinfo(url):
    html = gethtml(url)
    soup = BeautifulSoup(html)

    #pack pack_album album_cover
    divs = soup.find_all('div', {'class' : 'pack pack_album album_cover'})

    #<a href="http://www.tudou.com/albumplay/9NyofXc_lHI/32JqhiKJykI.html" target="new" title="《血滴子》独家纪录片" wl="1"> </a>
    re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
    p_info = re.compile(re_info, re.DOTALL)
    m_info = p_info.findall(str(divs[0]))
    if not m_info:
        print "Movie info not found"
    return m_info

# Insert one record into the dianying_at database; also unused by __main__,
# and it relies on a global `conn` that this script never actually creates.
def insertdb(movieinfo):
    global conn
    movie_db = conn.dianying_at
    movies = movie_db.movies
    movies.insert(movieinfo)

if __name__ == "__main__":
    # Entry point: the category list page (all categories, page 0)
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tags_html = gethtml(tags_url)
    tag_urls = gettags(tags_html)

    for tag_name, tag_page_url in tag_urls.items():
        print tag_page_url.encode('utf-8')
        maxpage = int(get_pages(tag_page_url.encode('utf-8')))
        print maxpage

        for x in range(0, maxpage):
            # rebuild the URL for page x, e.g. http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
            m_url = tag_page_url.replace('0_20_0_-1_0.html', '')
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print movie_url
            movie_html = gethtml(movie_url.encode('utf-8'))
            getmovielist(movie_html)
            time.sleep(0.1)     # small pause between page fetches, to go easy on the server
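
One caveat if you run the script more than once: getmovie() inserts every match unconditionally, so repeat runs duplicate records. A minimal guard is a unique index on movie_url with duplicate-key errors swallowed; this is a sketch of one way to do it, not part of the original script (`values` stands for the dict built in getmovie()):

import pymongo

conn = pymongo.MongoClient('localhost', 27017)
playlinks = conn.dianying.playlinks
playlinks.create_index('movie_url', unique=True)   # at most one record per movie URL
try:
    playlinks.insert(values)
except pymongo.errors.DuplicateKeyError:
    pass                                           # already crawled; skip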
