初学爬虫

dinghj 2013-11-17

展开全文

抓取网易新闻首页黑标题所链接的网页，把正文保存为 txt 文档。运行前请确保你的 D 盘下已有 data 这个文件夹。
有些文档里会夹杂一些无用信息，因为水平有限，暂时无法去掉。
代码比较好理解。有的模块需要自己下载，作者也提供了压缩文件。
只使用了部分正则表达式进行替换。
本人是初学者，问题、毛病等比较多，请各位见谅。

[1].[文件] SGMLParser_crawl_html_txt.py ~ 3KB 下载(54) 跳至 [1] [2]

01#coding:utf-8
02 
03import urllib,chardet,re
04from sgmllib import SGMLParser
05from BeautifulSoup import BeautifulSoup
06 
07'''爬取网页新闻的黑标题下的网页正文部分，保存在txt文档里
08以黑标题的名字作为txt文档的名字
09这个黑标题不是网页打开之后的正文标题
10'''
11 
class URLLister_1(SGMLParser):
    """Collect (url, headline) pairs from ``<a>`` elements.

    After feeding markup, ``urls[i]`` holds the href of the i-th recorded
    anchor and ``name[i]`` holds its visible text, so both lists always have
    the same length.  An anchor is recorded only when it has both an href and
    some non-blank text; text split into several chunks (nested tags, entity
    references) is joined into a single name.
    """

    def reset(self):
        """Clear the collected results and the per-anchor scratch state."""
        self.is_a = False       # True while inside an <a> element
        self.urls = []          # hrefs of the recorded anchors
        self.name = []          # visible text of the recorded anchors
        self._href = None       # href of the anchor currently open
        self._chunks = []       # text chunks seen inside the open anchor
        SGMLParser.reset(self)

    def start_a(self, attrs):
        """Open a new anchor and remember its href, if it has one.

        attrs -- list of (attribute name, value) pairs for the tag.
        """
        # An <a> that starts while another is still open: settle the old one
        # first so its text is not merged into the new anchor.
        self._commit()
        self.is_a = True
        hrefs = [value for key, value in attrs if key == 'href']
        if hrefs:
            self._href = hrefs[0]

    def end_a(self):
        """Close the current anchor and record it when it is complete."""
        self._commit()

    def handle_data(self, text):
        """Buffer text that appears inside an anchor; ignore everything else."""
        if self.is_a:
            self._chunks.append(text)

    def _commit(self):
        """Record the open anchor as one url/name pair, then reset the state."""
        title = ''.join(self._chunks).strip()
        # Appending both lists in the same step keeps them index-aligned.
        if self.is_a and self._href and title:
            self.urls.append(self._href)
            self.name.append(title)
        self.is_a = False
        self._href = None
        self._chunks = []
class URLLister_2(SGMLParser):
    """Gather the text chunks that occur inside ``<p>`` elements.

    After feeding markup, ``text`` is the list of data chunks seen between a
    ``<p>`` start tag and the matching end tag, in document order.
    """

    def reset(self):
        """Empty the collected text and leave the 'inside <p>' flag unset."""
        self.text = []
        self.is_p = ""
        SGMLParser.reset(self)

    def start_p(self, attrs):
        """Mark that a paragraph has been entered (attributes are unused)."""
        self.is_p = 1

    def end_p(self):
        """Mark that the paragraph has been left."""
        self.is_p = ""

    def handle_data(self, text):
        """Keep a data chunk only when it lies inside a paragraph."""
        if not self.is_p:
            return
        self.text.append(text)
if __name__ == "__main__":
    # Fetch the NetEase news front page, follow every headline link found in
    # its <div id="news"> block, and save the paragraphs of each article body
    # to "<output_dir><headline>.txt".  Articles whose body cannot be located
    # get a file containing just 'Notext'.

    import sys  # local import: only needed here for error reporting

    index_url = "http://news.163.com/"
    output_dir = 'D:\\data/'    # this folder must already exist
    # NOTE(review): article pages are treated as GBK, as the original code
    # assumed when it encoded the font name to 'gbk' -- confirm for the site.
    page_encoding = 'gbk'
    font_name = u'宋体'          # stray CSS font name that leaks into the text

    # All patterns are compiled once instead of once per article.
    article_re = re.compile('(<div id="endText">.).+?(<div class="ep-source cDGray">)', re.I | re.S)
    # Every character Windows forbids in a file name (the original missed \ * |).
    bad_name_chars = re.compile(r'[\\/:*?"<>|]')
    # Newlines, ASCII letters and CSS/JS punctuation left over from the markup.
    noise_re = re.compile(u"[\na-zA-Z=:;{}_'/]")

    front_page = urllib.urlopen(index_url)
    try:
        soup = BeautifulSoup(front_page)
    finally:
        front_page.close()

    news_block = soup.find('div', {"id": "news"})
    if news_block is None:
        raise SystemExit('no <div id="news"> block found on ' + index_url)
    anchors_html = ''.join([str(anchor) for anchor in news_block.findAll('a')])

    lister = URLLister_1()
    lister.feed(anchors_html)
    # Links and headlines are paired by position, so unequal counts mean the
    # pairing would be wrong; stop with a clear message instead of a NameError.
    if len(lister.urls) != len(lister.name):
        raise SystemExit('found %d links but %d headlines; cannot pair them up'
                         % (len(lister.urls), len(lister.name)))

    for url, headline in zip(lister.urls, lister.name):
        file_name = bad_name_chars.sub('_', headline).strip().decode('utf-8') + '.txt'
        target = output_dir + file_name

        # One unreachable or non-HTTP link (e.g. "javascript:") must not
        # abort the whole run.
        try:
            page = urllib.urlopen(url)
            try:
                page_content = page.read()
            finally:
                page.close()
        except IOError as err:
            sys.stderr.write('skipping %s: %s\n' % (url, err))
            continue

        match = article_re.search(page_content)
        if match is None:
            with open(target, 'w') as out:
                out.write('Notext')
            continue

        lister_text = URLLister_2()
        lister_text.feed(match.group())
        # Append mode, as before: reruns and headlines that map to the same
        # file name add to the existing file.
        with open(target, 'a') as out:
            for chunk in lister_text.text:
                # Clean decoded text, not raw bytes: the trail byte of a GBK
                # character can equal an ASCII letter or '_', '{', '}', so a
                # byte-level strip would corrupt Chinese characters.
                text = chunk.decode(page_encoding, 'ignore')
                text = text.replace(font_name, u'')
                text = noise_re.sub(u'', text)
                # Re-encode so the output files keep the same encoding.
                out.write(text.encode(page_encoding, 'ignore'))

[2].[文件] BeautifulSoup-3.2.0.tar.gz ~ 30KB 下载(65) 跳至 [1] [2]

文件不存在或者代码语言不存在