Crawl the pages behind the bold "black headlines" on the NetEase news front page (news.163.com) and save each article body to a .txt file. Make sure the folder D:\data exists on your D: drive before running. Some of the saved documents still contain useless text; with my limited skill I could not remove it all, since only a handful of regular-expression substitutions are applied. The code is fairly easy to follow. Some modules have to be downloaded separately; an archive is attached as file [2] below. I am a beginner, so please forgive the many problems and rough edges.
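Since the script assumes the output folder already exists, a minimal optional sketch like the following (my addition, not part of the original file; the path is the one the description assumes) can create D:\data up front:

import os

data_dir = r'D:\data'            # folder the crawler writes its .txt files into
if not os.path.isdir(data_dir):
    os.makedirs(data_dir)        # also creates any missing parent folders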
[1] File: SGMLParser_crawl_html_txt.py (~3KB)

# -*- coding: utf-8 -*-
# Python 2 only: sgmllib and BeautifulSoup 3 do not exist on Python 3.
import urllib, chardet, re
from sgmllib import SGMLParser
from BeautifulSoup import BeautifulSoup

class URLLister_1(SGMLParser):
    # Collects every href and the matching link text from the HTML it is fed.
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []
        self.name = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)

    def handle_data(self, text):
        self.name.append(text)

class URLLister_2(SGMLParser):
    # Collects the raw text nodes of an article body.
    def reset(self):
        SGMLParser.reset(self)
        self.text = []

    def start_p(self, attrs):
        pass

    def handle_data(self, text):
        self.text.append(text)

if __name__ == "__main__":
    string_getContent = ''
    content = urllib.urlopen("http://news.163.com/")
    soup = BeautifulSoup(content)
    # The headline block of the front page lives in <div id="news">.
    getContent = soup.find('div', {"id": "news"})
    getContent = getContent.findAll('a')
    for each in getContent:
        string_getContent += str(each)   # BeautifulSoup tags stringify as UTF-8

    lister = URLLister_1()
    lister.feed(string_getContent)
    # Only proceed when every link carries exactly one text node,
    # so that urls[i] and name[i] stay aligned.
    if len(lister.urls) == len(lister.name):
        length = len(lister.urls)
        for every_html in range(length):
            # Replace characters that are illegal in Windows file names.
            lister.name[every_html] = re.sub(r'[:<>?/"]', '_', lister.name[every_html])
            lister.name[every_html] = lister.name[every_html].decode('utf-8') + '.txt'
            page = urllib.urlopen(lister.urls[every_html])
            page_content = page.read()
            # The article body sits between <div id="endText"> and the
            # source line <div class="ep-source cDGray">.
            paper = re.compile('(<div id="endText">.).+?(<div class="ep-source cDGray">)', re.I | re.S)
            data = paper.search(page_content)
            if data is None:
                continue
            lister_text = URLLister_2()
            lister_text.feed(data.group())   # feed() takes the matched string, not the match object
            out = open('D:/data/' + lister.name[every_html], 'w')
            for each in lister_text.text:
                # Drop the inline font name "宋体" (the pages are GBK-encoded) and
                # a handful of markup leftovers; only these few substitutions are
                # applied, so some noise survives in the output.
                each = each.replace('宋体'.decode('utf-8').encode('gbk'), '')
                each = re.sub('\n', '', each)
                each = re.sub('[a-z]', '', each)
                each = re.sub('[A-Z]', '', each)
                each = re.sub('[=:;{}_]', '', each)
                each = re.sub("['\/]", '', each)
                out.write(each)
            out.close()
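To see what the extraction step does in isolation, here is a small self-contained check. The HTML string is invented and only mimics the structure of a real 163.com article page; the regex and the parser class mirror the ones in the listing above.

import re
from sgmllib import SGMLParser

class BodyText(SGMLParser):
    # Same idea as URLLister_2: keep only the text nodes.
    def reset(self):
        SGMLParser.reset(self)
        self.text = []
    def handle_data(self, text):
        self.text.append(text)

# Invented miniature of an article page.
html = ('<div id="endText"><p>first paragraph</p><p>second</p></div>'
        '<div class="ep-source cDGray">source credit</div>')

paper = re.compile('(<div id="endText">.).+?(<div class="ep-source cDGray">)', re.I | re.S)
match = paper.search(html)
lister = BodyText()
lister.feed(match.group())
print ''.join(lister.text)   # -> first paragraphsecond

Note that consecutive text nodes are written back to back with no separator, which is why the saved files sometimes read as run-together text.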
[2] File: BeautifulSoup-3.2.0.tar.gz (~30KB)
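BeautifulSoup 3.x is a single pure-Python module, so after unpacking the archive you can either run python setup.py install or simply drop BeautifulSoup.py next to the script. A quick sanity check (Python 2 only; BeautifulSoup 3 was never ported to Python 3):

from BeautifulSoup import BeautifulSoup   # BeautifulSoup 3.x imports under this name

soup = BeautifulSoup('<p>hello</p>')
print soup.p.string   # -> hello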
|