转自:http://blog.csdn.net/Hawk_lipu/archive/2008/06/03/2506919.aspx
【用Python写爬虫】获取html的方法【一】:使用urllib
# -*- coding: UTF-8 -*-
import urllib.request  # Python 3: urllib.urlopen was removed; use urllib.request.urlopen


def getWebPageContent(url):
    """Fetch *url* over HTTP and return the raw response body as bytes.

    Uses only the standard library.  The ``with`` block guarantees the
    underlying socket is closed even if ``read()`` raises.
    """
    with urllib.request.urlopen(url) as f:
        return f.read()


url = 'http://blog.csdn.net'
content = getWebPageContent(url)
print(content)
【用Python写爬虫】获取html的方法【二】:使用pycurl
# Pycurl homepage: http://pycurl.sourceforge.net/
# Pycurl download: http://pycurl.sourceforge.net/download/pycurl-7.18.1.tar.gz
# -*- coding: UTF-8 -*-
import pycurl           # third-party libcurl binding
from io import BytesIO  # Python 3 replacement for the old StringIO module


def getURLContent_pycurl(url):
    """Fetch *url* with libcurl (via pycurl) and return the body as bytes.

    Follows up to 5 HTTP redirects.  The curl handle is closed in a
    ``finally`` block so the connection is not leaked on error.
    """
    c = pycurl.Curl()
    buf = BytesIO()
    try:
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, buf.write)
        c.setopt(pycurl.FOLLOWLOCATION, 1)  # follow 3xx redirects...
        c.setopt(pycurl.MAXREDIRS, 5)       # ...but at most five of them
        # Proxy settings — uncomment to route the request through an HTTP proxy:
        #c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
        #c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')
        c.perform()
    finally:
        c.close()
    return buf.getvalue()


url = 'http://blog.csdn.net'
content = getURLContent_pycurl(url)
print(content)
【用Python写爬虫】获取html的方法【三】:使用cPAMIE
# cPAMIE download: http://sourceforge.net/project/showfiles.php?group_id=103662
# -*- coding: UTF-8 -*-
import cPAMIE  # third-party, Windows-only: drives Internet Explorer via COM


def getURLContent_cPAMIE(url):
    """Load *url* in an Internet Explorer instance (via cPAMIE) and
    return the rendered page text.

    The IE instance is always shut down, even if navigation or text
    extraction fails, so no orphaned browser window is left behind.
    """
    g_ie = cPAMIE.PAMIE()
    g_ie.showDebugging = False
    g_ie.frameName = None  # operate on the top-level document, not a frame
    try:
        g_ie.navigate(url)
        content = g_ie.pageGetText()
    finally:
        g_ie.quit()
    return content


url = 'http://blog.csdn.net'
content = getURLContent_cPAMIE(url)
print(content)
【用Python写爬虫】获取html的方法【四】:使用urllib下载文件
# -*- coding: UTF-8 -*-
import urllib.request  # Python 3: urlretrieve moved into urllib.request

# Download the page straight to a local file instead of reading it
# into memory first.
url = 'http://blog.csdn.net'
path = 'C:\\temp\\csdn.net.html'
urllib.request.urlretrieve(url, path)
【用Python写爬虫】获取html的方法【五】:利用Twisted框架之client.getPage
# Twisted framework download: http://twistedmatrix.com/trac/wiki/Downloads
# -*- coding: UTF-8 -*-
from twisted.internet import reactor
from twisted.web import client


def result(content):
    """Deferred callback: print the downloaded page, then stop the reactor."""
    print(content)
    reactor.stop()


# getPage returns a Deferred that fires with the page body once the
# download completes; the callback prints it and shuts the reactor down.
deferred = client.getPage("http://blog.csdn.net")
deferred.addCallback(result)
reactor.run()
|