#coding=utf-8 #url #HTTP Request # http header(key/value pairs) # GET / POST (data) #HTTP Response ''' #使用urllib2模块实现 import urllib2#urllib2是python自带的, 它在抓取数据量较大的网站时,会比较慢 def fetch(url): http_header = {'User-Agent':'Chrome'} #dictionary #get请求,设置为None,如果是post则需要另填 http_request = urllib2.Request(url,None,http_header) #向服务器发送请求打开URL http_response = urllib2.urlopen(http_request)
#Status code #200 / OK #404 / Invalid URL #500 / Internal Error print "----response code----" print http_response.code
#http header (key/value pairs) print '----response header----' print http_response.info()
print '-------data-------' #打印响应数据 #开始下载数据 print "Start downloading data..." print http_response.read() print "Finish downloading data..."
if __name__ == '__main__': fetch("http://www.meituan.com/api/v2/beijing/deals")#美团接口 http://www.meituan.com/api/v1/divisions '''
# Crawler built on the Tornado web server's HTTP client (Tornado is an
# open-source networking library from Facebook).  Tornado's synchronous
# HTTPClient object is used here to fetch a URL.
# Install first: pip install tornado
# (may also require: pip install backports.ssl_match_hostname)
import tornado.httpclient


def fetch(url):
    """Download `url` with Tornado's blocking HTTPClient and print the
    response status code, headers and body.

    connect_timeout=20: abort if no connection is established within 20 s.
    request_timeout=600: abort if the download blocks longer than 600 s
    after the connection succeeds.
    """
    http_header = {'User-Agent': 'Chrome'}
    http_request = tornado.httpclient.HTTPRequest(
        url=url, method='GET', headers=http_header,
        connect_timeout=20, request_timeout=600)
    http_client = tornado.httpclient.HTTPClient()
    try:
        print("Start downloading data...")
        # NOTE(review): HTTPClient.fetch raises tornado HTTPError on non-2xx
        # responses; callers that need those bodies should catch it.
        http_response = http_client.fetch(http_request)
        print("Finish downloading data...")

        print("----response code----")
        print(http_response.code)

        print('----response header----')
        # Different from urllib2: Tornado exposes headers as an HTTPHeaders
        # object; get_all() yields every (name, value) pair.
        for field in http_response.headers.get_all():
            print(field)

        print("----response data----")
        print(http_response.body)
    finally:
        # Fix: the client owns an IOLoop and sockets; release them instead
        # of leaking one per call.
        http_client.close()
if __name__ == '__main__':
    # Demo run: pull the Meituan deals API for Beijing.
    fetch('http://www.meituan.com/api/v2/beijing/deals')