#coding=utf-8 #url #HTTP Request # http header(key/value pairs) # GET / POST (data) #HTTP Response ''' #使用urllib2模块实现 import urllib2#urllib2是python自带的, 它在抓取数据量较大的网站时,会比较慢 def fetch(url): http_header = {'User-Agent':'Chrome'} #dictionary #get请求,设置为None,如果是post则需要另填 http_request = urllib2.Request(url,None,http_header) #向服务器发送请求打开URL http_response = urllib2.urlopen(http_request)
#Status code #200 / OK #404 / Invalid URL #500 / Internal Error print "----response code----" print http_response.code
#http header (key/value pairs) print '----response header----' print http_response.info()
print '-------data-------' #打印响应数据 #开始下载数据 print "Start downloading data..." print http_response.read() print "Finish downloading data..."
if __name__ == '__main__': fetch("http://www.meituan.com/api/v2/beijing/deals")#美团接口 http://www.meituan.com/api/v1/divisions '''
# Crawler built on the Tornado web server's HTTP client (Tornado is an
# open-source networking library from Facebook).  Tornado's synchronous
# HTTPClient object is used here to fetch a URL.
# Install first: pip install tornado
# (may also require: pip install backports.ssl_match_hostname)
import tornado.httpclient


def fetch(url):
    """Download `url` with Tornado's blocking HTTPClient and print the
    response status code, headers and body.

    connect_timeout=20: abort if no connection is established within 20 s.
    request_timeout=600: abort if the download blocks longer than 600 s
    after the connection succeeds.
    """
    http_header = {'User-Agent': 'Chrome'}
    http_request = tornado.httpclient.HTTPRequest(
        url=url, method='GET', headers=http_header,
        connect_timeout=20, request_timeout=600)
    http_client = tornado.httpclient.HTTPClient()
    try:
        print("Start downloading data...")
        # NOTE(review): HTTPClient.fetch raises tornado HTTPError on non-2xx
        # responses; callers that need those bodies should catch it.
        http_response = http_client.fetch(http_request)
        print("Finish downloading data...")

        print("----response code----")
        print(http_response.code)

        print('----response header----')
        # Different from urllib2: Tornado exposes headers as an HTTPHeaders
        # object; get_all() yields every (name, value) pair.
        for field in http_response.headers.get_all():
            print(field)

        print("----response data----")
        print(http_response.body)
    finally:
        # Fix: the client owns an IOLoop and sockets; release them instead
        # of leaking one per call.
        http_client.close()
if __name__ == '__main__':
    # Demo run: pull the Meituan deals API for Beijing.
    fetch('http://www.meituan.com/api/v2/beijing/deals')