【原】认识爬虫：使用 urllib2 库与 cookielib 库读取网页内容的三种方法

Python集中营 2022-10-10 发布于甘肃

展开全文

在简单爬虫中，第二个步骤就是使用网页下载器对网页进行下载，并获取下载请求的状态等信息。urllib2 库主要用于读取网页内容以及请求状态；cookielib 库主要用于增强网页下载器处理 cookie 的能力。注意：这两个库仅存在于 Python 2 中，在 Python 3 中分别对应 urllib.request 与 http.cookiejar。

1、导入标准库模块（urllib2 与 cookielib 均为 Python 2 自带的标准库，无需额外安装）

1# 导入 urllib2 库、用于网页下载
2import urllib2
3# 导入 cookielib 库、处理 cookie 信息
4import cookielib

2、urlopen() 函数实现网页下载

 1def use_urlopen(url):
 2    if url is None:
 3        # 定义爬虫的 url
 4        url = "http://www.baidu.com"
 5    # 打开网页下载链接
 6    response = urllib2.urlopen(url)
 7    # 打印请求的响应状态，200 时表示成功
 8    print "网页请求状态：", response.getcode()
 9    # read() 读取网页源代码内容
10    content = response.read()
11    # 打印源代码内容
12    print "源代码内容：", content
13    # 获取源代码内容的字符串长度
14    print "网页字符串长度：", len(content)

3、使用 Request 对象添加请求头，模拟浏览器实现网页下载

 1def use_request(url):
 2    if url is None:
 3        # 定义爬虫的 url
 4        url = "http://www.baidu.com"
 5    # 构造 request 请求
 6    request = urllib2.Request(url)
 7    # 添加构造请求头
 8    request.add_header("user-agent", "Mozilla/5.0")
 9    # 打开网页下载链接
10    response = urllib2.urlopen(request)
11    # 打印请求的响应状态，200 时表示成功
12    print "网页请求状态：", response.getcode()
13    # read() 读取网页源代码内容
14    content = response.read()
15    # 打印源代码内容
16    print "源代码内容：", content
17    # 获取源代码内容的字符串长度
18    print "网页字符串长度：", len(content)

4、build_opener() 函数加入网页下载器处理 cookie 的能力

 1def use_build_opener(url):
 2    if url is None:
 3        # 定义爬虫的 url
 4        url = "http://www.baidu.com"
 5    # 获取 cookie 对象
 6    cookie = cookielib.CookieJar()
 7    # 将 cookie 处理能力加入 opener
 8    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
 9    # 安装 opener
10    urllib2.install_opener(opener)
11    # 打开下载链接
12    response = urllib2.urlopen(url)
13    # 打印请求的响应状态，200 时表示成功
14    print "网页请求状态：", response.getcode()
15    # read() 读取网页源代码内容
16    content = response.read()
17    # 打印源代码内容
18    print "源代码内容：", content
19    # 获取源代码内容的字符串长度
20    print "网页字符串长度：", len(content)
21    # 打印 cookie 信息
22    print "cookie 信息",cookie