import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'}

# Try each proxy in the pool against Baidu with a 1-second timeout.
for ip in http_ip_pool:
    try:
        r = requests.get("http://www.baidu.com/",
                         headers=headers, proxies={'http': ip}, timeout=1)
        print(r.status_code)
    except Exception as e:
        print(e)

Output:
HTTPConnectionPool(host='139.9.25.69', port=3128): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000028D6A993248>, 'Connection to 139.9.25.69 timed out. (connect timeout=1)'))
200
200
HTTPConnectionPool(host='106.58.191.24', port=8888): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000028D6B59C2C8>, 'Connection to 106.58.191.24 timed out. (connect timeout=1)'))
200
HTTPConnectionPool(host='47.111.71.29', port=8081): Max retries exceeded with url: http://www.baidu.com/ (Caused by ProxyError('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response')))
200
200
HTTPConnectionPool(host='106.58.191.218', port=8888): Max retries exceeded with url: http://www.baidu.com/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000028D6A9C2F88>, 'Connection to 106.58.191.218 timed out. (connect timeout=1)'))
200
As you can see, only about half of the proxy IPs managed to reach Baidu within the time limit, which shows that free proxy IPs are simply not as stable as paid ones.
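Since only some proxies respond in time, it can help to keep just the live ones. Here is a minimal sketch (reusing the requests session, headers, and http_ip_pool list from above; the live_pool name is my own) that filters the pool down to the proxies that answered with a 200:

# Keep only the proxies that returned a 200 within the timeout.
live_pool = []
for ip in http_ip_pool:
    try:
        r = requests.get("http://www.baidu.com/",
                         headers=headers, proxies={'http': ip}, timeout=1)
        if r.status_code == 200:
            live_pool.append(ip)
    except Exception:
        pass  # timeouts and proxy errors simply drop the IP

print(f"{len(live_pool)}/{len(http_ip_pool)} proxies are usable")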
Periodically crawl new proxy IPs and purge the stale ones

Once this part is done, our proxy IP pool will have taken its first real shape. Let's go!

The final, complete code:
import requests
import pandas as pd
import redis
from threading import Timer
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'}


def proxie_ip_validity_check(proxie_ip, proxie_type='http', timeout=2):
    """Return True if the proxy answers within `timeout` seconds."""
    try:
        # NOTE: the endpoint's host was stripped from the original post;
        # plug in any IP-echo service (an httpbin-style /ip endpoint) here.
        r = requests.get(f"{proxie_type}:///ip",
                         proxies={proxie_type: proxie_ip}, timeout=timeout)
        return r.status_code == 200
    except Exception:
        return False


redis_conn = redis.Redis(host='192.168.3.31', port=6379)


def crawl_proxy_ip2redis():
    for page in range(1, 20):
        print("Crawling page:", page)
        # NOTE: the proxy-list site's domain was stripped from the original post.
        url = f'https://www./free/intr/{page}'
        r = requests.get(url, headers=headers)
        r.encoding = 'u8'  # 'u8' is an alias for utf-8
        ip_df, = pd.read_html(r.text)  # the page contains a single table
        # "类型" is the site's column header for the proxy type (http/https)
        for ip, port, proxie_type in ip_df[["IP", "PORT", "类型"]].values:
            proxie_ip = f"{ip}:{port}"
            proxie_type = proxie_type.lower()
            # only store proxies that pass the 2-second liveness check
            if proxie_ip_validity_check(proxie_ip, proxie_type, 2):
                redis_conn.sadd(f"proxie_ip_{proxie_type}", proxie_ip)


def get_proxy_ip_pool(proxie_type='http'):
    http_ip_bytes = redis_conn.smembers(f"proxie_ip_{proxie_type}")
    http_ip_pool = list(map(bytes.decode, http_ip_bytes))
    return http_ip_pool


def clear_invalid_proxie_ip():
    print("Purging invalid proxy IPs:")
    for proxie_type in ['http', 'https']:
        for ip in get_proxy_ip_pool(proxie_type):
            if not proxie_ip_validity_check(ip, proxie_type, 2):
                print(proxie_type, ip, "is dead")
                redis_conn.srem(f"proxie_ip_{proxie_type}", ip)


crawl_proxy_ip2redis()
while True:
    # purge invalid proxy IPs 5 minutes into each cycle
    Timer(5 * 60, clear_invalid_proxie_ip, ()).start()
    # fetch a fresh batch of proxy IPs 10 minutes into each cycle
    Timer(10 * 60, crawl_proxy_ip2redis, ()).start()
    # one full cycle every 10 minutes
    time.sleep(10 * 60)
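Note that each Timer starts its own background thread, so the purge and the crawl run concurrently while the main loop sleeps out the 10-minute cycle.

To put the pool to work, a consumer just pulls the live set out of Redis and picks a proxy per request. A minimal sketch (the fetch_via_random_proxy helper is my own illustration, not part of the original code):

import random

def fetch_via_random_proxy(url, proxie_type='http'):
    # hypothetical helper: pick a random live proxy from the Redis-backed pool
    pool = get_proxy_ip_pool(proxie_type)
    proxy_ip = random.choice(pool)
    return requests.get(url, headers=headers,
                        proxies={proxie_type: proxy_ip}, timeout=2)

r = fetch_via_random_proxy("http://www.baidu.com/")
print(r.status_code)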