Scrapy is arguably the most powerful crawling framework there is. Its strength lies in its high extensibility and loose coupling, which let users modify and extend it with very little effort.
It ships with several spider base classes — scrapy.Spider, CrawlSpider, and, from the scrapy-redis extension, RedisSpider and RedisCrawlSpider (distributed deep crawling) — which provide the internal logic for plain crawling, deep link-following crawling, and distributed crawling respectively. Below we study each one from the source code down to its usage.

scrapy.Spider source:

'''
Base class for Scrapy spiders

See documentation in docs/topics/spiders.rst
'''
import logging
import warnings

from scrapy import signals
from scrapy.http import Request
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import url_is_from_spider
from scrapy.utils.deprecate import create_deprecated_class
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.deprecate import method_is_overridden


class Spider(object_ref):
    '''Base class for scrapy spiders. All spiders must inherit from this
    class.
    '''

    name = None
    custom_settings = None

    def __init__(self, name=None, **kwargs):
        if name is not None:
            self.name = name
        elif not getattr(self, 'name', None):
            raise ValueError('%s must have a name' % type(self).__name__)
        self.__dict__.update(kwargs)
        if not hasattr(self, 'start_urls'):
            self.start_urls = []

    @property
    def logger(self):
        logger = logging.getLogger(self.name)
        return logging.LoggerAdapter(logger, {'spider': self})

    def log(self, message, level=logging.DEBUG, **kw):
        '''Log the given message at the given log level

        This helper wraps a log call to the logger within the spider, but you
        can use it directly (e.g. Spider.logger.info('msg')) or use any other
        Python logger too.
        '''
        self.logger.log(level, message, **kw)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def set_crawler(self, crawler):
        warnings.warn('set_crawler is deprecated, instantiate and bound the '
                      'spider to this crawler with from_crawler method '
                      'instead.',
                      category=ScrapyDeprecationWarning, stacklevel=2)
        assert not hasattr(self, 'crawler'), 'Spider already bounded to a ' \
                                             'crawler'
        self._set_crawler(crawler)

    def _set_crawler(self, crawler):
        self.crawler = crawler
        self.settings = crawler.settings
        crawler.signals.connect(self.close, signals.spider_closed)

    def start_requests(self):
        cls = self.__class__
        if method_is_overridden(cls, Spider, 'make_requests_from_url'):
            warnings.warn(
                "Spider.make_requests_from_url method is deprecated; it "
                "won't be called in future Scrapy releases. Please "
                "override Spider.start_requests method instead (see %s.%s)." % (
                    cls.__module__, cls.__name__
                ),
            )
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
        else:
            for url in self.start_urls:
                yield Request(url, dont_filter=True)

    def make_requests_from_url(self, url):
        ''' This method is deprecated. '''
        return Request(url, dont_filter=True)

    def parse(self, response):
        raise NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__))

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')

    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)

    @staticmethod
    def close(spider, reason):
        closed = getattr(spider, 'closed', None)
        if callable(closed):
            return closed(reason)

    def __str__(self):
        return '<%s %r at 0x%0x>' % (type(self).__name__, self.name, id(self))

    __repr__ = __str__


BaseSpider = create_deprecated_class('BaseSpider', Spider)


class ObsoleteClass(object):
    def __init__(self, message):
        self.message = message

    def __getattr__(self, name):
        raise AttributeError(self.message)

spiders = ObsoleteClass(
    "'from scrapy.spider import spiders' no longer works - use "
    "'from scrapy.spiderloader import SpiderLoader' and instantiate "
    "it with your project settings'"
)

# Top-level imports
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.spiders.feed import XMLFeedSpider, CSVFeedSpider
from scrapy.spiders.sitemap import SitemapSpider
The members worth paying attention to are name (the spider's name), start_urls (the list of URLs the crawl starts from), allowed_domains (the domains the crawled URLs are restricted to) and start_requests (the method that starts the crawl). name, start_urls and allowed_domains are attributes that are already filled in when the project is generated, so a small edit is usually all they need. start_requests is the method that kicks off crawling: by default it simply iterates over start_urls and yields a Request for each URL. When a site requires logging in, you can override this method; a minimal sketch follows.
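For example, a spider that logs in before crawling might override start_requests() along these lines. This is only a sketch: the login URL, form field names, credentials and the after_login callback are placeholders, not part of the original text.

import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login_demo'
    start_urls = ['http://example.com/needs-login/']   # crawled only after login

    def start_requests(self):
        # Instead of requesting start_urls directly, submit a login form first.
        # URL and form fields are placeholders for illustration.
        yield scrapy.FormRequest(
            'http://example.com/login',
            formdata={'username': 'user', 'password': 'pass'},
            callback=self.after_login,
        )

    def after_login(self, response):
        # Once the session cookie is set, fall back to the normal start URLs.
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        self.logger.info('Crawling %s as a logged-in user', response.url)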
CrawlSpider is the deep crawler: based on its link-extraction rules it automatically collects the links on a page that match those rules, requests and parses them, extracts links from the results in turn, and so keeps crawling deeper. Source:

'''
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.

See documentation in docs/topics/spiders.rst
'''
import copy
import six

from scrapy.http import Request, HtmlResponse
from scrapy.utils.spider import iterate_spider_output
from scrapy.spiders import Spider


def identity(x):
    return x


class Rule(object):

    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None,
                 process_links=None, process_request=identity):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request
        if follow is None:
            self.follow = False if callback else True
        else:
            self.follow = follow


class CrawlSpider(Spider):

    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        return []

    def process_results(self, response, results):
        return results

    def _build_request(self, rule, link):
        r = Request(url=link.url, callback=self._response_downloaded)
        r.meta.update(rule=rule, link_text=link.text)
        return r

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def _response_downloaded(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _compile_rules(self):
        def get_method(method):
            if callable(method):
                return method
            elif isinstance(method, six.string_types):
                return getattr(self, method, None)

        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule.callback = get_method(rule.callback)
            rule.process_links = get_method(rule.process_links)
            rule.process_request = get_method(rule.process_request)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider

    def set_crawler(self, crawler):
        super(CrawlSpider, self).set_crawler(crawler)
        self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
CrawlSpider inherits from Spider, so it keeps all of the usual attributes and methods, and adds a rules attribute (the collection of link-extraction rules). The important difference is that CrawlSpider implements the parse() callback internally, so that name must not be used in your own spider:

def parse(self, response):
    return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

It also provides an overridable hook, parse_start_url(), for handling the responses of the start URLs themselves; a minimal sketch follows.
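As an illustration (not from the original text), overriding parse_start_url() lets a CrawlSpider extract data from the start pages while the rules keep driving the deeper crawl; the URL pattern and field names here are hypothetical.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class StartPageSpider(CrawlSpider):
    name = 'start_page_demo'
    start_urls = ['http://example.com/']

    rules = (
        # Follow category links and parse them with parse_category.
        Rule(LinkExtractor(allow=r'/category/'), callback='parse_category', follow=True),
    )

    def parse_start_url(self, response):
        # Called for every response in start_urls; CrawlSpider's internal
        # parse() feeds the start responses through this hook.
        yield {'title': response.css('title::text').extract_first()}

    def parse_category(self, response):
        yield {'url': response.url}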
rules : this attribute holds one or more Rule objects. Each Rule defines a particular behaviour for crawling the site. If several rules match the same link, the first one, in the order they are defined in this attribute, is the one used.

class scrapy.spiders.Rule(
    link_extractor,
    callback=None,
    cb_kwargs=None,
    follow=None,
    process_links=None,
    process_request=None
)
link_extractor : a Link Extractor object that defines which links to extract (see Link Extractors below).

callback : called for every link extracted by link_extractor; the callback receives the response as its first argument.

Note: avoid using parse as the callback when writing crawl rules. Since CrawlSpider uses the parse method to implement its own logic, overriding parse will make the crawl spider fail.

follow : a boolean specifying whether links extracted from responses matched by this rule should be followed. If callback is None, follow defaults to True; otherwise it defaults to False.

process_links : names a function in the spider that will be called with the list of links extracted by link_extractor; it is mainly used to filter links.

process_request : names a function in the spider that will be called with every request extracted by this rule (used to filter or modify requests). A short sketch using both hooks follows this list.
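A minimal sketch (not from the original text) of a Rule that uses both hooks; drop_ads and tag_request are hypothetical helper names.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class FilteredSpider(CrawlSpider):
    name = 'filtered_demo'
    start_urls = ['http://example.com/']

    rules = (
        Rule(
            LinkExtractor(allow=r'/article/'),
            callback='parse_article',
            follow=True,
            process_links='drop_ads',       # filter the extracted Link objects
            process_request='tag_request',  # tweak each Request before scheduling
        ),
    )

    def drop_ads(self, links):
        # Keep only links that do not look like ad redirects.
        return [link for link in links if 'ad.' not in link.url]

    def tag_request(self, request):
        # Returning None here would drop the request entirely.
        request.meta['tagged'] = True
        return request

    def parse_article(self, response):
        yield {'url': response.url}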
Link Extractors

class scrapy.linkextractors.LinkExtractor

The purpose of Link Extractors is simple: to extract links. Each LinkExtractor has a single public method, extract_links(), which receives a Response object and returns a list of scrapy.link.Link objects. A Link Extractor is instantiated once, and extract_links is then called repeatedly on different responses to extract their links.

class scrapy.linkextractors.LinkExtractor(
    allow=(),
    deny=(),
    allow_domains=(),
    deny_domains=(),
    deny_extensions=None,
    restrict_xpaths=(),
    tags=('a', 'area'),
    attrs=('href',),
    canonicalize=True,
    unique=True,
    process_value=None
)
Main parameters:

allow : only URLs matching the regular expression(s) given here are extracted; if empty, all links match.

deny : URLs matching this regular expression (or list of regular expressions) are excluded and will never be extracted.

allow_domains : only links in these domains are extracted.

deny_domains : links in these domains are never extracted.

restrict_xpaths : XPath expressions that restrict which regions of the page links are extracted from, working together with allow. A standalone sketch of extract_links() follows.
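To see what a Link Extractor does on its own, outside a CrawlSpider, here is a minimal sketch; the HTML snippet and URL are made up for illustration.

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = b'<a href="/items/1">one</a> <a href="/about">about</a>'
response = HtmlResponse(url='http://example.com/', body=html, encoding='utf-8')

# Instantiate once, then call extract_links() on as many responses as needed.
extractor = LinkExtractor(allow=r'/items/')
for link in extractor.extract_links(response):
    print(link.url, link.text)   # scrapy.link.Link objects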
Example:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TestSpider(CrawlSpider):
    name = 'Test'
    allowed_domains = ['Test.com']
    start_urls = ['http://Test.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_test', follow=True),
    )

    def parse_test(self, response):
        items = {}
        # ············ populate items from the response
        return items
RedisSpider and RedisCrawlSpider

Scrapy-redis provides the following four components:

Scheduler (the scheduler)

Duplication Filter (request deduplication)

Item Pipeline (the data pipeline)

Base Spider (the spider base classes)
Scheduler:

In Scrapy, the component directly tied to the "queue of requests to crawl" is the Scheduler. It is responsible for enqueuing new requests (adding them to the Scrapy queue), popping the next request to crawl (taking it off the Scrapy queue), and so on. Internally it organises the pending queue as a dictionary keyed by priority, for example:

{
    priority 0 : queue 0,
    priority 1 : queue 1,
    priority 2 : queue 2,
}

A request's priority decides which queue it goes into, and requests are popped starting from the lowest priority value. The Scheduler has to provide a set of methods to manage this fairly elaborate queue dictionary, but because the stock Scheduler keeps its queues in local memory it cannot be shared across machines, so Scrapy-redis's scheduler component is used instead; a sketch of the settings that switch it in follows.
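As a sketch (not from the original text), these are the settings.py entries commonly used to route scheduling and deduplication through the scrapy-redis components; the Redis URL is a placeholder.

# settings.py — route scheduling and deduplication through Redis (sketch).
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True               # keep the queue and fingerprints between runs
REDIS_URL = 'redis://127.0.0.1:6379'   # placeholder; point at your Redis server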
Duplication Filter:

Scrapy deduplicates requests with a set: the fingerprint of every request already sent is stored in a set, and the fingerprint of each new request is checked against it. If the fingerprint is already in the set, the request has been sent before; if not, processing continues. The core of this duplicate check is implemented like this:

def request_seen(self, request):
    # self.fingerprints is the set of fingerprints already seen
    fp = self.request_fingerprint(request)
    # this is the core deduplication check
    if fp in self.fingerprints:
        return True
    self.fingerprints.add(fp)
    if self.file:
        self.file.write(fp + os.linesep)
In scrapy-redis, deduplication is handled by the Duplication Filter component, which exploits the no-duplicates property of a Redis set to implement the filter elegantly. The scrapy-redis scheduler accepts requests from the engine, stores each request's fingerprint in a Redis set to check for duplicates, and pushes the non-duplicate requests onto the Redis request queue. When the engine asks for a request (one originally issued by the spider), the scheduler pops a request from the Redis request queue according to priority and returns it to the engine, which hands it to the spider for processing.

Item Pipeline:
The engine passes the scraped items (returned by the spider) to the Item Pipeline, and scrapy-redis's Item Pipeline stores them in a Redis items queue. With this modified pipeline it is easy to pull items from the items queue by key, which makes a cluster of item-processing workers possible. The pipeline is enabled in the project settings; a sketch follows.
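A minimal sketch (not from the original text) of enabling the scrapy-redis pipeline in settings.py; the priority value 300 is an arbitrary example.

# settings.py — store scraped items in Redis via scrapy-redis (sketch).
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,   # priority is an example value
}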
Base Spider:

Scrapy's original Spider class is no longer used directly: the rewritten RedisSpider inherits from both Spider and RedisMixin, where RedisMixin is the class that reads URLs from Redis. When a spider inheriting from RedisSpider starts up, setup_redis() is called; it connects to the Redis database and then wires up the signals:

from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider
from . import connection, defaults
from .utils import bytes_to_str


class RedisMixin(object):
    '''Mixin class to implement reading urls from a redis queue.'''
    redis_key = None
    redis_batch_size = None
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        '''Returns a batch of start requests from redis.'''
        return self.next_requests()

    def setup_redis(self, crawler=None):
        '''Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        '''
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError('crawler is required')

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError('redis_key must not be empty')

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError('redis_batch_size must be an integer')

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        '''Returns a request to be scheduled or none.'''
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug('Request not made from data: %r', data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        '''Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        '''
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)

    def schedule_next_requests(self):
        '''Schedules a request if available'''
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        '''Schedules a request if available, otherwise waits.'''
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider


class RedisSpider(RedisMixin, Spider):
    '''Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from..
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: '<spider.name>:start_urls')
        Default Redis key where to fetch start URLs from..
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieve using the LPOP command.
    REDIS_ENCODING : str (default: 'utf-8')
        Default encoding to use when decoding messages from redis queue.

    '''

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj


class RedisCrawlSpider(RedisMixin, CrawlSpider):
    '''Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from..
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: '<spider.name>:start_urls')
        Default Redis key where to fetch start URLs from..
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: True)
        Use SET operations to retrieve messages from the redis queue.
    REDIS_ENCODING : str (default: 'utf-8')
        Default encoding to use when decoding messages from redis queue.

    '''

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
The scrapy-redis package provides not only RedisSpider but also RedisCrawlSpider, which combines distributed crawling with CrawlSpider's rule-based deep crawling; the remaining Redis distributed components will be covered one by one later. These Redis spider classes add a redis_key attribute: the Redis key from which the spider fetches its start URLs.

Example:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
class TestSpider(RedisCrawlSpider):
    name = 'test'
    allowed_domains = ['www.']
    redis_key = 'testspider:start_urls'

    rules = [
        # follow the link to every listing page
        Rule(link_extractor=LinkExtractor(allow=(r'/?page=\d+',))),
        # follow the link to each company's detail page
        Rule(link_extractor=LinkExtractor(allow=(r'/\d+',)), callback='parse_item'),
    ]

    def parse_item(self, response):
        item = {}
        # ······ populate the item from the response
        return item
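To start the crawl, URLs have to be pushed into the Redis key the spider listens on. A minimal sketch using the redis-py client; the host and URL are placeholders, and with REDIS_START_URLS_AS_SET enabled you would use sadd instead of lpush.

import redis

# Connect to the same Redis instance the spider uses (placeholder address).
r = redis.StrictRedis(host='127.0.0.1', port=6379)

# Push a start URL onto the list the spider reads from (its redis_key).
r.lpush('testspider:start_urls', 'http://www.example.com/')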
Further configuration details are not covered here; later posts will continue with a deeper analysis of some of these components.