# -*- coding: utf-8 -*-
"""Scrapy spider for the Dongguan (Guangdong) Finance Bureau disclosure pages.

Uses PhantomJS (via Selenium) to render the JS-driven article list, then
issues one Scrapy request per article and scrapes the body in parse_item.
"""
import datetime
import re
import time
from datetime import timedelta, date
from time import strftime, localtime

from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from gov_crawler.items import ZBGDDongGuangCZItem


class ZBGDDongGuangCZBSpider(CrawlSpider):
    """Crawls jtj.dg.gov.cn announcement listings and article pages."""

    name = 'ZBGDDongGuangCZ'
    allowed_domains = ['jtj.dg.gov.cn']
    start_urls = [
        'http://jtj.dg.gov.cn/publicfiles/business/htmlfiles/dgcz/s25100/index.htm',
    ]

    def parse_item(self, response):
        """Fill the article body into the item carried via response.meta.

        :param response: article page response; meta['item'] holds the
            partially-populated ZBGDDongGuangCZItem from parse().
        :returns: the completed item.
        """
        sel = Selector(response)
        item = response.meta['item']
        htmlcontent = sel.xpath('//div[@class="center_nrbf"]').extract()
        item['htmlcontent'] = htmlcontent[0] if htmlcontent else ''
        print('htmlcountent is:' + item['htmlcontent'])
        # "Guangdong Dongguan Finance Bureau" — fixed source label for this spider.
        source = "广东东莞财政局"
        item['source'] = source.encode('utf-8')
        return item

    def parse(self, response):
        """Render the listing page with PhantomJS and yield one Request per article.

        Each row of div#pageSize_12 is a one-row table whose cells hold the
        article link (cell 1) and the publish date (cell 2).
        """
        articles = []
        driver = webdriver.PhantomJS()
        try:
            driver.get(self.start_urls[0])
            container = driver.find_element_by_xpath('//div[@id="pageSize_12"]')
            rows = container.find_elements_by_xpath('div')
            print("dsfdsfds:" + str(len(rows)))
            for row in rows:
                item = ZBGDDongGuangCZItem()
                cells = row.find_elements_by_xpath('table/tbody/tr/td')
                # Article URL.
                url = cells[1].find_element_by_xpath('a').get_attribute('href')
                item['url'] = url.strip() if url else ''
                # Publish date.
                pub = cells[2].find_element_by_xpath('span/span/*').text
                item['publishTime'] = pub.strip() if pub else ''
                # Article title.
                title = cells[1].find_element_by_xpath('a/*').text
                item['title'] = title.strip() if title else ''
                articles.append(item)
        finally:
            # Fix: the original never quit the driver, leaking a phantomjs
            # process on every crawl.
            driver.quit()
        for item in articles:
            yield Request(item['url'], meta={'item': item},
                          callback=self.parse_item)
|
# Source: yygy321 > "python scrapy爬虫" (python scrapy crawler collection)