# -*- coding: utf-8 -*-
"""Scrapy spider for the Dongguan (Guangdong) Finance Bureau disclosure pages.

Uses PhantomJS (via Selenium) to render the JS-driven article list, then
issues one Scrapy request per article and scrapes the body in parse_item.
"""
import datetime
import re
import time
from datetime import timedelta, date
from time import strftime, localtime

from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider, Rule
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from gov_crawler.items import ZBGDDongGuangCZItem


class ZBGDDongGuangCZBSpider(CrawlSpider):
    """Crawls jtj.dg.gov.cn announcement listings and article pages."""

    name = 'ZBGDDongGuangCZ'
    allowed_domains = ['jtj.dg.gov.cn']
    start_urls = [
        'http://jtj.dg.gov.cn/publicfiles/business/htmlfiles/dgcz/s25100/index.htm',
    ]

    def parse_item(self, response):
        """Fill the article body into the item carried via response.meta.

        :param response: article page response; meta['item'] holds the
            partially-populated ZBGDDongGuangCZItem from parse().
        :returns: the completed item.
        """
        sel = Selector(response)
        item = response.meta['item']
        htmlcontent = sel.xpath('//div[@class="center_nrbf"]').extract()
        item['htmlcontent'] = htmlcontent[0] if htmlcontent else ''
        print('htmlcountent is:' + item['htmlcontent'])
        # "Guangdong Dongguan Finance Bureau" — fixed source label for this spider.
        source = "广东东莞财政局"
        item['source'] = source.encode('utf-8')
        return item

    def parse(self, response):
        """Render the listing page with PhantomJS and yield one Request per article.

        Each row of div#pageSize_12 is a one-row table whose cells hold the
        article link (cell 1) and the publish date (cell 2).
        """
        articles = []
        driver = webdriver.PhantomJS()
        try:
            driver.get(self.start_urls[0])
            container = driver.find_element_by_xpath('//div[@id="pageSize_12"]')
            rows = container.find_elements_by_xpath('div')
            print("dsfdsfds:" + str(len(rows)))
            for row in rows:
                item = ZBGDDongGuangCZItem()
                cells = row.find_elements_by_xpath('table/tbody/tr/td')
                # Article URL.
                url = cells[1].find_element_by_xpath('a').get_attribute('href')
                item['url'] = url.strip() if url else ''
                # Publish date.
                pub = cells[2].find_element_by_xpath('span/span/*').text
                item['publishTime'] = pub.strip() if pub else ''
                # Article title.
                title = cells[1].find_element_by_xpath('a/*').text
                item['title'] = title.strip() if title else ''
                articles.append(item)
        finally:
            # Fix: the original never quit the driver, leaking a phantomjs
            # process on every crawl.
            driver.quit()
        for item in articles:
            yield Request(item['url'], meta={'item': item},
                          callback=self.parse_item)
|
# Source: yygy321 > "python scrapy爬虫" (python scrapy crawler collection)