分享

js生成页面处理方法(2)

 yygy321 2015-09-10
from selenium.webdriver.common.keys import Keys
from selenium import webdriver

导入这两个模块:案例
# -*- coding: utf-8 -*-
from scrapy.selector import Selector
#from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from gov_crawler.items import ZBGDDongGuangCZItem
from scrapy.http import Request
from selenium import webdriver
from datetime import timedelta, date
from time import strftime, localtime
from selenium.webdriver.common.keys import Keys
import datetime
import time
import re

class ZBGDDongGuangCZBSpider(CrawlSpider):
    """Spider for the Dongguan (Guangdong) Finance Bureau notice index.

    The index page is generated by JavaScript, so `parse` renders it with
    PhantomJS, extracts one (url, publishTime, title) per row, and then
    schedules a normal Scrapy request per article; `parse_item` fills in
    the article body.
    """
    name = 'ZBGDDongGuangCZ'
    allowed_domains = ['jtj.dg.gov.cn']
    start_urls = [
        'http://jtj.dg.gov.cn/publicfiles/business/htmlfiles/dgcz/s25100/index.htm',
    ]

    def parse_item(self, response):
        """Populate the item (from response.meta) with the article HTML body.

        :param response: article-page response carrying ``meta['item']``
        :returns: the completed item
        """
        sel = Selector(response)
        i = response.meta['item']
        htmlcontent = sel.xpath('//div[@class="center_nrbf"]').extract()
        i['htmlcontent'] = htmlcontent[0] if htmlcontent else ''
        # NOTE: must be a unicode literal — in Python 2, calling .encode()
        # on a non-ASCII byte string raises UnicodeDecodeError.
        i['source'] = u"广东东莞财政局".encode('utf-8')
        return i

    def parse(self, response):
        """Render the JS-built index with PhantomJS and yield article Requests.

        :param response: the (unrendered) index-page response; only
            ``start_urls[0]`` is actually loaded in the browser.
        :yields: one Request per listed article, with the partially-filled
            item passed along in ``meta['item']``
        """
        driver = webdriver.PhantomJS()
        try:
            driver.get(self.start_urls[0])
            container = driver.find_element_by_xpath('//div[@id="pageSize_12"]')
            for news in container.find_elements_by_xpath('div'):
                i = ZBGDDongGuangCZItem()
                cells = news.find_elements_by_xpath('table/tbody/tr/td')
                # Article link: get_attribute may return None, so guard
                # before stripping instead of calling len() on it.
                url = cells[1].find_element_by_xpath('a').get_attribute('href')
                i['url'] = url.strip() if url else ''
                # Publish date.
                pub = cells[2].find_element_by_xpath('span/span/*').text
                i['publishTime'] = pub.strip() if pub else ''
                # Title.
                title = cells[1].find_element_by_xpath('a/*').text
                i['title'] = title.strip() if title else ''
                yield Request(i['url'], meta={'item': i},
                              callback=self.parse_item)
        finally:
            # Always shut the headless browser down, or each crawl leaks
            # a PhantomJS process.
            driver.quit()
   

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多