
Scraping 校花网 images with Scrapy

 小猪窝969 2019-01-03
# Spider -- the crawler code
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import MysiteItem
#from scrapy.dupefilter import RFPDupeFilter


class XiaohuaSpider(scrapy.Spider):
    name = 'xiaohua1'
    allowed_domains = ["wx.dxs6.cn", "www.", "www.dxsabc.com"]
    start_urls = ['http://www./hua/']
    #page_set = set()

    def parse(self, response):
        info_list = Selector(response=response).\
            xpath('//div[starts-with(@class,"item_list")]/div')
        for obj in info_list:
            item = MysiteItem()  # create a fresh item for each entry
            name = obj.xpath(".//a/img/@alt").extract_first()
            img = obj.xpath(".//a/img/@src").extract_first()
            # response.urljoin completes a relative link against the current response URL;
            # e.g. a scraped src of update/18883004004.jpg is completed to
            # http://www./update/18883004004.jpg, while an already complete http URL
            # is passed through unchanged (response.follow resolves links the same
            # way -- see the urljoin sketch after the spider code)
            img_request = response.urljoin(img)
            item["url_address"] = img_request
            item["name"] = name
            # hand the item to the pipelines for persistence
            yield item
            #yield response.follow(img, callback=self.parse)  # also completes relative paths automatically

        # collect the pagination links to crawl the site in depth
        page_num = Selector(response=response).xpath('//*[@id="page"]/div/a/@href').extract()

        for url in page_num:
            # if url in self.page_set:
            #     pass
            #     #print(u"url already seen")
            # else:
            #     self.page_set.add(url)
            # the engine hands the Request to the scheduler, which keeps the crawl going
            yield Request(url=url, callback=self.parse)
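
A side note on the commented-out page_set above: Scrapy's scheduler already runs every request through a duplicate filter (the RFPDupeFilter that the commented import at the top refers to), so already-seen URLs are dropped without any manual bookkeeping. A minimal sketch of the opt-out, for the case where a URL should deliberately be fetched again; dont_filter is a standard scrapy.Request argument:

            # inside parse(): bypass the built-in duplicate filter for this one request
            yield Request(url=url, callback=self.parse, dont_filter=True)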

    # def parse_datile(self, response):
    #     print("request------>", response.url)
    #     info_list = Selector(response=response).\
    #         xpath('//div[starts-with(@class,"item_list")]/div')
    #     for obj in info_list:
    #         name = obj.xpath(".//a/img/@alt").extract_first()
    #         img = obj.xpath(".//a/img/@src").extract_first()
    #
    #     for url in self.img_url:
    #         #img_request = response.follow(url, callback=self.parse)
    #         img_request = response.urljoin(url)
    #         yield Request(url=img_request, callback=self.parse)
    #         item = MysiteItem()
    #         item["url_address"] = img_request
    #         yield item
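
As the comments in parse() note, response.urljoin completes a relative link against the URL of the current response. A minimal standalone sketch of the same resolution rules, using urljoin from the standard library (the example.com and other.com domains are placeholders, not URLs from the original post):

from urllib.parse import urljoin

# response.urljoin(path) is equivalent to urljoin(response.url, path)
print(urljoin("http://example.com/hua/", "p2.html"))                 # http://example.com/hua/p2.html
print(urljoin("http://example.com/hua/", "/update/188.jpg"))         # http://example.com/update/188.jpg
print(urljoin("http://example.com/hua/", "http://other.com/a.jpg"))  # complete URLs pass through unchanged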
# Item -- the structured data container

import scrapy


class MysiteItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    url_address = scrapy.Field()
# Pipelines -- data persistence (several pipeline classes may be defined;
# configure their weights in settings, and the smaller numbers run first)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os
import requests


class MyPipeline(object):
    def __init__(self, picture):
        #self.picture = os.path.join(os.path.dirname(os.path.abspath(__file__)), "img_picture")
        self.picture = picture

    # handle each scraped item
    def process_item(self, item, spider):
        response = requests.get(item["url_address"])
        picture_img = os.path.join(self.picture, item["name"] + ".jpg")
        with open(picture_img, "wb") as f_write:
            f_write.write(response.content)
        return item  # pass the item on to any later pipelines

    # runs once, before the crawl starts
    def open_spider(self, spider):
        print("crawl starting ......")
        if not os.path.exists(self.picture):
            os.mkdir(self.picture)

    # runs once, after the crawl finishes
    def close_spider(self, spider):
        print("crawl finished")

    @classmethod
    def from_crawler(cls, crawler):
        picture = crawler.settings.get("IMG_PICTURE")  # read a custom value from the settings file
        return cls(picture)  # cls is the class itself; this instantiates MyPipeline
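
For the pipeline to run at all it has to be registered in the project's settings.py, which is also where the custom IMG_PICTURE value read by from_crawler lives. A minimal sketch, assuming the project module is called mysite (the from ..items import MysiteItem line suggests that name); the weight 300 is an arbitrary choice:

# settings.py
ITEM_PIPELINES = {
    "mysite.pipelines.MyPipeline": 300,  # weight: smaller numbers execute first
}
IMG_PICTURE = "img_picture"  # custom setting, read in MyPipeline.from_crawler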
