import scrapy
import re
from collections import Counter

from lianjia.items import LianjiaItem


class LianjiaSpiderSpider(scrapy.Spider):
    name = 'lianjia_spider'
    # The republished text truncated the domain to 'wh.'; 'wh.lianjia.com' is
    # restored here from the project name (Lianjia's Wuhan site).
    allowed_domains = ['wh.lianjia.com']
    start_urls = ['https://wh.lianjia.com/ershoufang/baibuting/']
    def parse(self, response):
        rsp = response.body.decode("utf-8")  # decoded page source, not used further in this snippet
        # print(response.xpath("//div"))
        info_list = response.xpath("//div//ul//li[@class='clear LOGCLICKDATA']")
        # print(len(info_list))
        # print(info_list)
        for i in info_list:
            # print(i)
            item = LianjiaItem()  # fresh item per listing, instead of reusing one instance
            item["xiaoqu_name"] = i.xpath('.//div[@class="houseInfo"]//a[@target="_blank"]/text()').extract()[0]
            # xiaoqu_link = i.xpath('.//div[@class="houseInfo"]//@href').extract()[0]
            item["name"] = i.xpath('.//div[@class="info clear"]//a/text()').extract()[0]
            item["area"] = i.xpath('.//div[@class="info clear"]//div[@class="positionInfo"]//a/text()').extract()[0]
            item["link"] = i.xpath(".//div[@class='title']//@href").extract()[0]
            # summary: orientation, decoration, whether there is an elevator, etc.
            item["summary"] = i.xpath('.//div[@class="houseInfo"]/text()').extract()[0]
            item["floor"] = i.xpath('.//div[@class="info clear"]//div[@class="positionInfo"]/text()').extract()[0]
            # total price; the "万" unit sits outside this span and still has to be attached
            item["zongjia"] = i.xpath('.//div[@class="info clear"]//div[@class="totalPrice"]//span/text()').extract()[0]
            item["danjia"] = i.xpath('.//div[@class="info clear"]//div[@class="unitPrice"]//span/text()').extract()[0]
            yield item

        # Analysis shows that searching directly under a large district such as Wuchang or
        # Hankou returns at most 30 pages of results, so a complete crawl has to walk every
        # sub-area link one by one.
        area_list = ["baibuting", "dazhilu", "dijiao", "erqi2", "houhu", "huangpuyongqing", "qianjinjianghan", "sanyanglu", "tazihu", "yucaihuaqiao",
                     "changqinglu", "changfengchangmatou", "changganglu", "taibeixiangganglu", "tangjiadun", "wuguangwansongyuan", "xinhualuwanda", "yangchahu",
                     "baofengchongren", "changfengchangmatou", "cbdxibeihu", "gutian", "hanzhengjie", "jixian2", "wujiashan", "zongguan",
                     "changqinghuayuan", "dongxihuqita", "jinyinhu", "jiangjunlu", "baishazhou", "chuhehanjie", "donghudongting", "jiedaokou", "jiyuqiao", "shuiguohu", "shouyi", "shahu",
                     "tuanjiedadao", "wuchanghuochezhan", "xudong", "yangyuan", "zhongbeilu", "zhongnandingziqiao", "zhuodaoquan", "hongshanqita", "qingshan1", "huquanyangjiawan", "luoshinanlu",
                     "laonanhu", "nanhuwoerma", "xinnanhu", "qilimiao", "sixin", "wangjiawan", "zhongjiacun", "guanxichangzhi", "guangguguangchang", "guanshandadao", "guanggunan", "guanggudong",
                     "huakeda", "jinronggang", "minzudadao", "sanhuannan", "canglongdao", "jiangxiaqita", "miaoshan", "wenhuadadao", "caidianqita", "dunkou",
                     "hankoubei", "huangbeiqita", "panlongcheng", "qianchuan", "xinzhouqita", "yangluo"]
        # counter = Counter(area_list)  # check whether the list contains duplicates
        # print(counter)
        # After covering every area, walk pages 0-30 for each of them; only then is the
        # whole site covered, otherwise a large share of the listings is missed.
        for i in area_list:
            for num in range(0, 30):
                yield scrapy.Request("https://wh.lianjia.com/ershoufang/" + i + "/pg" + str(num), callback=self.parse)
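The snippet imports re but never uses it; one natural use, given the comment that the scraped total price still needs its "万" unit handled, is to strip the price strings down to plain numbers. The helper below is a sketch added for illustration, not part of the original post, and the exact raw string formats (a bare number for the total price, something like "18500元/平米" for the unit price) are assumptions.

import re

def parse_price(raw):
    """Return the first number found in a scraped price string as a float.

    Assumed inputs: "152" (total price in units of 万) or "18500元/平米"
    (unit price); the post does not show sample output, so adjust as needed.
    """
    match = re.search(r"\d+(?:\.\d+)?", raw)
    return float(match.group()) if match else None

# e.g. inside parse(), after the fields are filled:
# item["zongjia"] = parse_price(item["zongjia"])   # total price, in 万
# item["danjia"] = parse_price(item["danjia"])     # unit price, in 元/平米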
items.py and pipelines.py need nothing special; the conventional boilerplate is enough. A minimal sketch of each is given below.
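Since the post leaves these two files to the reader, here is one way they could look; none of this is from the source. The item class simply declares the eight fields the spider fills in, and the pipeline (class name following Scrapy's default template for a project named lianjia) dumps each item as a JSON line.

# items.py -- assumed layout, matching the fields used in the spider
import scrapy

class LianjiaItem(scrapy.Item):
    xiaoqu_name = scrapy.Field()   # residential-complex name
    name = scrapy.Field()          # listing title
    area = scrapy.Field()          # district / sub-area
    link = scrapy.Field()          # detail-page URL
    summary = scrapy.Field()       # layout, orientation, decoration, elevator, ...
    floor = scrapy.Field()         # floor information
    zongjia = scrapy.Field()       # total price (万)
    danjia = scrapy.Field()        # unit price

# pipelines.py -- likewise assumed; one "conventional" choice is JSON lines output
import json

class LianjiaPipeline:
    def open_spider(self, spider):
        self.file = open("lianjia.jl", "w", encoding="utf-8")

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()

Remember to enable the pipeline in settings.py, e.g. ITEM_PIPELINES = {"lianjia.pipelines.LianjiaPipeline": 300}.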
Source: http://www./content-4-25501.html