
Writing a Python Crawler to Scrape All the Girl Photos from www.tmd86.com

昵称65365553 2019-07-17

Readers with a bit of programming experience know that Python's mature networking libraries make it well suited to writing crawlers and AI programs.

Today I'm sharing code that automatically scrapes the girl photos from the site. At under 100 lines, it's genuinely simple and quick.
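The whole script is built on one pattern: fetch a page with requests, parse it with lxml, and pull out what you need with XPath. A minimal sketch of that pattern (the URL and XPath below are placeholders, not the actual site's markup):

import requests
from lxml import etree

# Fetch a page, build an lxml element tree, extract nodes via XPath.
# The URL and the '//ul/li/a' path are illustrative placeholders.
response = requests.get('http://example.com/gallery/')
html_ele = etree.HTML(response.text)
links = html_ele.xpath('//ul/li/a/@href')
print(links)

The full crawler just nests this pattern three levels deep: category list pages, individual gallery pages, and finally the image files themselves.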

Code begins:

import os

import requests
from lxml import etree


def a():
    # List page of the "xinggan" category. The original post's URL was
    # stripped by the publishing platform ('http://www./xinggan/');
    # the domain below is taken from the article title.
    url = 'http://www.tmd86.com/xinggan/'
    response = requests.get(url)
    html_ele = etree.HTML(response.text)

    # The fourth pagination link holds the total number of list pages.
    max_list = html_ele.xpath('//nav[@class="navigation pagination"]/div/a/text()')[3]

    for i in range(1, int(max_list) + 1):
        z_url = 'http://www.tmd86.com/xinggan/list_{}.html'.format(i)
        response = requests.get(z_url)
        html_ele = etree.HTML(response.text)

        # Each <li> under ul#pins is one photo gallery.
        li_ele_list = html_ele.xpath('//ul[@id="pins"]/li')
        for href_ele in li_ele_list:
            href_url = href_ele.xpath('./a/@href')[0]    # gallery detail page
            print(href_url)
            name = href_ele.xpath('./span/a/text()')[0]  # gallery title
            print(name)
            b(href_url, name)


def b(href_url, name):
    # Save each gallery into a folder named after its title in the
    # current directory (the original wrote to '/' + name, i.e. the
    # filesystem root, which normally fails without root privileges).
    if not os.path.exists(name):
        os.makedirs(name)

    # The site checks the Referer header to block hotlinking, so send
    # the gallery page as the referer along with a browser User-Agent.
    headers = {
        'Referer': str(href_url),
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
    }

    response = requests.get(href_url, headers=headers)
    html_ele = etree.HTML(response.text)

    # The second-to-last pagination link holds this gallery's page count.
    xq_max_list = html_ele.xpath('//div[@class="pagenavi"]/a')[-2]
    max_list = xq_max_list.xpath('./span/text()')[0]

    # range() excludes its end value, so use + 1 to reach the last page
    # (the original's range(1, int(max_list)) skipped it).
    for i in range(1, int(max_list) + 1):
        xq_url = str(href_url) + '/' + str(i)
        print(xq_url)
        response = requests.get(xq_url, headers=headers)
        html_ele = etree.HTML(response.text)

        # The full-size image sits inside div.main-image.
        src_page = html_ele.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
        print(src_page)

        tname = src_page.split('/')[-1]  # reuse the file name from the URL
        print(tname)

        response = requests.get(src_page, headers=headers)
        with open(name + '/' + tname, 'wb') as f:
            f.write(response.content)


if __name__ == '__main__':
    a()


End of code. It's very efficient, so easy.
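One caveat on "efficient": firing requests back to back like this can get your IP blocked. A hedged sketch of a politer download helper you could swap in for the bare requests.get calls (the function name, delay, and retry count are my own choices, not from the original post):

import time

import requests


def polite_get(url, headers=None, delay=1.0, retries=3):
    # Hypothetical helper, not part of the original script: pause
    # between requests, time out slow ones, and retry transient errors.
    for attempt in range(retries):
        time.sleep(delay)
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise

Replacing each requests.get(...) call in a() and b() with polite_get(...) keeps the crawler from hammering the server.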
