import requests
from fake_useragent import UserAgent

# Running counter used to build the names of saved image files.
filename = 0


class photo_spider(object):
    """Skeleton of the photo crawler: stores the paged API URL and request headers."""

    def __init__(self):
        # API endpoint template; it contains a ``page={}`` placeholder.
        # NOTE(review): the host part ('https://api./') looks incomplete --
        # confirm the intended API domain before running.
        self.url = 'https://api./v1/photos?rpp=50&feature=popular&image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14&sort=&include_states=true&include_licensing=true&formats=jpeg%2Clytro&only=&exclude=&personalized_categories=&page={}&rpp=50'
        # NOTE(review): ``verify_ssl`` may not be accepted by every
        # fake_useragent release -- confirm against the installed version.
        ua = UserAgent(verify_ssl=False)
        # Pick one random User-Agent. The previous 99-iteration loop only
        # overwrote the same attribute, so a single assignment is equivalent.
        self.headers = {'User-Agent': ua.random}

    def main(self):
        """Placeholder entry point; does nothing yet."""
        # Was spelled ``mian``, which made ``spider.main()`` below raise
        # AttributeError.
        pass


if __name__ == '__main__':
    spider = photo_spider()
    spider.main()
2.发送请求,获取网页。
def get_html(self, url, timeout=10):
    """Request *url* and return the decoded JSON body.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The object produced by ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    response = requests.get(url, headers=self.headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error body as data.
    response.raise_for_status()
    # The endpoint returns JSON rather than an HTML page.
    return response.json()
3.获取图片的链接地址,并将图片保存到本地文件夹。
def get_imageUrl(self, html, save_dir='F:/pycharm文件/photo/', timeout=10):
    """Download one image per entry of ``html['photos']`` into *save_dir*.

    For every entry, the link at index 8 of its ``'image_url'`` value is
    fetched and written to ``<save_dir>/<filename>.jpg``. The module-level
    ``filename`` counter is incremented after each saved file.

    Args:
        html: Decoded JSON object containing a ``'photos'`` list.
        save_dir: Target directory; created if it does not exist.
        timeout: Seconds to wait for each image request.

    Raises:
        KeyError: If ``'photos'`` or ``'image_url'`` is missing.
        IndexError: If an entry has fewer than nine image links.
        requests.HTTPError: If an image request returns a 4xx/5xx status.
    """
    import os

    global filename

    photos = html['photos']
    if not photos:
        # Nothing to download; avoid touching the file system.
        return

    # open() does not create missing directories, so make sure it exists.
    os.makedirs(save_dir, exist_ok=True)

    for photo in photos:
        # 'image_url' holds several links; index 8 is the one that is saved.
        image_link = photo['image_url'][8]
        reply = requests.get(image_link, headers=self.headers, timeout=timeout)
        # Do not store an error body under a .jpg name.
        reply.raise_for_status()
        target = os.path.join(save_dir, str(filename) + '.jpg')
        with open(target, 'wb') as f:
            f.write(reply.content)
        filename += 1
这里说明一下:每条数据的image_url字段里有多个图片链接,所以代码中用imageUrl=image_url[8]取其中的一个链接。
4.获取多页及函数调用。
def main(self):
    """Prompt for a first and last page, then crawl every page in between.

    Both bounds are read from standard input and are inclusive. For each
    page the URL template is filled in, the JSON is fetched with
    ``get_html`` and the images are saved with ``get_imageUrl``.
    """
    first_page = int(input('输入开始页:'))
    last_page = int(input('输入结束页:'))

    page = first_page
    while page <= last_page:
        print('第%s页内容' % page)
        # Fill the ``{}`` placeholder of the template with the page number.
        page_url = self.url.format(page)
        self.get_imageUrl(self.get_html(page_url))
        print('第%s页爬取完成' % page)
        page += 1
运行结果
打开本地文件夹F:/pycharm文件/photo/
完整代码
import os

import requests
from fake_useragent import UserAgent

# Running counter used to build the names of saved image files.
filename = 0


class photo_spider(object):
    """Crawler that pages through a photo API and saves one image per entry."""

    def __init__(self):
        # API endpoint template; ``page={}`` is filled in by main().
        # NOTE(review): the host part ('https://api./') looks incomplete --
        # confirm the intended API domain before running.
        self.url = 'https://api./v1/photos?rpp=50&feature=popular&image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14&sort=&include_states=true&include_licensing=true&formats=jpeg%2Clytro&only=&exclude=&personalized_categories=&page={}&rpp=50'
        # NOTE(review): ``verify_ssl`` may not be accepted by every
        # fake_useragent release -- confirm against the installed version.
        ua = UserAgent(verify_ssl=False)
        # Pick one random User-Agent. The previous 99-iteration loop only
        # overwrote the same attribute, so a single assignment is equivalent.
        self.headers = {'User-Agent': ua.random}

    def get_html(self, url, timeout=10):
        """Request *url* and return the decoded JSON body.

        Args:
            url: Address to fetch with an HTTP GET.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The object produced by ``response.json()``.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Fail loudly on an error status instead of parsing an error body.
        response.raise_for_status()
        return response.json()

    def get_imageUrl(self, html, save_dir='F:/pycharm文件/photo/', timeout=10):
        """Download one image per entry of ``html['photos']`` into *save_dir*.

        For every entry, the link at index 8 of its ``'image_url'`` value is
        fetched and written to ``<save_dir>/<filename>.jpg``. The
        module-level ``filename`` counter is incremented after each file.

        Args:
            html: Decoded JSON object containing a ``'photos'`` list.
            save_dir: Target directory; created if it does not exist.
            timeout: Seconds to wait for each image request.

        Raises:
            KeyError: If ``'photos'`` or ``'image_url'`` is missing.
            IndexError: If an entry has fewer than nine image links.
            requests.HTTPError: If an image request returns a 4xx/5xx status.
        """
        global filename

        photos = html['photos']
        if not photos:
            # Nothing to download; avoid touching the file system.
            return

        # open() does not create missing directories.
        os.makedirs(save_dir, exist_ok=True)

        for photo in photos:
            # 'image_url' holds several links; index 8 is the one saved.
            image_link = photo['image_url'][8]
            reply = requests.get(image_link, headers=self.headers, timeout=timeout)
            # Do not store an error body under a .jpg name.
            reply.raise_for_status()
            target = os.path.join(save_dir, str(filename) + '.jpg')
            with open(target, 'wb') as f:
                f.write(reply.content)
            filename += 1

    def main(self):
        """Prompt for an inclusive page range and crawl each page in order."""
        start = int(input('输入开始:'))
        end = int(input('输入结束页:'))
        for page in range(start, end + 1):
            print('第%s页' % page)
            # Fill the ``{}`` placeholder of the template with the page number.
            url = self.url.format(page)
            html = self.get_html(url)
            self.get_imageUrl(html)


if __name__ == '__main__':
    spider = photo_spider()
    spider.main()