配色: 字号:
python-urllib模块【下载图片】
2016-11-10 | 阅:  转:  |  分享 
  
0,python中关于下载的部分总结如下:importurllibif__name__=="__main__":url="http
://www.mntuku.cn"#根据url读取html源码content=urllib.urlopen(url).re
ad()#转为中文可读,可以直接查看当前html源文件是什么编码格式,百度的是gb2312content=content.
decode("gb2312").encode("utf-8")printcontent1,处理A标签字符串:#!/usr/b
in/python#encoding=utf-8importhtmllib,urllib,formatter,string''''''
importchardet,systype=sys.getdefaultencoding()''''''classGetLink
s(htmllib.HTMLParser):#从HTMLParser类中继承def__init__(self):#初始化的
时候调用,将links设置为空。这里的links为字典结构self.links={}#存放[地址->链接]的字典f=
formatter.NullFormatter()#将传输过来的数据不做处理,格式化为数据流htmllib.HTMLParser
.__init__(self,f)defanchor_bgn(self,href,name,type):#锚点标签开
始的时候处理self.save_bgn()self.link=hrefdefanchor_end(self):#锚点
标签结束的时候处理text=string.strip(self.save_end())#去掉A标签保留A标签的信息if
self.linkandtext:self.links[text]=self.link#self.links.get(t
ext,[])+[self.link]#fp=urllib.urlopen("http://www.baidu.com"
)#打开指定的URL#data=fp.read()#fp.close()data=''>test链接到163f="http://www.focus.cn">焦点''linkdemo=GetLinks(
)#实例化一个LinkDemo对象linkdemo.feed(data)#给HTMLParser喂食linkdemo.clos
e()forhref,linkinlinkdemo.links.items():#打印相关的信息printhref,
"=>",link输出:焦点=>http://www.focus.cn链接到163=>http://www.163.
com再如:#--coding:UTF-8--importhtmllib,urllib,formatter,
stringclassGetLinks(htmllib.HTMLParser):def__init__(self):se
lf.links={}f=formatter.NullFormatter()htmllib.HTMLParser.__
init__(self,f)defanchor_bgn(self,href,name,type):self.save
_bgn()ifhref[:4]==''http'':self.link=hrefelse:self.link=
Nonedefanchor_end(self):text=string.strip(self.save_end())i
fself.linkandtext:self.links[text]=self.linkfp=urllib.url
open("http://list.taobao.com/browse/cat-0.htm")data=fp.read()fp
.close()linkdemo=GetLinks()linkdemo.feed(data)linkdemo.close()f
orhref,linkinlinkdemo.links.items():href=href.decode(''gb23
12'').encode(''utf-8'')printhref,''-'',linkpass结果是下载到的淘宝“裤架-htt
p://ju.atpanel.com/?url=http://list.taobao.com/market/baihuo.htm?
spm=1.47613.90750.”这样的列表2,下载豆瓣图片【多线程】:#--coding:UTF-8--fr
omHTMLParserimportHTMLParserimporthtmllib,urllib,formatter,st
ringimportos,sys,timeimportthreading''''''Createdon2012-10-09@au
thor:xing.gexing''''''#建立线程池,并启动线程直到结束defparallel(urls):startTi
me=time.time()threads=[]counts=range(len(urls))foriinco
unts:t=MyThread(downloadFromURL,(urls[i],),downloadFromURL.__nam
e__)threads.append(t)foriincounts:threads[i].start()fori
incounts:threads[i].join()print''usetimecost:%s''%(time.time(
)-startTime)#自定义线程类classMyThread(threading.Thread):def__init__
(self,func,args,name=''''):threading.Thread.__init__(self)self.na
me=nameself.func=funcself.args=argsdefrun(self):apply(self.f
unc,self.args)#根据url找到图片的链接并下载defdownloadFromURL(url):fp=urll
ib.urlopen(url)data=fp.read()fp.close()hp=MyHTMLParser()h
p.feed(data)hp.close()foriinhp.links:print(i)downloadImage
(i)#根据imageUrl下载图片到本地defdownloadImage(imageUrl):dir="./image
_douban"try:ifnotos.path.exists(dir):os.mkdir(dir)except:p
rint"Failedtocreatedirectoryin%s"%direxit()image=imageU
rl.split(''/'')[-1]path=dir+"/"+imagedata=urllib.urlopen(imag
eUrl).read()f=file(path,"wb")f.write(data)f.close()#定义html解析
,关键在于handle_starttagclassMyHTMLParser(HTMLParser):def__init__(
self):HTMLParser.__init__(self)self.links=[]defhandle_star
ttag(self,tag,attrs):iflen(attrs)==0:passelse:for(vari
able,value)inattrs:ifvariable=="src"andvalue[:4]=="http"
andvalue[-4:]==".jpg":self.links.append(value)if__name__==
"__main__":html="""google.com<
AHref="www.pythonclub.org">PythonClubcom.cn">Sina"""#url2="http://image.baidu.com/i?ct=20132
6592&cl=2&lm=-1&tn=baiduimage&pv=&word=car&z=5"#url="http://im
age.baidu.com"#url="http://movie.douban.com/"#下载豆瓣电影图片base=
20count=1urls=[]whilecount<=100:url="http://movie.d
ouban.com/tag/%E6%83%8A%E6%82%9A?start="+str(basecount)+"&type=T
"urls.append(url)count+=1parallel(urls)3,下载百度图片【单线程】:需要特别注
意的是对于百度图片的处理:搜索的关键词是其中的word,注意替换。百度图片搜索的第1页(包含20张图片):http://image
.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagen
ojs&s=0&word=%C6%FB%B3%B5&pn=0百度图片搜索的第2页(包含20张图片):http://image.ba
idu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs
&s=0&word=%C6%FB%B3%B5&pn=20...对于其中每一页,每张图片都有个这样的后缀:/i?ct=5033164
80&z=3&tn=baiduimagedetailnojs&word=%C6%FB%B3%B5&cl=2&lm=-1&pn=20
&rn=1&di=36978446751&ln=1987,所以每页一共有20个这样的后缀,查找i?ct进行匹配即可。将这个后缀与百度图片地址htt
p://image.baidu.com拼接即可得到该图片源的网页:http://image.baidu.com/i?ct=5033
16480&z=3&tn=baiduimagedetailnojs&word=%C6%FB%B3%B5&cl=2&lm=-1&pn
=20&rn=1&di=36978446751&ln=1987在该网页中匹配img src即可找到图片绝对路径。#--co
ding:UTF-8--importos,sys,urllibdocString=''''''Createdon2012
-10-10@author:xing.gexing''''''defbaidu(imgsum,findstr):gbstr=("找
到相关图片约".decode("utf8")).encode("gb2312")gbstr2=("找到相关图片".decode(
"utf8")).encode("gb2312")gbstr3=("张".decode("utf8").encode("gb23
12"))iffindstr=="":return0findstr=(findstr.decode("utf8")).e
ncode("gb2312")findstr=urllib.quote(findstr)url="http://image.b
aidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenoj
s&s=0&word=%s&pn="%findstrwebfile=urllib.urlopen(url+"0").read()
start=webfile.find(gbstr)ifstart==-1:start=webfile.find(gbstr
2)start=start+12else:start=start+14end=webfile.find(gbstr3,st
art)sum=webfile[start:end]sum=sum.replace(",","")sum=int(sum)
#总图片数sumpage=sum/20+1
#总页数print"youhavefound%dpicsinbaiduImage"%sumi=0
#下载的图片数forpageinrange(sumpa
ge):p_url=url+"%s"%(page20)#当前页urlwebfile=urllib.urlo
pen(p_url).read()i_start=0i_end=0whileTrue:i_start=webfi
le.find(''''''10i_end=webfile.find(''''''"'''''',i_start)i_url=webfile[i_start:i_en
d]i_url="http://image.baidu.com/"+i_urlwebstr=urllib.urlopen(i_
url).read()start=0end=0whileTrue:start=webstr.find(''''''mgsrc="'''''',end)ifstart<0:breakstart+=10end=webstr.find(''''''"
'''''',start)imgurl=webstr[start:end]ifimgurl[-4:]!=".jpg":conti
nueifimgurl.find("img-jg.gif")!=-1:continuei=i+1print"downl
oadingpic%sfrom%s"%(i,imgurl)try:data=urllib.urlopen(imgurl
).read()except:print"lost1pic"breakf=open("%s/%d.jpg"%(dir
,i),"w")f.write(data)f.close()ifi==int(imgsum):print"finishdownload%spics"%ireturn1if__name__=="__main__":printdocStringprint"configyourdownloadingarguments:"findstr=raw_input("search:")iffindstr=="":findstr="汽车"imgsum=raw_input("num:")ifimgsum=="":imgsum=10dir="./baiduPic"try:ifnotos.path.exists(dir):os.mkdir(dir)except:print"Failedtocreatedirectoryinlinux:"exit()print"configOK!"baidu(imgsum,findstr)
献花(0)
+1
(本文系雨亭之东首藏)