# 0. Summary of the download-related parts of Python.
#
# NOTE(review): this text was recovered from a whitespace-stripped paste of
# Python 2 listings.  Spacing and the inline HTML sample were reconstructed,
# and the code was ported to Python 3 (htmllib/formatter no longer exist) --
# confirm against the original article if exact fidelity matters.
import urllib.request
from html.parser import HTMLParser


def fetch_html(url, encoding="gb2312"):
    """Download *url* and return its HTML source decoded to text.

    Args:
        url: Address of the page to read.
        encoding: Charset of the page.  Check the page source to see which
            encoding it really uses; the article's example used gb2312.

    Returns:
        The page body as ``str``; undecodable bytes are replaced.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    return raw.decode(encoding, errors="replace")


# 1. Processing <a> tag strings.
class GetLinks(HTMLParser):
    """Collect the anchors of an HTML document as a ``{text: href}`` dict.

    Args:
        http_only: When true, only anchors whose ``href`` starts with
            ``"http"`` are recorded (the behaviour of the article's second
            listing).  The default records every non-empty ``href`` (the
            behaviour of the first listing).

    Attributes are plain instance fields: ``links`` maps the stripped anchor
    text to its address, ``link`` holds the address of the anchor currently
    being parsed (or ``None`` when it was filtered out).
    """

    def __init__(self, http_only=False):
        super().__init__()
        self.links = {}
        self.http_only = http_only
        self.link = None
        # List of text chunks while inside an <a> element, otherwise None.
        self._chunks = None

    def handle_starttag(self, tag, attrs):
        """Dispatch every opening ``<a>`` to :meth:`anchor_bgn`."""
        if tag != "a":
            return
        attributes = dict(attrs)
        # Valueless attributes are reported as None; normalise to "".
        self.anchor_bgn(
            attributes.get("href") or "",
            attributes.get("name") or "",
            attributes.get("type") or "",
        )

    def handle_data(self, data):
        """Buffer text that appears between ``<a>`` and ``</a>``."""
        if self._chunks is not None:
            self._chunks.append(data)

    def handle_endtag(self, tag):
        """Dispatch a closing ``</a>`` to :meth:`anchor_end`."""
        if tag == "a" and self._chunks is not None:
            self.anchor_end()

    def anchor_bgn(self, href, name, type):
        """Start of an anchor: begin buffering its text and remember *href*.

        The signature mirrors the callback of the Python 2 ``htmllib`` parser
        the original listing was written against; *name* and *type* are
        accepted but unused.
        """
        self._chunks = []
        if self.http_only and not href.startswith("http"):
            self.link = None
        else:
            self.link = href

    def anchor_end(self):
        """End of an anchor: store ``text -> href`` if both are non-empty."""
        # Collapse runs of whitespace and strip the ends of the anchor text.
        text = " ".join("".join(self._chunks or []).split())
        self._chunks = None
        if self.link and text:
            self.links[text] = self.link


if __name__ == "__main__":
    # Section 0: read the HTML source of a URL and print it as readable text.
    print(fetch_html("http://www.mntuku.cn"))

    # Section 1, first example: parse a literal document.
    # (Alternatively fetch a live page, e.g. fetch_html("http://www.baidu.com").)
    # NOTE(review): the sample markup was damaged in the paste; it is rebuilt
    # here from the output the article reports.
    data = (
        '<html><head><title>test</title></head><body>'
        '<a href="http://www.163.com">链接到163</a>'
        '<a href="http://www.focus.cn">焦点</a>'
        '</body></html>'
    )
    linkdemo = GetLinks()
    linkdemo.feed(data)  # feed the parser
    linkdemo.close()
    for text, link in linkdemo.links.items():
        print(text, "=>", link)
    # Output reported by the article:
    #   焦点 => http://www.focus.cn
    #   链接到163 => http://www.163.com

    # Section 1, second example: keep only absolute http links of a live page.
    linkdemo = GetLinks(http_only=True)
    linkdemo.feed(fetch_html("http://list.taobao.com/browse/cat-0.htm", "gb2312"))
    linkdemo.close()
    for text, link in linkdemo.links.items():
        print(text, '-', link)

# Result of the second example: a downloaded Taobao list with entries such as
# "裤架 - http://ju.atpanel.com/?url=http://list.taobao.com/market/baihuo.htm?
# spm=1.47613.90750."  (tail of the sample Taobao result URL from the
# previous section)
#
# 2. Downloading Douban images (multi-threaded).
#
# Created on 2012-10-09
# @author: xing.gexing
#
# NOTE(review): recovered from a whitespace-stripped Python 2 paste and ported
# to Python 3; the "*" in ``base * count`` was missing from the paste and has
# been restored.
import os
import threading
import time
import urllib.request
from html.parser import HTMLParser


def parallel(urls):
    """Process every URL in *urls* concurrently, one thread per URL.

    Starts one :class:`MyThread` running :func:`downloadFromURL` for each
    entry, waits for all of them, then prints the elapsed wall-clock time.
    """
    startTime = time.time()
    threads = [
        MyThread(downloadFromURL, (url,), downloadFromURL.__name__)
        for url in urls
    ]
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()
    print('use time cost:%s' % (time.time() - startTime))


class MyThread(threading.Thread):
    """Thread that calls ``func(*args)`` when run."""

    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args

    def run(self):
        self.func(*self.args)


def downloadFromURL(url):
    """Fetch the page at *url* and download every image link found in it."""
    with urllib.request.urlopen(url) as fp:
        data = fp.read()
    hp = MyHTMLParser()
    # NOTE(review): the page charset is not visible here; UTF-8 is assumed.
    hp.feed(data.decode("utf-8", errors="replace"))
    hp.close()
    for link in hp.links:
        print(link)
        try:
            downloadImage(link)
        except OSError as err:
            # One broken image must not abort the rest of the page.
            print("Failed to download %s: %s" % (link, err))


def downloadImage(imageUrl):
    """Download *imageUrl* into ``./image_douban`` under its own file name.

    Raises:
        SystemExit: If the target directory cannot be created.
        OSError: If the download or the file write fails.
    """
    target_dir = "./image_douban"
    try:
        # exist_ok avoids the exists()/mkdir() race between worker threads.
        os.makedirs(target_dir, exist_ok=True)
    except OSError:
        print("Failed to create directory in %s" % target_dir)
        raise SystemExit(1)
    image = imageUrl.split('/')[-1]
    path = target_dir + "/" + image
    with urllib.request.urlopen(imageUrl) as response:
        data = response.read()
    with open(path, "wb") as f:
        f.write(data)


class MyHTMLParser(HTMLParser):
    """HTML parser that collects absolute ``.jpg`` URLs from ``src`` attributes.

    The work happens in :meth:`handle_starttag`; results accumulate in the
    ``links`` list in document order.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        for variable, value in attrs:
            # value is None for attributes written without a value.
            if (
                variable == "src"
                and value
                and value[:4] == "http"
                and value[-4:] == ".jpg"
            ):
                self.links.append(value)


if __name__ == "__main__":
    # Other start pages mentioned in the original listing:
    #   http://image.baidu.com/i?ct=201326592&cl=2&lm=-1&tn=baiduimage&pv=&word=car&z=5
    #   http://image.baidu.com
    #   http://movie.douban.com/
    # Download Douban movie images.
    base = 20
    urls = [
        "http://movie.douban.com/tag/%E6%83%8A%E6%82%9A?start="
        + str(base * count)
        + "&type=T"
        for count in range(1, 101)
    ]
    parallel(urls)

# 3. Downloading Baidu images (single-threaded).
# Pay special attention to how Baidu image search is handled: the search
# keyword is the "word" query parameter, so remember to replace it.
# Page 1 of a Baidu image search (20 images per page): http://image
# (continuation of the page-1 search URL begun on the previous line:)
# .baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%C6%FB%B3%B5&pn=0
# Page 2 of a Baidu image search (20 images per page):
#   http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20&tn=baiduimagenojs&s=0&word=%C6%FB%B3%B5&pn=20
# ...
# On each result page every image has a suffix like
#   /i?ct=503316480&z=3&tn=baiduimagedetailnojs&word=%C6%FB%B3%B5&cl=2&lm=-1&pn=20&rn=1&di=36978446751&ln=1987
# so there are 20 of them per page; searching for "i?ct" is enough to match
# them.  Joining that suffix to the Baidu image host http://image.baidu.com
# gives the detail page of the image:
#   http://image.baidu.com/i?ct=503316480&z=3&tn=baiduimagedetailnojs&word=%C6%FB%B3%B5&cl=2&lm=-1&pn=20&rn=1&di=36978446751&ln=1987
# Matching "img src" inside that detail page yields the absolute image path.
#
# NOTE(review): recovered from a whitespace-stripped Python 2 paste and ported
# to Python 3.  The "*" in ``page * 20`` and the ``<a href="/i?ct`` /
# ``<img src="`` search markers were damaged in the paste and are rebuilt from
# the prose above and the +10 offsets -- confirm against the original.
import os
import urllib.parse
import urllib.request

docString = '''
Created on 2012-10-10
@author: xing.gexing
'''

_BAIDU_HOST = "http://image.baidu.com/"
_PAGE_SIZE = 20  # images per result page ("rn=20" in the query string)


def _fetch(url):
    """Return the raw bytes served at *url*."""
    with urllib.request.urlopen(url) as response:
        return response.read()


def _parse_total(page):
    """Return the hit count announced on a result page, or None if absent."""
    suffix = "张".encode("gb2312")
    # The longer marker ("about N pictures found") is tried first because the
    # shorter one is a prefix of it.
    for marker in ("找到相关图片约", "找到相关图片"):
        encoded = marker.encode("gb2312")
        start = page.find(encoded)
        if start == -1:
            continue
        start += len(encoded)
        end = page.find(suffix, start)
        if end == -1:
            continue
        digits = page[start:end].replace(b",", b"").strip()
        try:
            return int(digits.decode("ascii"))
        except ValueError:
            return None
    return None


def _iter_quoted(haystack, marker, skip):
    """Yield the bytes from ``marker`` start + *skip* up to the next ``"``."""
    position = 0
    while True:
        start = haystack.find(marker, position)
        if start < 0:
            return
        start += skip
        end = haystack.find(b'"', start)
        if end < 0:
            return
        yield haystack[start:end]
        position = end


def baidu(imgsum, findstr, save_dir="./baiduPic"):
    """Download up to *imgsum* JPEGs from Baidu image search for *findstr*.

    Args:
        imgsum: Number of pictures wanted (int or numeric string).
        findstr: Search keyword; sent to Baidu gb2312-encoded and URL-quoted.
        save_dir: Directory receiving ``1.jpg``, ``2.jpg``, ...; created when
            missing.

    Returns:
        1 once *imgsum* pictures have been saved; 0 when there is nothing to
        do (empty keyword, non-positive *imgsum*) or the hit count cannot be
        read from the first page; None when the results run out first.

    Raises:
        ValueError: If *imgsum* is not numeric.
        OSError: If a result page cannot be fetched or a file cannot be
            written (a failed *image* download is only reported).
    """
    if findstr == "":
        return 0
    wanted = int(imgsum)
    if wanted <= 0:
        return 0

    keyword = urllib.parse.quote(findstr.encode("gb2312"))
    url = (
        "http://image.baidu.com/i?z=3&fr=&cl=2&ct=201326592&lm=-1&rn=20"
        "&tn=baiduimagenojs&s=0&word=%s&pn=" % keyword
    )

    total = _parse_total(_fetch(url + "0"))  # total number of pictures
    if total is None:
        print("could not read the number of pics from baiduImage")
        return 0
    sumpage = total // _PAGE_SIZE + 1  # total number of pages
    print("you have found %d pics in baiduImage" % total)

    os.makedirs(save_dir, exist_ok=True)
    saved = 0  # number of pictures written so far
    for page in range(sumpage):
        listing = _fetch(url + "%s" % (page * _PAGE_SIZE))  # current page
        # skip=10 steps over '<a href="/' so the slice starts at "i?ct...".
        for detail_path in _iter_quoted(listing, b'<a href="/i?ct', 10):
            detail = _fetch(_BAIDU_HOST + detail_path.decode("ascii", "replace"))
            for raw_url in _iter_quoted(detail, b'<img src="', 10):
                imgurl = raw_url.decode("ascii", "replace")
                if imgurl[-4:] != ".jpg":
                    continue
                if imgurl.find("img-jg.gif") != -1:
                    continue
                print("downloading pic %s from %s" % (saved + 1, imgurl))
                try:
                    data = _fetch(imgurl)
                except OSError:
                    print("lost 1 pic")
                    break
                # Count only pictures actually saved, and write them as
                # binary so the JPEG bytes are not altered.
                saved += 1
                with open("%s/%d.jpg" % (save_dir, saved), "wb") as f:
                    f.write(data)
                if saved >= wanted:
                    print("finish download %s pics" % saved)
                    return 1
    return None


if __name__ == "__main__":
    print(docString)
    print("config your downloading arguments:")
    findstr = input("search:")
    if findstr == "":
        findstr = "汽车"
    imgsum = input("num:")
    if imgsum == "":
        imgsum = 10
    save_dir = "./baiduPic"
    try:
        os.makedirs(save_dir, exist_ok=True)
    except OSError:
        print("Failed to create directory in linux:")
        raise SystemExit(1)
    print("config OK!")
    baidu(imgsum, findstr, save_dir)