use LWP::UserAgent;
use HTML::TreeBuilder;
use LWP::Simple;
use URI;
use Encode;
# Shared work lists (package globals, since the script runs without strict):
#   @download_url - "url<TAB>filename" entries, filled while crawling
#   @list_url     - one search-result page URL per result page
our @download_url = ();

# Searching Sina Vdisk for "perl" yields 16 pages of results, so build one
# search URL per page.  "perl" is the keyword used for this test run.
our @list_url = map {
    my $page_number = $_;
    my $search_url  = URI->new('http://vdisk.weibo.com/search/');

    # All form pairs, in the order they appear in the query string.
    $search_url->query_form(
        'keyword' => 'perl',
        'sortby'  => 'default',
        'page'    => $page_number,
    );

    $search_url;
} 1 .. 16;
# Crawl every search-result page in @list_url and append one
# "url<TAB>filename" entry to @download_url for each file link found.
#
# Each result is expected inside an element with class "sort_name_intro"
# whose first <a> carries the detail-page link (href) and the file name
# (title).  Names are converted from UTF-8 to cp936 before being stored.
my $ua = LWP::UserAgent->new;
foreach my $page_url (@list_url) {
    my $response = $ua->get($page_url);
    unless ($response->is_success) {
        # Do not try to scrape an error page.
        warn "error : cannot fetch $page_url : ", $response->status_line, "\n";
        next;
    }

    my $tree = HTML::TreeBuilder->new;    # empty tree
    $tree->parse($response->content) or print "error : parse html ";
    $tree->eof;    # tell the parser the document is complete

    my @name_cells = $tree->find_by_attribute("class", "sort_name_intro")
        or print "error : cannot find pdf_name ";

    foreach my $cell (@name_cells) {
        # A cell without a link would otherwise die on ->attr below.
        my $link = $cell->look_down(_tag => 'a') or next;

        my $href  = $link->attr('href');
        my $title = $link->attr('title');
        next unless defined $href  && length $href;
        next unless defined $title && length $title;

        # The entry format is tab-delimited, so keep tabs and line breaks
        # out of the file name.
        $title =~ tr/\t\r\n/ /;

        # Resolve relative links against the page they were found on.
        my $detail_url = URI->new_abs($href, $response->base);
        my $file_name  = encode("cp936", decode("utf-8", $title));

        # Add the detail-page URL and the file name to the download list.
        push @download_url, "$detail_url\t$file_name";
    }

    # Release the tree explicitly; older HTML::Element trees hold circular
    # references and are not freed when they go out of scope.
    $tree->delete;
}
# Download every file listed in @download_url.
#
# Each entry is "detail_page_url<TAB>file_name", with file_name already
# encoded as cp936 bytes.  For each entry the detail page is fetched, the
# real file URL is extracted from the fileDown.init(...) call embedded in
# the page, and the file is saved under file_name in the current directory.
foreach my $entry (@download_url) {
    # Split on the tab only (at most two fields) so that file names
    # containing spaces are kept whole.
    my ($page_url, $raw_name) = split /\t/, $entry, 2;
    next unless defined $page_url && length $page_url;
    next unless defined $raw_name && length $raw_name;

    my $page_html = get($page_url);
    unless (defined $page_html) {
        warn "error : cannot fetch $page_url\n";
        next;
    }

    # This is the key step: the real download URL only appears inside the
    # JSON argument of fileDown.init.  Skip the entry when it is missing;
    # otherwise $1 would still hold the URL captured for the previous entry.
    my $file_url;
    if ($page_html =~ /fileDown\.init.*?\"url\":\"(.*?)\",/) {
        $file_url = $1;
    }
    else {
        warn "error : cannot find download url in $page_url\n";
        next;
    }
    $file_url =~ s/\\//g;    # drop the JSON escaping backslashes ("\/" -> "/")
    print "$file_url\n";

    # The name comes from the remote page, so never let it address another
    # directory.  Work on decoded characters: in cp936 a backslash byte can
    # be the second byte of a double-byte character.
    my $file_name = decode("cp936", $raw_name);
    $file_name =~ s{[/\\]}{_}g;
    $file_name = encode("cp936", $file_name);

    my $status = getstore($file_url, $file_name);
    warn "error : download failed (HTTP $status) : $file_url\n"
        unless is_success($status);
}