定时抓取文章分类:Web前端流程
// Scheduled crawl of an article category ("Web front-end" flow):
// 1: provide the listing-page URL(s)  2: extract all target links from the listing
// 3: fetch every linked page (crawler)  4: parse the article body  5: store in DB
//
// Part 1: the crawl job (main program).
package com.test;

import java.util.List;

public class CatchJob {

    /**
     * Fetches the listing page at {@code url}, extracts the matching article
     * links and fetches/parses each linked page.
     *
     * @param url listing-page URL to crawl
     * @return always {@code "success"} (errors are only logged); kept as-is
     *         for backward compatibility with existing callers
     */
    public String catchJob(String url) {
        try {
            // Fetch the listing page.
            String document = ExtractPage.getContentByUrl(url);
            // Collect the article links we are interested in.
            // TODO(review): the host part of this URL looks truncated
            // ("http://www./...") -- confirm the intended domain.
            List<String> allLinks =
                    ExtractPage.getLinksByConditions(document, "http://www./others/gift/");
            if (allLinks != null) {
                for (String link : allLinks) {
                    String content = ExtractPage.getContentByUrl(link);
                    ExtractPage.readByHtml(content);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "success";
    }

    public static void main(String[] args) {
        long startTime = System.currentTimeMillis();
        System.out.println(">>start.......");
        // Default HTTP proxy applied to every outgoing connection.
        String httpProxyHost = "211.167.0.131"; // default http proxy
        String httpProxyPort = "80";            // default http port
        System.getProperties().setProperty("http.proxyHost", httpProxyHost);
        System.getProperties().setProperty("http.proxyPort", httpProxyPort);
        CatchJob job = new CatchJob();
        System.out.println(job.catchJob("http://www./others/gift/"));
        // BUGFIX: the original formatted the elapsed milliseconds as a
        // wall-clock Date with SimpleDateFormat("HH:mm:ss"), which is wrong
        // for durations (the value is shifted by the local time-zone offset).
        // Report elapsed seconds directly instead.
        long elapsedSeconds = (System.currentTimeMillis() - startTime) / 1000;
        System.out.println(">>end.......USE" + elapsedSeconds + "秒");
    }
}

// Part 2: fetch pages and parse them with HtmlParser.
package com.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class ExtractPage {

    /**
     * Downloads the page at {@code url} and returns its text, or {@code null}
     * when the download fails (the error is logged).
     *
     * @param url page to download
     * @return page content with CRLF line endings, or {@code null} on error
     */
    public static String getContentByUrl(String url) {
        System.out.println("**********抓取页面内容***********");
        try {
            URL targetUrl = new URL(url);
            HttpURLConnection con = (HttpURLConnection) targetUrl.openConnection();
            con.setFollowRedirects(true);
            con.setInstanceFollowRedirects(false);
            con.connect();
            // BUGFIX: try-with-resources closes the stream even when
            // readLine() throws; the original leaked the reader on error.
            // NOTE(review): the page is decoded as gb2312 here but parsed as
            // "utf8" below -- confirm the site's actual encoding.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), "gb2312"))) {
                StringBuilder document = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    document.append(line).append("\r\n");
                }
                return document.toString();
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException, so one catch suffices.
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Parses a standard HTML page: prints the page title (text before the
     * first {@code "|"}) followed by the article body taken from the
     * {@code <div id="Zoom">} element.
     *
     * @param result raw HTML to parse
     * @throws Exception if HtmlParser fails to parse the document
     */
    public static void readByHtml(String result) throws Exception {
        System.out.println("**********按页面方式处理.解析标准的html页面***********");
        Parser parser = Parser.createParser(result, "utf8");
        NodeFilter textFilter = new NodeClassFilter(Div.class);
        NodeFilter titleFilter = new NodeClassFilter(TitleTag.class);
        OrFilter lastFilter = new OrFilter();
        lastFilter.setPredicates(new NodeFilter[] {textFilter, titleFilter});
        NodeList nodelist = parser.parse(lastFilter);
        Node[] nodes = nodelist.toNodeArray();
        StringBuilder page = new StringBuilder();
        for (Node node : nodes) {
            if (node instanceof Div) {
                Div textnode = (Div) node;
                // equals() on the constant is null-safe: getAttribute("id")
                // may return null for divs without an id.
                if ("Zoom".equals(textnode.getAttribute("id"))) {
                    // NOTE(review): children 5 and 6 are assumed to hold the
                    // article body -- fragile, depends on the site's markup.
                    page.append(textnode.getChild(5).toHtml());
                    page.append(textnode.getChild(6).toHtml());
                }
            } else if (node instanceof TitleTag) {
                TitleTag titlenode = (TitleTag) node;
                String title = titlenode.getTitle();
                // BUGFIX: guard against a missing "|" separator; the original
                // substring(0, indexOf("|")) threw when "|" was absent.
                int sep = title.indexOf('|');
                page.append(sep >= 0 ? title.substring(0, sep) : title);
            }
        }
        System.out.println(page.toString());
    }

    /**
     * Extracts the links in {@code result} whose HTML contains
     * {@code coditions} and {@code ".htm"} but not {@code "index"}.
     *
     * @param result    raw HTML of the listing page
     * @param coditions substring a link's HTML must contain to be kept
     * @return matching link targets; never {@code null} (the original could
     *         return null -- now an empty/partial list is returned on error)
     */
    public static List<String> getLinksByConditions(String result, String coditions) {
        System.out.println("**********//获取页面指定内容的Link***********");
        List<String> links = new ArrayList<String>();
        Parser parser = Parser.createParser(result, "utf8");
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        try {
            NodeList nodelist = parser.parse(linkFilter);
            for (Node node : nodelist.toNodeArray()) {
                if (node instanceof LinkTag) {
                    LinkTag link = (LinkTag) node;
                    String html = link.toHtml();
                    if (html.indexOf(coditions) != -1
                            && html.indexOf("index") == -1
                            && html.indexOf(".htm") != -1) {
                        System.out.println(html);
                        links.add(link.getLink());
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return links;
    }
}