/**
* PHPDocIndexer.java
* 用于对 PHPDoc 的 HTML 页面生成索引文件。
*/
import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Date;
import java.text.DateFormat;
import java.lang.*;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateField;
class PHPDocIndexer
{
public static void main(String[] args) throws ClassNotFoundException, IOException
{
try {
Date start = new Date();
IndexWriter writer = new IndexWriter("/home/nio/indexes-phpdoc", new CJKAnalyzer(), true); //索引保存目录,必须存在
indexDocs(writer, new File("/home/nio/phpdoc-zh")); //HTML 文件保存目录
System.out.println("Optimizing ....");
writer.optimize();
writer.close();
Date end = new Date();
System.out.print("Total time: ");
System.out.println(end.getTime() - start.getTime());
} catch (Exception e) {
System.out.println("Class " + e.getClass() + " throws error!\n errmsg: " + e.getMessage());
} //end try
} //end main
public static void indexDocs(IndexWriter writer, File file) throws Exception
{
if (file.isDirectory()) {
String[] files = file.list();
for (int i = 0; i < files.length; i++) {
indexDocs(writer, new File(file, files[i]));
} //end for
} else if (file.getPath().endsWith(".html")) { //只对 HTML 文件做索引
System.out.print("Add file:" + file + " ....");
// Add html file ....
Document doc = new Document();
doc.add(Field.UnIndexed("file", file.getName())); //索引文件名
doc.add(Field.UnIndexed("modified", DateFormat.getDateTimeInstance().format(new Date(file.lastModified())))); //索引最后修改时间
String title = "";
String content = "";
String status = "start";
FileReader fReader = new FileReader(file);
BufferedReader bReader = new BufferedReader(fReader);
String line = bReader.readLine();
while (line != null) {
content += line;
//截取 HTML 标题 <title>
if ("start" == status && line.equalsIgnoreCase("><TITLE")) {
status = "match";
} else if ("match" == status) {
title = line.substring(1, line.length() - 7);
doc.add(Field.Text("title", title)); //索引标题
status = "end";
} //end if
line = bReader.readLine();
} //end while
bReader.close();
fReader.close();
doc.add(Field.Text("content", content.replaceAll("<[^<>]+>", ""))); //索引内容
writer.addDocument(doc);
System.out.println(" [OK]");
} //end if
}
} //end class
<%@ page language="java" import="javax.servlet.*, javax.servlet.http.*, java.io.*, java.util.Date, java.util.ArrayList, java.util.regex.*, org.apache.lucene.analysis.*, org.apache.lucene.document.*, org.apache.lucene.index.*, org.apache.lucene.search.*, org.apache.lucene.queryParser.*, org.apache.lucene.analysis.Token, org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.cjk.CJKAnalyzer, org.apache.lucene.analysis.cjk.CJKTokenizer, com.chedong.weblucene.search.WebLuceneHighlighter" %>
<%@ page contentType="text/html;charset=GB2312" %>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>