TermVector是Lucene 1.4新增的,TermVector保存Token.getPositionIncrement() 和Token.startOffset() 以及Token.endOffset() 信息.
各种参数说明:
Field.TermVector.NO:不保存term vectors
Field.TermVector.YES:保存term vectors
Field.TermVector.WITH_POSITIONS:保存term vectors.(保存值和token位置信息)
Field.TermVector.WITH_OFFSETS:保存term vectors.(保存值和Token的offset)
Field.TermVector.WITH_POSITIONS_OFFSETS:保存term vectors.(保存值和token位置信息和Token的offset)
下面是用Lucene实现相关文章搜索的示例代码:
public class MoreLike {
Analyzer analyzer = new IKAnalyzer(); //分词器选择 Directory ramDir = new RAMDirectory(); public void createRamIndex() throws CorruptIndexException, LockObtainFailedException, IOException { IndexWriter writer = new IndexWriter(ramDir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
Document doc1 = new Document(); doc1.add(new Field("title", "wenhq", Field.Store.YES, Field.Index.ANALYZED)); doc1.add(new Field("author", "callan", Field.Store.YES, Field.Index.ANALYZED)); doc1.add(new Field("subject", "wenhq.com是亲亲宝宝网站的域名,记录软件开发的经验", Field.Store.YES,
Field.Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)); Document doc2 = new Document(); doc2.add(new Field("title", "english", Field.Store.YES, Field.Index.ANALYZED)); doc2.add(new Field("author", "wcq", Field.Store.YES, Field.Index.ANALYZED)); doc2.add(new Field("subject", "学习english的人很多,亲亲宝宝网站的人也在学习", Field.Store.YES, Field.Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
Document doc3 = new Document(); doc3.add(new Field("title", "asp", Field.Store.YES, Field.Index.ANALYZED)); doc3.add(new Field("author", "ca", Field.Store.YES, Field.Index.ANALYZED)); doc3.add(new Field("subject", "asp是一种网站开发语言", Field.Store.YES, Field.Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
writer.addDocument(doc1);
writer.addDocument(doc2);
writer.addDocument(doc3);
writer.optimize();
writer.close();
}
public void search() throws CorruptIndexException, IOException { IndexReader reader = IndexReader.open(ramDir); IndexSearcher searcher = new IndexSearcher(reader); Term term = new Term("title", "wenhq"); // 在title里查询wenhq词条 TermQuery query = new TermQuery(term); TopScoreDocCollector collector = TopScoreDocCollector.create(10000, false);
searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; for (int i = 0; i < hits.length; i++) { Document doc = searcher.doc(hits[i].doc); System.out.println("search: "); System.out.println(doc.get("title") + "###" + doc.get("subject")); morelikeSearch(reader, hits[i].doc); }
}
private void morelikeSearch(IndexReader reader, int id) throws IOException { System.out.println("moreLike search: "); // 根据这个document的id获取这个field的Term Vector // 信息,就是这个field分词之后在这个field里的频率、位置、等信息 TermFreqVector vector = reader.getTermFreqVector(id, "subject"); BooleanQuery query = new BooleanQuery(); for (int i = 0; i < vector.size(); i++) { TermQuery tq = new TermQuery(new Term("subject", vector.getTerms()[i])); // 获取每个term保存的Token query.add(tq, BooleanClause.Occur.SHOULD); }
IndexSearcher searcher = new IndexSearcher(ramDir); TopScoreDocCollector collector = TopScoreDocCollector.create(10000, false);
searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; for (int i = 0; i < hits.length; i++) { Document doc = searcher.doc(hits[i].doc); System.out.println(doc.get("title") + "###" + doc.get("subject")); }
}
public static void main(String[] args) throws CorruptIndexException, IOException { MoreLike t = new MoreLike(); t.createRamIndex();
t.search();
}
}
具体的输出结果:
search:需要查询的文章内容
wenhq###wenhq.com是亲亲宝宝网站的域名,记录软件开发的经验
moreLike search: 相关的文章
wenhq###wenhq.com是亲亲宝宝网站的域名,记录软件开发的经验
english###学习english的人很多,亲亲宝宝网站的人也在学习
asp###asp是一种网站开发语言
No related posts.