测试lucene的所有分词接口(原创)- -Tag: Lucene 中文 分词
Lucene本身提供了几个分词接口,我后来又自己写了一个分词接口。
功能递增如下:
WhitespaceAnalyzer:仅仅是去除空格,对字符没有lowcase化,不支持中文
SimpleAnalyzer:功能强于WhitespaceAnalyzer,将除去letter之外的符号全部过滤掉,并且将所有的字符lowcase化,不支持中文
StopAnalyzer:StopAnalyzer的功能超越了SimpleAnalyzer,在SimpleAnalyzer的基础上 增加了去除StopWords的功能,不支持中文
StandardAnalyzer:英文的处理能力同于StopAnalyzer.支持中文采用的方法为单字切分.
ChineseAnalyzer:来自于Lucene的sandbox。性能类似于StandardAnalyzer,缺点是不支持中英文混合分词。
CJKAnalyzer:chedong写的CJKAnalyzer在英文处理上的功能和StandardAnalyzer相同,但是在汉语分词上不能过滤掉标点符号,采用的是二元(bigram)切分。
TjuChineseAnalyzer:我写的,功能最为强大。TjuChineseAnalyzer在中文分词方面调用的是ICTCLAS的Java接口,所以其中文分词性能等同于ICTCLAS;在英文分词上采用了Lucene的StopAnalyzer,可以去除stopWords,而且不区分大小写,并过滤掉各类标点符号。
程序调试于:JBuilder 2005
package org.apache.lucene.analysis;
//Author:zhangbufeng //TjuAILab(天津大学人工智能实验室) //2005.9.22.11:00
import java.io.*; import junit.framework.*;
import org.apache.lucene.*; import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.standard.*; import org.apache.lucene.analysis.cn.*; import org.apache.lucene.analysis.cjk.*; import org.apache.lucene.analysis.tjucn.*; import com.xjt.nlp.word.*; public class TestAnalyzers extends TestCase {
// JUnit 3 constructor: forwards the test name to TestCase.
public TestAnalyzers(String name) { super(name); }
public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { //前面的"dummy"好像没有用到 TokenStream ts = a.tokenStream("dummy", new StringReader(input)); StringReader readerInput=new StringReader(input); for (int i=0; i Token t = ts.next(); //System.out.println(t); assertNotNull(t); //使用下面这条语句即可以输出Token的每项的text,并且用空格分开 System.out.print(t.termText); System.out.print(" "); assertEquals(t.termText(), output[i]); } System.out.println(" "); assertNull(ts.next()); ts.close(); } public void outputAnalyzer(Analyzer a ,String input) throws Exception{ TokenStream ts = a.tokenStream("dummy",new StringReader(input)); StringReader readerInput = new StringReader(input); while(true){ Token t = ts.next(); if(t!=null){ System.out.print(t.termText); System.out.print(" "); } else break;
} System.out.println(" "); ts.close(); }
/**
 * SimpleAnalyzer: discards every non-letter character and lower-cases the
 * rest. It has no Chinese support of its own, so the mixed-language sample
 * is pre-segmented with ICTCLAS before being analyzed.
 */
public void testSimpleAnalyzer() throws Exception {
  Analyzer analyzer = new SimpleAnalyzer();
  assertAnalyzesTo(analyzer, "foo bar FOO BAR",
                   new String[] { "foo", "bar", "foo", "bar" });
  assertAnalyzesTo(analyzer, "foo bar . FOO <> BAR",
                   new String[] { "foo", "bar", "foo", "bar" });
  assertAnalyzesTo(analyzer, "foo.bar.FOO.BAR",
                   new String[] { "foo", "bar", "foo", "bar" });
  assertAnalyzesTo(analyzer, "U.S.A.", new String[] { "u", "s", "a" });
  assertAnalyzesTo(analyzer, "C++", new String[] { "c" });
  assertAnalyzesTo(analyzer, "B2B", new String[] { "b", "b" });
  assertAnalyzesTo(analyzer, "2B", new String[] { "b" });
  assertAnalyzesTo(analyzer, "\"QUOTED\" word",
                   new String[] { "quoted", "word" });
  assertAnalyzesTo(analyzer, "zhang ./ bu <> feng",
                   new String[] { "zhang", "bu", "feng" });
  // Segment the Chinese/English mix with ICTCLAS first, then verify the
  // analyzer lower-cases the English and keeps the Chinese words intact.
  ICTCLAS segmenter = new ICTCLAS();
  String segmented = segmenter.paragraphProcess("我爱共产党 i LOVE chanchan");
  assertAnalyzesTo(analyzer, segmented,
                   new String[] { "我", "爱", "共产党", "i", "love", "chanchan" });
}
/**
 * WhitespaceAnalyzer: splits on whitespace only — no lower-casing and no
 * punctuation stripping.
 */
public void testWhiteSpaceAnalyzer() throws Exception {
  Analyzer analyzer = new WhitespaceAnalyzer();
  assertAnalyzesTo(analyzer, "foo bar FOO BAR",
                   new String[] { "foo", "bar", "FOO", "BAR" });
  assertAnalyzesTo(analyzer, "foo bar . FOO <> BAR",
                   new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
  assertAnalyzesTo(analyzer, "foo.bar.FOO.BAR",
                   new String[] { "foo.bar.FOO.BAR" });
  assertAnalyzesTo(analyzer, "U.S.A.", new String[] { "U.S.A." });
  assertAnalyzesTo(analyzer, "C++", new String[] { "C++" });
  assertAnalyzesTo(analyzer, "B2B", new String[] { "B2B" });
  assertAnalyzesTo(analyzer, "2B", new String[] { "2B" });
  assertAnalyzesTo(analyzer, "\"QUOTED\" word",
                   new String[] { "\"QUOTED\"", "word" });
  assertAnalyzesTo(analyzer, "zhang bu feng",
                   new String[] { "zhang", "bu", "feng" });
  // ICTCLAS emits whitespace-separated words, so WhitespaceAnalyzer can
  // tokenize the pre-segmented mixed-language text directly.
  ICTCLAS segmenter = new ICTCLAS();
  String segmented = segmenter.paragraphProcess("我爱共产党 i love chanchan");
  assertAnalyzesTo(analyzer, segmented,
                   new String[] { "我", "爱", "共产党", "i", "love", "chanchan" });
}
public void testStopAnalyzer() throws Exception { //StopAnalyzer的功能超越了SimpleAnalyzer,在SimpleAnalyzer的基础上 //增加了去除StopWords的功能 Analyzer a = new StopAnalyzer(); assertAnalyzesTo(a, "foo bar FOO BAR", new String[] { "foo", "bar", "foo", "bar" }); assertAnalyzesTo(a, "foo a bar such FOO THESE BAR", new String[] { "foo", "bar", "foo", "bar" }); assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ", new String[]{"foo","bar","foo","bar"}); ICTCLAS splitWord = new ICTCLAS(); String result = splitWord.paragraphProcess("我爱共产党 i Love chanchan such"); assertAnalyzesTo(a,result, new String[]{"我","爱","共产党","i","love","chanchan"});
} public void testStandardAnalyzer() throws Exception{ //StandardAnalyzer的功能最为强大,对于中文采用的为单字切分 Analyzer a = new StandardAnalyzer(); assertAnalyzesTo(a,"foo bar Foo Bar", new String[]{"foo","bar","foo","bar"}); assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR", new String[]{"foo","bar","foo","bar"}); assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ", new String[]{"foo","bar","foo","bar"}); assertAnalyzesTo(a,"张步峰是天大学生", new String[]{"张","步","峰","是","天","大","学","生"}); //验证去除英文的标点符号 assertAnalyzesTo(a,"张,/步/,峰,.是.,天大<>学生", new String[]{"张","步","峰","是","天","大","学","生"}); //验证去除中文的标点符号 assertAnalyzesTo(a,"张。、步。、峰是。天大。学生", new String[]{"张","步","峰","是","天","大","学","生"}); } public void testChineseAnalyzer() throws Exception{ //可见ChineseAnalyzer在功能上和standardAnalyzer的功能差不多,但是可能在速度上慢于StandardAnalyzer Analyzer a = new ChineseAnalyzer();
//去空格 assertAnalyzesTo(a,"foo bar Foo Bar", new String[]{"foo","bar","foo","bar"}); assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR", new String[]{"foo","bar","foo","bar"}); assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ", new String[]{"foo","bar","foo","bar"}); assertAnalyzesTo(a,"张步峰是天大学生", new String[]{"张","步","峰","是","天","大","学","生"}); //验证去除英文的标点符号 assertAnalyzesTo(a,"张,/步/,峰,.是.,天大<>学生", new String[]{"张","步","峰","是","天","大","学","生"}); //验证去除中文的标点符号 assertAnalyzesTo(a,"张。、步。、峰是。天大。学生", new String[]{"张","步","峰","是","天","大","学","生"}); //不支持中英文写在一起 // assertAnalyzesTo(a,"我爱你 i love chanchan", /// new String[]{"我","爱","你","i","love","chanchan"});
} public void testCJKAnalyzer() throws Exception { //chedong写的CJKAnalyzer的功能在英文处理上的功能和StandardAnalyzer相同 //但是在汉语的分词上,不能过滤掉标点符号,即使用二元切分 Analyzer a = new CJKAnalyzer(); assertAnalyzesTo(a,"foo bar Foo Bar", new String[]{"foo","bar","foo","bar"}); assertAnalyzesTo(a,"foo bar ./ Foo ./ BAR", new String[]{"foo","bar","foo","bar"}); assertAnalyzesTo(a,"foo ./ a bar such ,./<> FOO THESE BAR ", new String[]{"foo","bar","foo","bar"});
// assertAnalyzesTo(a,"张,/步/,峰,.是.,天大<>学生", // new String[]{"张步","步峰","峰是","是天","天大","大学","学生"}); //assertAnalyzesTo(a,"张。、步。、峰是。天大。学生", // new String[]{"张步","步峰","峰是","是天","天大","大学","学生"}); //支持中英文同时写 assertAnalyzesTo(a,"张步峰是天大学生 i love", new String[]{"张步","步峰","峰是","是天","天大","大学","学生","i","love"});
} public void testTjuChineseAnalyzer() throws Exception{ /** * TjuChineseAnlyzer的功能相当强大,在中文分词方面由于其调用的为ICTCLAS的java接口. * 所以其在中文方面性能上同与ICTCLAS.其在英文分词上采用了Lucene的StopAnalyzer,可以去除 * stopWords,而且可以不区分大小写,过滤掉各类标点符号. */ Analyzer a = new TjuChineseAnalyzer(); String input = "体育讯 在被尤文淘汰之后,皇马主帅博斯克拒绝接受媒体对球队后防线的批评,同时还为自己排出的首发阵容进行了辩护。"+ "“失利是全队的责任,而不仅仅是后防线该受指责,”博斯克说,“我并不认为我们踢得一塌糊涂。”“我们进入了半决赛,而且在晋级的道路上一路奋 "+ "战。即使是今天的比赛我们也有几个翻身的机会,但我们面对的对手非常强大,他们踢得非常好。”“我们的球迷应该为过去几个赛季里我们在冠军杯中的表现感到骄傲。”"+ "博斯克还说。对于博斯克在首发中排出了久疏战阵的坎比亚索,赛后有记者提出了质疑,认为完全应该将队内的另一 "+ "名球员帕文派遣上场以加强后卫线。对于这一疑议,博斯克拒绝承担所谓的“责任”,认为球队的首发没有问题。“我们按照整个赛季以来的方式做了,"+ "对于人员上的变化我没有什么可说的。”对于球队在本赛季的前景,博斯克表示皇马还有西甲联赛的冠军作为目标。“皇家马德里在冠军 "+ "杯中战斗到了最后,我们在联赛中也将这么做。”"+ "A Java User Group is a group of people who share a common interest in Java technology and meet on a regular basis to share"+ " technical ideas and information. The actual structure of a JUG can vary greatly - from a small number of friends and coworkers"+ " meeting informally in the evening, to a large group of companies based in the same geographic area. "+ "Regardless of the size and focus of a particular JUG, the sense of community spirit remains the same. ";
outputAnalyzer(a,input); //此处我已经对大文本进行过测试,不会有问题效果很好 outputAnalyzer(a,"我爱共产党 ,,。 I love China 我喜欢唱歌 "); assertAnalyzesTo(a,"我爱共产党 ,,。I love China 我喜欢唱歌", new String[]{"爱","共产党","i","love","china","喜欢","唱歌"}); } }
|