Nutch: highlighting and increasing the summary length
Highlighting is fairly simple, and there are plenty of code samples for it online. The change is as follows:
Change line 54 of org.apache.nutch.searcher.Summary to the following (this is the toString() of the Highlight inner class, which is why super.toString() returns the matched text):

    public String toString() {
        return "<span style='color:red'>" + super.toString() + "</span>";
    }

Increasing the summary length took me quite a while. It later turned out there are two parameters specifically for adjusting it; I simply missed them on a first reading of the code. Around line 36 of org.apache.nutch.searcher.Summarizer you will find:

    /** The number of context terms to display preceding and following matches. */
    /** The total number of terms to display in a summary. */

Both limits are counted in terms. The first parameter, SUM_CONTEXT, is the number of context terms displayed before and after each highlighted keyword (note: the second argument of NutchConf.get().getInt(), 5, is the default value, i.e. the value used when searcher.summary.context is not set). The second, SUM_LENGTH, caps the summary at 100 terms. A term here is one token produced by the analyzer, and the excerpt-extraction algorithm below operates on terms. Alternatively, Lucene can store term offsets in the index, which makes fast highlighting of query keywords possible without re-tokenizing the text at query time, reducing query response time. If the tokenizer is dictionary-based, however, a growing dictionary brings problems of its own; that deserves a separate post.

Below is the modified algorithm. It displays roughly 150 characters; if you need more, adjust the corresponding code.
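For orientation, here is a sketch of how those two fields are defined in Summarizer, reconstructed from the description above rather than copied verbatim from the source; the searcher.summary.length property name parallels the searcher.summary.context one mentioned above, and the fallback values follow the numbers quoted in this post:

    /** The number of context terms to display preceding and following matches. */
    private static final int SUM_CONTEXT =
        NutchConf.get().getInt("searcher.summary.context", 5);

    /** The total number of terms to display in a summary. */
    private static final int SUM_LENGTH =
        NutchConf.get().getInt("searcher.summary.length", 100);

Setting those two properties in nutch-site.xml therefore adjusts both limits without recompiling; editing the defaults in code, as described here, works as well.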
    /** Returns a summary for the given pre-tokenized text. */
    public Summary getSummary(String text, Query query) throws IOException {

        // Simplistic implementation.  Finds the first fragments in the document
        // containing any query terms.
        //
        // TODO: check that phrases in the query are matched in the fragment

        Token[] tokens = getTokens(text);     // parse text to token array
        if (tokens.length == 0)
            return new Summary();

        String[] terms = query.getTerms();
        HashSet highlight = new HashSet();    // put query terms in table
        for (int i = 0; i < terms.length; i++)
            highlight.add(terms[i]);

        //
        // Create a SortedSet that ranks excerpts according to
        // how many query terms are present.  An excerpt is
        // a Vector full of Fragments and Highlights
        //
        SortedSet excerptSet = new TreeSet(new Comparator() {
            public int compare(Object o1, Object o2) {
                Excerpt excerpt1 = (Excerpt) o1;
                Excerpt excerpt2 = (Excerpt) o2;

                if (excerpt1 == null && excerpt2 != null) {
                    return -1;
                } else if (excerpt1 != null && excerpt2 == null) {
                    return 1;
                } else if (excerpt1 == null && excerpt2 == null) {
                    return 0;
                }

                int numToks1 = excerpt1.numUniqueTokens();
                int numToks2 = excerpt2.numUniqueTokens();

                if (numToks1 < numToks2) {
                    return -1;
                } else if (numToks1 == numToks2) {
                    return excerpt1.numFragments() - excerpt2.numFragments();
                } else {
                    return 1;
                }
            }
        });

        //
        // Iterate through all terms in the document
        //
        int lastExcerptPos = 0;
        for (int i = 0; i < tokens.length; i++) {
            //
            // If we find a term that's in the query...
            //
            if (highlight.contains(tokens[i].termText())) {
                //
                // Start searching at a point SUM_CONTEXT terms back,
                // and move SUM_CONTEXT terms into the future.
                // (Modified from the stock i+SUM_CONTEXT to widen each excerpt.)
                //
                int startToken = (i > SUM_CONTEXT) ? i - SUM_CONTEXT : 0;
                int endToken = Math.min(i + SUM_CONTEXT * 20, tokens.length);
                int offset = tokens[startToken].startOffset();
                int j = startToken;

                //
                // Iterate from the start point to the finish, adding
                // terms all the way.  The end of the passage is always
                // SUM_CONTEXT beyond the last query-term.
                //
                Excerpt excerpt = new Excerpt();
                if (i != 0) {
                    excerpt.add(new Summary.Ellipsis());
                }

                //
                // Iterate through as long as we're before the end of
                // the document and we haven't hit the max-number-of-items
                // -in-a-summary.
                //
                Token a = null;
                while ((j < endToken) && (j - startToken < SUM_LENGTH)) {
                    //
                    // Now grab the hit-element, if present
                    //
                    Token t = tokens[j];
                    if (highlight.contains(t.termText())) {
                        excerpt.addToken(t.termText());
                        //System.out.println("Text:"+text.substring(offset, t.startOffset())
                        //    +" OffSet:"+offset+" Start:"+t.startOffset());
                        excerpt.add(new Fragment(text.substring(offset, t.startOffset())));
                        excerpt.add(new Highlight(text.substring(t.startOffset(), t.endOffset())));
                        a = (Token) t.cloneToken();
                        offset = a.endOffset();
                        //endToken = Math.min(j+SUM_LENGTH, tokens.length);
                    }
                    j++;
                }

                // Append the text that trails the last highlighted term.
                if (offset < text.length()
                    && Math.min(endToken, i + SUM_LENGTH) < tokens.length
                    && tokens[Math.min(endToken, i + SUM_LENGTH)].endOffset() < text.length()) {
                    excerpt.add(new Fragment(text.substring(offset,
                        tokens[Math.min(endToken, i + SUM_LENGTH)].endOffset())));
                }

                lastExcerptPos = endToken;

                //
                // We found the series of search-term hits and added
                // them (with intervening text) to the excerpt.  Now
                // we need to add the trailing edge of text.
                //
                // So if (j < tokens.length) then there is still trailing
                // text to add.  (We haven't hit the end of the source doc.)
                // Add the words since the last hit-term insert.
                // (Disabled: the block above already appends the trailing text.)
                //
                // if (j < tokens.length) {
                //     System.out.println(text.length()+" Ooffset:"+offset
                //         +" EndOff:"+tokens[j].endOffset()+" "+text);
                //     excerpt.add(new Fragment(text.substring(offset, offset+tokens[j].endOffset())));
                // }

                //
                // Remember how many terms are in this excerpt
                //
                excerpt.setNumTerms(j - startToken);

                //
                // Store the excerpt for later sorting
                //
                excerptSet.add(excerpt);

                //
                // Start SUM_CONTEXT places away.  The next
                // search for relevant excerpts begins at i-SUM_CONTEXT
                //
                i = j + SUM_CONTEXT;
            }
        }

        //
        // If the target text doesn't appear, then we just
        // excerpt the first SUM_LENGTH words from the document.
        //
        if (excerptSet.size() == 0) {
            Excerpt excerpt = new Excerpt();
            int excerptLen = Math.min(SUM_LENGTH, tokens.length);
            lastExcerptPos = excerptLen;

            excerpt.add(new Fragment(text.substring(tokens[0].startOffset(),
                tokens[excerptLen - 1].startOffset())));
            excerpt.setNumTerms(excerptLen);
            excerptSet.add(excerpt);
        }

        //
        // Now choose the best items from the excerpt set.
        // Stop when our Summary grows too large.
        //
        double tokenCount = 0;
        Summary s = new Summary();
        while (tokenCount <= SUM_LENGTH && excerptSet.size() > 0) {
            Excerpt excerpt = (Excerpt) excerptSet.last();
            excerptSet.remove(excerpt);

            double tokenFraction = (1.0 * excerpt.getNumTerms()) / excerpt.numFragments();
            for (Enumeration e = excerpt.elements(); e.hasMoreElements();) {
                Fragment f = (Fragment) e.nextElement();
                // Don't add fragments if it takes us over the max-limit
                if ((int) (tokenCount + tokenFraction) <= SUM_LENGTH) {
                    s.add(f);
                }
                tokenCount += tokenFraction;
            }
        }

        if (tokenCount > 0 && lastExcerptPos < tokens.length)
            s.add(new Ellipsis());

        return s;
    }
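To see the effect end to end, here is a minimal usage sketch. It assumes the 0.7-era searcher API, i.e. a static Query.parse(String) and a default-constructed Summarizer; check these against your Nutch version, and note the document text is a made-up placeholder:

    import org.apache.nutch.searcher.Query;
    import org.apache.nutch.searcher.Summarizer;
    import org.apache.nutch.searcher.Summary;

    public class SummaryDemo {
        public static void main(String[] args) throws Exception {
            // Hypothetical document text; in the web UI this is the parsed page content.
            String text = "Nutch is an open source web search engine based on Lucene. "
                        + "Nutch builds the index and generates summaries at query time.";

            // Query.parse builds a Nutch query from the user's query string.
            Query query = Query.parse("nutch");

            Summary summary = new Summarizer().getSummary(text, query);

            // With the Highlight.toString() change above, each matched term is
            // wrapped in <span style='color:red'>...</span> in the output.
            System.out.println(summary.toString());
        }
    }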