Lucene 5.5.0 StopFilter Error
I am trying to use StopFilter in Lucene 5.5.0. I have tried the following:
package lucenedemo;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Iterator;
import org.apache.lucene.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.Version;
public class lucenedemo {

    public static void main(String[] args) throws Exception {
        System.out.println(removeStopWords("hello how are you? I am fine. This is a great day!"));
    }

    public static String removeStopWords(String strInput) throws Exception {
        AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
        StandardTokenizer tokenizer = new StandardTokenizer(factory);
        tokenizer.setReader(new StringReader(strInput));
        tokenizer.reset();

        CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
        TokenStream streamStop = new StopFilter(tokenizer, stopWords);
        StringBuilder sb = new StringBuilder();
        CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class);

        streamStop.reset();
        while (streamStop.incrementToken()) {
            String term = charTermAttribute.toString();
            sb.append(term + " ");
        }
        streamStop.end();
        streamStop.close();
        tokenizer.close();

        return sb.toString();
    }
}
But it gives me the following error:
Exception in thread "main" java.lang.IllegalStateException: TokenStream contract violation: reset()/close() call missing, reset() called multiple times, or subclass does not call super.reset(). Please see Javadocs of TokenStream class for more information about the correct consuming workflow.
    at org.apache.lucene.analysis.Tokenizer.read(Tokenizer.java:109)
    at org.apache.lucene.analysis.standard.StandardTokenizerImpl.zzRefill(StandardTokenizerImpl.java:527)
    at org.apache.lucene.analysis.standard.StandardTokenizerImpl.getNextToken(StandardTokenizerImpl.java:738)
    at org.apache.lucene.analysis.standard.StandardTokenizer.incrementToken(StandardTokenizer.java:159)
    at org.apache.lucene.analysis.util.FilteringTokenFilter.incrementToken(FilteringTokenFilter.java:51)
    at lucenedemo.lucenedemo.removeStopWords(lucenedemo.java:42)
    at lucenedemo.lucenedemo.main(lucenedemo.java:27)
What exactly am I doing wrong? I am closing both the Tokenizer and the TokenStream. Is there something else I am missing?
Calling reset() on the filter will in turn reset the underlying stream. Since you reset the tokenizer manually, and then created a StopFilter with the tokenizer as its underlying stream and reset that as well, the tokenizer ends up being reset twice.
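StopFilter is a TokenFilter, and a TokenFilter's reset() chains to the stream it wraps, roughly like this (a paraphrased sketch, not the verbatim Lucene source):

// Paraphrased sketch of org.apache.lucene.analysis.TokenFilter#reset():
// resetting a filter also resets the stream it wraps, which is why an
// extra tokenizer.reset() before consuming trips the contract check.
@Override
public void reset() throws IOException {
    input.reset();
}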
So just remove this line:
tokenizer.reset();
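For reference, here is a minimal sketch of removeStopWords with that call removed (same imports as in the question). Only the outermost stream, the StopFilter, is reset and closed; since it is a TokenFilter, both calls are forwarded to the wrapped tokenizer:

public static String removeStopWords(String strInput) throws Exception {
    StandardTokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    tokenizer.setReader(new StringReader(strInput));
    // No tokenizer.reset() here: resetting the StopFilter resets its input.

    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    TokenStream streamStop = new StopFilter(tokenizer, stopWords);
    // The filter shares its attribute source with the tokenizer,
    // so the attribute can be fetched from either of them.
    CharTermAttribute charTermAttribute = streamStop.addAttribute(CharTermAttribute.class);

    StringBuilder sb = new StringBuilder();
    streamStop.reset();                  // reset the outermost stream exactly once
    while (streamStop.incrementToken()) {
        sb.append(charTermAttribute.toString()).append(' ');
    }
    streamStop.end();
    streamStop.close();                  // also closes the wrapped tokenizer
    return sb.toString();
}

Note that the default English stop set is lowercase and case-sensitive, so capitalized stop words such as "This" are not removed unless a LowerCaseFilter is placed in the chain before the StopFilter.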