Generate N-grams while preserving spaces in Apache Lucene
I am trying to generate N-grams for a given set of input texts using Apache Lucene 5.5.4. Below is my Java code that does this.
public static void main( String[] args )
{
Analyzer analyzer = createAnalyzer( 2 );
List<String> nGrams = generateNgrams( analyzer, "blah1 blah2 blah3" );
for ( String nGram : nGrams ) {
System.out.println( nGram );
}
}
public static Analyzer createAnalyzer( final int shingles )
{
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents( @NotNull String field )
{
final Tokenizer source = new WhitespaceTokenizer();
final ShingleFilter shingleFilter = new ShingleFilter( new LowerCaseFilter( source ), shingles );
shingleFilter.setOutputUnigrams( true );
return new TokenStreamComponents( source, shingleFilter );
}
};
}
public static List<String> generateNgrams( Analyzer analyzer, String str )
{
List<String> result = new ArrayList<>();
try {
TokenStream stream = analyzer.tokenStream( null, new StringReader( str ) );
stream.reset();
while ( stream.incrementToken() ) {
String nGram = stream.getAttribute( CharTermAttribute.class ).toString();
result.add( nGram );
LOG.debug( "Generated N-gram = {}", nGram );
}
} catch ( IOException e ) {
LOG.error( "IO Exception occured! {}", e );
}
return result;
}
For my input blah1 blah2 blah3, the output is as follows, and it is fine with me:
blah1
blah1 blah2
blah2
blah2 blah3
blah3
However, when the input is Foo bar Foo2 with runs of more than one space between the words, my requirement is to generate the following output:
Foo
Foo bar
bar
bar Foo2
Foo2
If you notice, I have to keep the spaces between two words exactly as they appear in the input (for example Foo  bar with its original run of spaces, rather than a normalized single-space Foo bar).
Is there some tweak that would let Lucene handle this internally? Perhaps something small such as adding a filter? I am new to Lucene, so I do not know where to start.
Thanks in advance.
I had to write a custom tokenizer and a trim filter to achieve this.
1) I created an abstract class DelimiterPreservingCharTokenizer by extending org.apache.lucene.analysis.Tokenizer. My implementation of its incrementToken method is given below. I would have extended org.apache.lucene.analysis.util.CharTokenizer instead, but its incrementToken method is final and cannot be overridden. DelimiterPreservingCharTokenizer looks like this:
package lucene.tokenizers;
import java.io.IOException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.util.AttributeFactory;
/**
*
* @author Arun Gowda.
* This class is exactly same as {@link CharTokenizer}. Except that, the stream will have leading delimiters. This is to support N-gram vicinity matches.
*
* We are creating a new class instead of extending CharTokenizer because, incrementToken method is final and we can not override it.
*
*/
public abstract class DelimiterPreservingCharTokenizer extends Tokenizer
{
/**
* Creates a new {@link DelimiterPreservingCharTokenizer} instance
*/
public DelimiterPreservingCharTokenizer()
{}
/**
* Creates a new {@link DelimiterPreservingCharTokenizer} instance
*
* @param factory
* the attribute factory to use for this {@link Tokenizer}
*/
public DelimiterPreservingCharTokenizer( AttributeFactory factory )
{
super( factory );
}
private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
private final CharTermAttribute termAtt = addAttribute( CharTermAttribute.class );
private final OffsetAttribute offsetAtt = addAttribute( OffsetAttribute.class );
private final CharacterUtils charUtils = CharacterUtils.getInstance();
private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer( IO_BUFFER_SIZE );
/**
* Returns true iff a codepoint should be included in a token. This tokenizer
* generates as tokens adjacent sequences of codepoints which satisfy this
* predicate. Codepoints for which this is false are used to define token
* boundaries and are not included in tokens.
*/
protected abstract boolean isTokenChar( int c );
/**
* Called on each token character to normalize it before it is added to the
* token. The default implementation does nothing. Subclasses may use this to,
* e.g., lowercase tokens.
*/
protected int normalize( int c )
{
return c;
}
@Override
public final boolean incrementToken() throws IOException
{
clearAttributes();
int length = 0;
int start = -1; // this variable is always initialized
int end = -1;
char[] buffer = termAtt.buffer();
while ( true ) {
if ( bufferIndex >= dataLen ) {
offset += dataLen;
charUtils.fill( ioBuffer, input ); // read supplementary char aware with CharacterUtils
if ( ioBuffer.getLength() == 0 ) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if ( length > 0 ) {
break;
} else {
finalOffset = correctOffset( offset );
return false;
}
}
dataLen = ioBuffer.getLength();
bufferIndex = 0;
}
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
final int c = charUtils.codePointAt( ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength() );
final int charCount = Character.charCount( c );
bufferIndex += charCount;
if ( isTokenChar( c ) ) { // if it's a token char
if ( length == 0 ) { // start of token
assert start == -1;
start = offset + bufferIndex - charCount;
end = start;
} else if ( length >= buffer.length - 1 ) { // check if a supplementary could run out of bounds
buffer = termAtt.resizeBuffer( 2 + length ); // make sure a supplementary fits in the buffer
}
end += charCount;
length += Character.toChars( normalize( c ), buffer, length ); // buffer it, normalized
if ( length >= MAX_WORD_LEN ) // buffer overflow! make sure to check for >= surrogate pair could break == test
break;
} else if ( length > 0 ) // at non-Letter w/ chars
break; // return 'em
}
if ( length > 0 && bufferIndex < ioBuffer.getLength() ) {//If at least one token is found,
//THIS IS THE PART WHICH IS DIFFERENT FROM LUCENE's CHARTOKENIZER
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
int c = charUtils.codePointAt( ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength() );
int charCount = Character.charCount( c );
bufferIndex += charCount;
while ( !isTokenChar( c ) && bufferIndex < ioBuffer.getLength() ) {// As long as we find delimiter(not token char), keep appending it to output stream.
if ( length >= buffer.length - 1 ) { // check if a supplementary could run out of bounds
buffer = termAtt.resizeBuffer( 2 + length ); // make sure a supplementary fits in the buffer
}
end += charCount;
length += Character.toChars( normalize( c ), buffer, length ); // buffer it, normalized
if ( length >= MAX_WORD_LEN ) {// buffer overflow! make sure to check for >= surrogate pair could break == test
break;
}
c = charUtils.codePointAt( ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength() );
charCount = Character.charCount( c );
bufferIndex += charCount;
}
//ShingleFilter will add a delimiter. Hence, the last iteration is skipped.
//That is, for input `abc def ghi`, this tokenizer will return `abc `(2 spaces only). Then, Shingle filter will by default add another delimiter making it `abc `(3 spaces as it is in the input).
//If there are N delimiters, this token will at max return N-1 delimiters
bufferIndex -= charCount;
}
termAtt.setLength( length );
assert start != -1;
offsetAtt.setOffset( correctOffset( start ), finalOffset = correctOffset( end ) );
return true;
}
@Override
public final void end() throws IOException
{
super.end();
// set final offset
offsetAtt.setOffset( finalOffset, finalOffset );
}
@Override
public void reset() throws IOException
{
super.reset();
bufferIndex = 0;
offset = 0;
dataLen = 0;
finalOffset = 0;
ioBuffer.reset(); // make sure to reset the IO buffer!!
}
}
2) A concrete class, WhiteSpacePreservingTokenizer, extends the abstract class above and defines whitespace as the delimiter; a small standalone usage sketch follows the class.
package spellcheck.lucene.tokenizers;
import lucene.tokenizers.DelimiterPreservingCharTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.AttributeFactory;
/**
*
* @author Arun Gowda
*
* This class is exactly same as {@link WhitespaceTokenizer} Only difference is, it extends DelimiterPreservingCharTokenizer instead of CharTokenizer
*/
public class WhiteSpacePreservingTokenizer extends DelimiterPreservingCharTokenizer
{
/**
* Construct a new WhitespaceTokenizer.
*/
public WhiteSpacePreservingTokenizer()
{}
/**
* Construct a new WhitespaceTokenizer using a given
* {@link org.apache.lucene.util.AttributeFactory}.
*
* @param factory
* the attribute factory to use for this {@link Tokenizer}
*/
public WhiteSpacePreservingTokenizer( AttributeFactory factory )
{
super( factory );
}
/** Collects only characters which do not satisfy
* {@link Character#isWhitespace(int)}.*/
@Override
protected boolean isTokenChar( int c )
{
return !Character.isWhitespace( c );
}
}
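To see what the tokenizer emits on its own, here is a minimal standalone sketch; it assumes Lucene 5.5.4 and the two classes above on the classpath, and the class name TokenizerDemo is only for illustration. Per the comments in incrementToken, each token keeps all but the last of the delimiters that follow it, and the ShingleFilter later adds that remaining delimiter back when it joins tokens.
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import spellcheck.lucene.tokenizers.WhiteSpacePreservingTokenizer;
public class TokenizerDemo
{
    public static void main( String[] args ) throws Exception
    {
        Tokenizer tokenizer = new WhiteSpacePreservingTokenizer();
        tokenizer.setReader( new StringReader( "abc   def ghi" ) );
        CharTermAttribute termAtt = tokenizer.addAttribute( CharTermAttribute.class );
        tokenizer.reset();
        while ( tokenizer.incrementToken() ) {
            // Brackets make the preserved trailing whitespace visible,
            // e.g. [abc  ] for "abc" followed by three spaces in the input.
            System.out.println( "[" + termAtt.toString() + "]" );
        }
        tokenizer.end();
        tokenizer.close();
    }
}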
3) The tokenizer above produces tokens with trailing spaces (for example blah____, i.e. blah followed by trailing delimiters). We need a filter to trim those spaces, so we need the DelimiterTrimFilter below. (We could also trim with Java's String.trim(), but that would be inefficient because it creates new String objects.)
package spellcheck.lucene.filters;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class DelimiterTrimFilter extends TokenFilter
{
private final CharTermAttribute termAtt = addAttribute( CharTermAttribute.class );
private char delimiter;
/**
* Create a new {@link DelimiterTrimFilter}.
* @param in the stream to consume
* @param delimiterToTrim delimiter that should be trimmed
*/
public DelimiterTrimFilter( TokenStream in, char delimiterToTrim )
{
super( in );
this.delimiter = delimiterToTrim;
}
@Override
public boolean incrementToken() throws IOException
{
if ( !input.incrementToken() )
return false;
char[] termBuffer = termAtt.buffer();
int len = termAtt.length();
if ( len == 0 ) {
return true;
}
int start = 0;
int end = 0;
// eat the first characters
for ( start = 0; start < len && termBuffer[start] == delimiter; start++ ) {
}
// eat the end characters
for ( end = len; end >= start && termBuffer[end - 1] == delimiter; end-- ) {
}
if ( start > 0 || end < len ) {
if ( start < end ) {
termAtt.copyBuffer( termBuffer, start, ( end - start ) );
} else {
termAtt.setEmpty();
}
}
return true;
}
}
4) My createAnalyzer now looks like this:
public static Analyzer createAnalyzer( final int shingles )
{
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents( @NotNull String field )
{
final Tokenizer source = new WhiteSpacePreservingTokenizer();
TokenStream filter = new ShingleFilter( new LowerCaseFilter( source ), shingles );
filter = new DelimiterTrimFilter( filter, ' ' );
return new TokenStreamComponents( source, filter );
}
};
}
The rest of the code stays the same.
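With these pieces in place, the driver code from the question can be reused as is. The following is a minimal sketch of the end-to-end call (the bracket markers and the sample input are only for illustration); note that the LowerCaseFilter in the chain lower-cases the terms, while the runs of whitespace between words are carried through into the shingles.
Analyzer analyzer = createAnalyzer( 2 );
// Input with a double space between the first two words; the shingles keep that spacing.
for ( String nGram : generateNgrams( analyzer, "Foo  bar Foo2" ) ) {
    System.out.println( "[" + nGram + "]" );
}
analyzer.close();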