如何将自定义 TokenFilter 从 Lucene.Net 3.0.3 迁移到 4.8
How to migrate custom TokenFilter from Lucene.Net 3.0.3 to 4.8
我有以下适用于 Lucene.Net 3.0.3 的自定义 TokenFilter,我需要将其迁移到 Lucene.Net 4.8:
/// <summary>
/// Token filter (Lucene.Net 3.0.3 API) that replaces each token's term text
/// with the result of <c>RemoveDiacritics()</c>.
/// </summary>
public sealed class AccentFoldingFilter : TokenFilter
{
    // Term attribute looked up on the wrapped stream in the constructor.
    private ITermAttribute _termAttribute;

    /// <summary>
    /// Wraps <paramref name="input"/> and fetches its <c>ITermAttribute</c>.
    /// </summary>
    /// <param name="input">The upstream token stream to filter.</param>
    public AccentFoldingFilter(TokenStream input) : base(input)
    {
        _termAttribute = this.input.GetAttribute<ITermAttribute>();
    }

    /// <summary>
    /// Advances the wrapped stream by one token and rewrites its term text.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the wrapped stream produced a token; otherwise <c>false</c>.
    /// </returns>
    public override bool IncrementToken()
    {
        if (!input.IncrementToken())
        {
            return false;
        }

        // Overwrite the term buffer with the diacritic-free text.
        string folded = _termAttribute.Term.RemoveDiacritics();
        _termAttribute.SetTermBuffer(folded);
        return true;
    }
}
ITermAttribute 不存在了,我想我需要使用 ICharTermAttribute,但我不知道该怎么做。
如何在 4.8 中执行相同的操作?
作为参考,这是 RemoveDiacritics 扩展方法:
/// <summary>
/// Returns <paramref name="text"/> with every UTF-16 unit whose Unicode
/// category is <see cref="UnicodeCategory.NonSpacingMark"/> removed.
/// </summary>
/// <param name="text">The string to strip.</param>
/// <returns>The stripped text, recomposed to Normalization Form C.</returns>
public static string RemoveDiacritics(this string text)
{
    // Form D splits accented letters into a base letter plus separate marks.
    string decomposed = text.Normalize(NormalizationForm.FormD);
    var kept = new StringBuilder();

    for (int index = 0; index < decomposed.Length; index++)
    {
        char current = decomposed[index];
        if (CharUnicodeInfo.GetUnicodeCategory(current) == UnicodeCategory.NonSpacingMark)
        {
            continue;
        }

        kept.Append(current);
    }

    string stripped = kept.ToString();
    return stripped.Normalize(NormalizationForm.FormC);
}
Although you could use the answer below, do note that Lucene.NET 4.8.0 includes an ICUNormalizer2Filter, an ICUNormalizer2CharFilter, and an ICUFoldingFilter in the box. However, you may still be inclined to use your existing solution rather than drag in a 20MB+ dependency (ICU4N).
要完成迁移,您需要将 ICharTermAttribute 直接添加到您的过滤器(而不是 TokenStream)。调用 GetAttribute<ICharTermAttribute>() 时,该属性会从令牌流的共享上下文中取出:
/// <summary>
/// Token filter (Lucene.Net 4.8 API) that replaces each token's term text
/// with the result of <c>RemoveDiacritics()</c>.
/// </summary>
public sealed class AccentFoldingFilter : TokenFilter
{
    // Term text attribute, looked up on this filter in the constructor.
    private ICharTermAttribute _termAttribute;

    /// <summary>
    /// Wraps <paramref name="input"/> and fetches the <c>ICharTermAttribute</c>.
    /// </summary>
    /// <param name="input">The upstream token stream to filter.</param>
    public AccentFoldingFilter(TokenStream input) : base(input)
    {
        // NOTE(review): GetAttribute only looks the attribute up. Lucene's
        // AttributeSource documentation recommends AddAttribute<T>() in
        // consumers, which presumably also registers the attribute when the
        // upstream stream lacks it - confirm before switching.
        _termAttribute = GetAttribute<ICharTermAttribute>();
    }

    /// <summary>
    /// Advances the wrapped stream by one token and rewrites its term text.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the wrapped stream produced a token; otherwise <c>false</c>.
    /// </returns>
    public override bool IncrementToken()
    {
        if (!m_input.IncrementToken())
        {
            return false;
        }

        // Clear the term and write back the diacritic-free text.
        string folded = _termAttribute.ToString().RemoveDiacritics();
        _termAttribute.SetEmpty().Append(folded);
        return true;
    }
}
此外,原来的 RemoveDiacritics 方法实现没有考虑代理对,这可能会导致日后出现难以诊断的错误。下面是考虑了代理对的版本:
/// <summary>
/// Returns <paramref name="text"/> with every code point whose Unicode
/// category is <see cref="UnicodeCategory.NonSpacingMark"/> removed.
/// </summary>
/// <remarks>
/// The text is decomposed to Form D, scanned one code point at a time so that
/// a surrogate pair is classified and copied as a unit, and the surviving
/// characters are recomposed to Form C.
/// </remarks>
/// <param name="text">The string to strip; must not be <c>null</c>.</param>
/// <returns>The stripped text in Normalization Form C.</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="text"/> is <c>null</c>.
/// </exception>
public static string RemoveDiacritics(this string text)
{
    // Fail with a descriptive argument error rather than a
    // NullReferenceException from the Normalize call below.
    if (text == null)
    {
        throw new System.ArgumentNullException(nameof(text));
    }

    var normalizedString = text.Normalize(NormalizationForm.FormD);
    int inputLength = normalizedString.Length;

    // The output can never be longer than the decomposed input.
    char[] buffer = new char[inputLength];
    // TODO: If the strings are short (less than 256 chars),
    // consider using this (must be unsafe context)
    // char* buffer = stackalloc char[inputLength];
    int bufferLength = 0;
    for (int i = 0; i < inputLength;)
    {
        // Handle surrogate pairs: IsSurrogatePair checks that i + 1 is in
        // range, so an unpaired surrogate is treated as a single UTF-16 unit.
        int charCount = char.IsSurrogatePair(normalizedString, i) ? 2 : 1;

        // The (string, index) overload classifies the whole code point at i.
        var unicodeCategory = CharUnicodeInfo.GetUnicodeCategory(normalizedString, i);
        if (unicodeCategory != UnicodeCategory.NonSpacingMark)
        {
            buffer[bufferLength++] = normalizedString[i]; // high surrogate / BMP char
            if (charCount == 2)
            {
                buffer[bufferLength++] = normalizedString[i + 1]; // low surrogate
            }
        }
        i += charCount;
    }
    return new string(buffer, 0, bufferLength).Normalize(NormalizationForm.FormC);
}
我有以下适用于 Lucene.Net 3.0.3 的自定义 TokenFilter,我需要将其迁移到 Lucene.Net 4.8:
/// <summary>
/// Token filter (Lucene.Net 3.0.3 API) that replaces each token's term text
/// with the result of <c>RemoveDiacritics()</c>.
/// </summary>
public sealed class AccentFoldingFilter : TokenFilter
{
    // Term attribute looked up on the wrapped stream in the constructor.
    private ITermAttribute _termAttribute;

    /// <summary>
    /// Wraps <paramref name="input"/> and fetches its <c>ITermAttribute</c>.
    /// </summary>
    /// <param name="input">The upstream token stream to filter.</param>
    public AccentFoldingFilter(TokenStream input) : base(input)
    {
        _termAttribute = this.input.GetAttribute<ITermAttribute>();
    }

    /// <summary>
    /// Advances the wrapped stream by one token and rewrites its term text.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the wrapped stream produced a token; otherwise <c>false</c>.
    /// </returns>
    public override bool IncrementToken()
    {
        if (!input.IncrementToken())
        {
            return false;
        }

        // Overwrite the term buffer with the diacritic-free text.
        string folded = _termAttribute.Term.RemoveDiacritics();
        _termAttribute.SetTermBuffer(folded);
        return true;
    }
}
ITermAttribute 不存在了,我想我需要使用 ICharTermAttribute,但我不知道该怎么做。
如何在 4.8 中执行相同的操作?
作为参考,这是 RemoveDiacritics 扩展方法:
/// <summary>
/// Returns <paramref name="text"/> with every UTF-16 unit whose Unicode
/// category is <see cref="UnicodeCategory.NonSpacingMark"/> removed.
/// </summary>
/// <param name="text">The string to strip.</param>
/// <returns>The stripped text, recomposed to Normalization Form C.</returns>
public static string RemoveDiacritics(this string text)
{
    // Form D splits accented letters into a base letter plus separate marks.
    string decomposed = text.Normalize(NormalizationForm.FormD);
    var kept = new StringBuilder();

    for (int index = 0; index < decomposed.Length; index++)
    {
        char current = decomposed[index];
        if (CharUnicodeInfo.GetUnicodeCategory(current) == UnicodeCategory.NonSpacingMark)
        {
            continue;
        }

        kept.Append(current);
    }

    string stripped = kept.ToString();
    return stripped.Normalize(NormalizationForm.FormC);
}
Although you could use the answer below, do note that Lucene.NET 4.8.0 includes an ICUNormalizer2Filter, an ICUNormalizer2CharFilter, and an ICUFoldingFilter in the box. However, you may still be inclined to use your existing solution rather than drag in a 20MB+ dependency (ICU4N).
要完成迁移,您需要将 ICharTermAttribute 直接添加到您的过滤器(而不是 TokenStream)。调用 GetAttribute<ICharTermAttribute>() 时,该属性会从令牌流的共享上下文中取出:
/// <summary>
/// Token filter (Lucene.Net 4.8 API) that replaces each token's term text
/// with the result of <c>RemoveDiacritics()</c>.
/// </summary>
public sealed class AccentFoldingFilter : TokenFilter
{
    // Term text attribute, looked up on this filter in the constructor.
    private ICharTermAttribute _termAttribute;

    /// <summary>
    /// Wraps <paramref name="input"/> and fetches the <c>ICharTermAttribute</c>.
    /// </summary>
    /// <param name="input">The upstream token stream to filter.</param>
    public AccentFoldingFilter(TokenStream input) : base(input)
    {
        // NOTE(review): GetAttribute only looks the attribute up. Lucene's
        // AttributeSource documentation recommends AddAttribute<T>() in
        // consumers, which presumably also registers the attribute when the
        // upstream stream lacks it - confirm before switching.
        _termAttribute = GetAttribute<ICharTermAttribute>();
    }

    /// <summary>
    /// Advances the wrapped stream by one token and rewrites its term text.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the wrapped stream produced a token; otherwise <c>false</c>.
    /// </returns>
    public override bool IncrementToken()
    {
        if (!m_input.IncrementToken())
        {
            return false;
        }

        // Clear the term and write back the diacritic-free text.
        string folded = _termAttribute.ToString().RemoveDiacritics();
        _termAttribute.SetEmpty().Append(folded);
        return true;
    }
}
此外,原来的 RemoveDiacritics 方法实现没有考虑代理对,这可能会导致日后出现难以诊断的错误。下面是考虑了代理对的版本:
/// <summary>
/// Returns <paramref name="text"/> with every code point whose Unicode
/// category is <see cref="UnicodeCategory.NonSpacingMark"/> removed.
/// </summary>
/// <remarks>
/// The text is decomposed to Form D, scanned one code point at a time so that
/// a surrogate pair is classified and copied as a unit, and the surviving
/// characters are recomposed to Form C.
/// </remarks>
/// <param name="text">The string to strip; must not be <c>null</c>.</param>
/// <returns>The stripped text in Normalization Form C.</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="text"/> is <c>null</c>.
/// </exception>
public static string RemoveDiacritics(this string text)
{
    // Fail with a descriptive argument error rather than a
    // NullReferenceException from the Normalize call below.
    if (text == null)
    {
        throw new System.ArgumentNullException(nameof(text));
    }

    var normalizedString = text.Normalize(NormalizationForm.FormD);
    int inputLength = normalizedString.Length;

    // The output can never be longer than the decomposed input.
    char[] buffer = new char[inputLength];
    // TODO: If the strings are short (less than 256 chars),
    // consider using this (must be unsafe context)
    // char* buffer = stackalloc char[inputLength];
    int bufferLength = 0;
    for (int i = 0; i < inputLength;)
    {
        // Handle surrogate pairs: IsSurrogatePair checks that i + 1 is in
        // range, so an unpaired surrogate is treated as a single UTF-16 unit.
        int charCount = char.IsSurrogatePair(normalizedString, i) ? 2 : 1;

        // The (string, index) overload classifies the whole code point at i.
        var unicodeCategory = CharUnicodeInfo.GetUnicodeCategory(normalizedString, i);
        if (unicodeCategory != UnicodeCategory.NonSpacingMark)
        {
            buffer[bufferLength++] = normalizedString[i]; // high surrogate / BMP char
            if (charCount == 2)
            {
                buffer[bufferLength++] = normalizedString[i + 1]; // low surrogate
            }
        }
        i += charCount;
    }
    return new string(buffer, 0, bufferLength).Normalize(NormalizationForm.FormC);
}