Apache Lucene 模糊搜索多词短语
Apache Lucene fuzzy search for multi-worded phrases
我有以下 Apache Lucene 7 应用程序:
// Index document.txt into an in-memory directory, analyzed with StandardAnalyzer.
StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
Directory directory = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(standardAnalyzer);
IndexWriter writer = new IndexWriter(directory, config);
Document document = new Document();
// The file content is added as a single TextField named "content".
document.add(new TextField("content", new FileReader("document.txt")));
writer.addDocument(document);
writer.close();
// Run a fuzzy query (max edit distance 2) for the single term "Company" and print the result.
IndexReader reader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);
Query fuzzyQuery = new FuzzyQuery(new Term("content", "Company"), 2);
TopDocs results = searcher.search(fuzzyQuery, 5);
System.out.println("Hits: " + results.totalHits);
System.out.println("Max score:" + results.getMaxScore());
当我使用以下查询时:
new FuzzyQuery(new Term("content", "Company"), 2);
应用程序运行良好,返回的结果如下:
Hits: 1
Max score:0.35161147
但是当我尝试使用多词查询进行搜索时,例如:
new FuzzyQuery(new Term("content", "Company name"), 2);
它返回的结果如下:
Hits: 0
Max score:NaN
然而,短语 Company name
确实存在于源文件 document.txt
中。
在这种情况下,应该如何正确使用 FuzzyQuery
才能对多词短语进行模糊搜索?
已更新
基于提供的解决方案,我根据以下文本信息对其进行了测试:
Company name: BlueCross BlueShield Customer Service
1-800-521-2227
of Texas Preauth-Medical 1-800-441-9188
Preauth-MH/CD 1-800-528-7264
Blue Card Access 1-800-810-2583
对于以下查询:
SpanQuery[] clauses = new SpanQuery[2];
clauses[0] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueCross"), 2));
clauses[1] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueShield"), 2));
SpanNearQuery query = new SpanNearQuery(clauses, 0, true);
搜索正常:
Hits: 1
Max score:0.5753642
但是当我尝试稍微破坏搜索查询时(例如从 BlueCross
到 BlueCros
)
SpanQuery[] clauses = new SpanQuery[2];
clauses[0] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueCros"), 2));
clauses[1] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueShield"), 2));
SpanNearQuery query = new SpanNearQuery(clauses, 0, true);
它就不再匹配,并返回:
Hits: 0
Max score:NaN
这里的问题如下:您使用的是 TextField
,它是会被分词(tokenized)的字段。例如,您的文本 "Company name is working on something"
将被空格(和其他分隔符)有效分割。因此,即使您有文本 Company name
,在索引期间它也会变成 Company
、name
、is
等
在这种情况下,针对单个词项(Term)的查询(例如这里的 FuzzyQuery)无法找到您要查找的内容,因为索引中并不存在 "Company name" 这样包含空格的词项。可以帮助您的技巧如下所示:
// One fuzzy clause (max edit distance 2) per word of the phrase; the words are
// given in lower case because the indexed terms are lower-cased by the analyzer.
SpanQuery[] clauses = new SpanQuery[2];
// Parameterized with <FuzzyQuery> instead of the raw type to avoid unchecked warnings.
clauses[0] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "some"), 2));
clauses[1] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "text"), 2));
// Combine the clauses with slop 0 and inOrder = true.
SpanNearQuery query = new SpanNearQuery(clauses, 0, true);
但是,我不太推荐这种方法,尤其是在负载很大、并且要搜索由多达 10 个词组成的公司名称时。需要注意,这类查询的执行开销可能很大。
BlueCros
的问题如下。您的索引使用 StandardAnalyzer 来分析 TextField,它会把词项转换为小写,也就是说 content
字段中的 BlueCross
变成了 bluecross
。
BlueCros
和 bluecross
之间的编辑距离是 3,超过了 FuzzyQuery 允许的最大编辑距离 2,这就是没有匹配的原因。
简单的建议是先用 .toLowerCase() 把查询中的词项转换为小写。
一般来说,查询时(例如在构造查询时)也应该使用与索引时相同的分析器。
对于 Lucene.Net,可以这样实现:
private string _IndexPath = @"Your Index Path";
private Directory _Directory;
private Searcher _IndexSearcher;
private MultiPhraseQuery _MultiPhraseQuery;

// Open the index read-only.
_Directory = FSDirectory.Open(_IndexPath);
IndexReader indexReader = IndexReader.Open(_Directory, true);
string field = "Name"; // Your field name
string keyword = "big red fox"; // your search term
float fuzzy = 0.7f; // between 0-1
using (_IndexSearcher = new IndexSearcher(indexReader))
{
    // "big red fox" to [big,red,fox]
    var keywordSplit = keyword.Split();
    _MultiPhraseQuery = new MultiPhraseQuery();
    FuzzyTermEnum[] _FuzzyTermEnum = new FuzzyTermEnum[keywordSplit.Length];
    Term[] _Term = new Term[keywordSplit.Length];
    for (int i = 0; i < keywordSplit.Length; i++)
    {
        // Ask the index for a term that fuzzily matches this keyword.
        _FuzzyTermEnum[i] = new FuzzyTermEnum(indexReader, new Term(field, keywordSplit[i]), fuzzy);
        _Term[i] = _FuzzyTermEnum[i].Term;
        if (_Term[i] == null)
        {
            // No fuzzy candidate: fall back to the keyword exactly as typed.
            _MultiPhraseQuery.Add(new Term(field, keywordSplit[i]));
        }
        else
        {
            _MultiPhraseQuery.Add(_Term[i]);
        }
    }
    var results = _IndexSearcher.Search(_MultiPhraseQuery, indexReader.MaxDoc);
    // Visit the hits from highest to lowest score.
    foreach (var loopDoc in results.ScoreDocs.OrderByDescending(s => s.Score))
    {
        //YourCode Here
    }
}
我有以下 Apache Lucene 7 应用程序:
// Index document.txt into an in-memory directory, analyzed with StandardAnalyzer.
StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
Directory directory = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(standardAnalyzer);
IndexWriter writer = new IndexWriter(directory, config);
Document document = new Document();
// The file content is added as a single TextField named "content".
document.add(new TextField("content", new FileReader("document.txt")));
writer.addDocument(document);
writer.close();
// Run a fuzzy query (max edit distance 2) for the single term "Company" and print the result.
IndexReader reader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);
Query fuzzyQuery = new FuzzyQuery(new Term("content", "Company"), 2);
TopDocs results = searcher.search(fuzzyQuery, 5);
System.out.println("Hits: " + results.totalHits);
System.out.println("Max score:" + results.getMaxScore());
当我使用以下查询时:
new FuzzyQuery(new Term("content", "Company"), 2);
应用程序运行良好,返回的结果如下:
Hits: 1
Max score:0.35161147
但是当我尝试使用多词查询进行搜索时,例如:
new FuzzyQuery(new Term("content", "Company name"), 2);
它返回的结果如下:
Hits: 0
Max score:NaN
然而,短语 Company name
确实存在于源文件 document.txt
中。
在这种情况下,应该如何正确使用 FuzzyQuery
才能对多词短语进行模糊搜索?
已更新
基于提供的解决方案,我根据以下文本信息对其进行了测试:
Company name: BlueCross BlueShield Customer Service
1-800-521-2227
of Texas Preauth-Medical 1-800-441-9188
Preauth-MH/CD 1-800-528-7264
Blue Card Access 1-800-810-2583
对于以下查询:
SpanQuery[] clauses = new SpanQuery[2];
clauses[0] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueCross"), 2));
clauses[1] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueShield"), 2));
SpanNearQuery query = new SpanNearQuery(clauses, 0, true);
搜索正常:
Hits: 1
Max score:0.5753642
但是当我尝试稍微破坏搜索查询时(例如从 BlueCross
到 BlueCros
)
SpanQuery[] clauses = new SpanQuery[2];
clauses[0] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueCros"), 2));
clauses[1] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "BlueShield"), 2));
SpanNearQuery query = new SpanNearQuery(clauses, 0, true);
它就不再匹配,并返回:
Hits: 0
Max score:NaN
这里的问题如下:您使用的是 TextField
,它是会被分词(tokenized)的字段。例如,您的文本 "Company name is working on something"
将被空格(和其他分隔符)有效分割。因此,即使您有文本 Company name
,在索引期间它也会变成 Company
、name
、is
等
在这种情况下,针对单个词项(Term)的查询(例如这里的 FuzzyQuery)无法找到您要查找的内容,因为索引中并不存在 "Company name" 这样包含空格的词项。可以帮助您的技巧如下所示:
// One fuzzy clause (max edit distance 2) per word of the phrase; the words are
// given in lower case because the indexed terms are lower-cased by the analyzer.
SpanQuery[] clauses = new SpanQuery[2];
// Parameterized with <FuzzyQuery> instead of the raw type to avoid unchecked warnings.
clauses[0] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "some"), 2));
clauses[1] = new SpanMultiTermQueryWrapper<FuzzyQuery>(new FuzzyQuery(new Term("content", "text"), 2));
// Combine the clauses with slop 0 and inOrder = true.
SpanNearQuery query = new SpanNearQuery(clauses, 0, true);
但是,我不太推荐这种方法,尤其是在负载很大、并且要搜索由多达 10 个词组成的公司名称时。需要注意,这类查询的执行开销可能很大。
BlueCros
的问题如下。您的索引使用 StandardAnalyzer 来分析 TextField,它会把词项转换为小写,也就是说 content
字段中的 BlueCross
变成了 bluecross
。
BlueCros
和 bluecross
之间的编辑距离是 3,超过了 FuzzyQuery 允许的最大编辑距离 2,这就是没有匹配的原因。
简单的建议是先用 .toLowerCase() 把查询中的词项转换为小写。
一般来说,查询时(例如在构造查询时)也应该使用与索引时相同的分析器。
对于 Lucene.Net,可以这样实现:
private string _IndexPath = @"Your Index Path";
private Directory _Directory;
private Searcher _IndexSearcher;
private MultiPhraseQuery _MultiPhraseQuery;

// Open the index read-only.
_Directory = FSDirectory.Open(_IndexPath);
IndexReader indexReader = IndexReader.Open(_Directory, true);
string field = "Name"; // Your field name
string keyword = "big red fox"; // your search term
float fuzzy = 0.7f; // between 0-1
using (_IndexSearcher = new IndexSearcher(indexReader))
{
    // "big red fox" to [big,red,fox]
    var keywordSplit = keyword.Split();
    _MultiPhraseQuery = new MultiPhraseQuery();
    FuzzyTermEnum[] _FuzzyTermEnum = new FuzzyTermEnum[keywordSplit.Length];
    Term[] _Term = new Term[keywordSplit.Length];
    for (int i = 0; i < keywordSplit.Length; i++)
    {
        // Ask the index for a term that fuzzily matches this keyword.
        _FuzzyTermEnum[i] = new FuzzyTermEnum(indexReader, new Term(field, keywordSplit[i]), fuzzy);
        _Term[i] = _FuzzyTermEnum[i].Term;
        if (_Term[i] == null)
        {
            // No fuzzy candidate: fall back to the keyword exactly as typed.
            _MultiPhraseQuery.Add(new Term(field, keywordSplit[i]));
        }
        else
        {
            _MultiPhraseQuery.Add(_Term[i]);
        }
    }
    var results = _IndexSearcher.Search(_MultiPhraseQuery, indexReader.MaxDoc);
    // Visit the hits from highest to lowest score.
    foreach (var loopDoc in results.ScoreDocs.OrderByDescending(s => s.Score))
    {
        //YourCode Here
    }
}