在 Lucene BooleanQuery 中匹配包含空格的整个句子
Matching entire sentence with spaces in lucene BooleanQuery
我有一个搜索字符串,
Tulip INN Riyadhh
Tulip INN Riyadhh LUXURY
Suites of Tulip INN RIYAHdhh
我需要对它们进行搜索:如果我输入
*Tulip INN Riyadhh*
它必须返回以上全部三条记录。我有一个限制:必须在不使用 QueryParser 或 Analyzer 的情况下实现,只能使用 BooleanQuery/WildcardQuery 等查询类。
此致,
拉加文
这里需要的是PhraseQuery
。让我解释一下。
我不知道您使用的是哪种分析器,但为了简单起见,我假设您有一个非常基本的分析器,它只是将文本转换为小写。别说您没有使用分析器:Lucene 必须有分析器才能完成任何工作,至少在索引阶段如此——分析器定义的正是分词器(tokenizer)和词元过滤器(token filter)链。
以下是您的字符串在此示例中的标记化方式:
tulip
inn
riyadhh
tulip
inn
riyadhh
luxury
suites
of
tulip
inn
riyadhh
注意这些都包含标记序列 tulip
inn
riyadhh
。 PhraseQuery
查找的正是这样的标记序列。
在 Lucene.Net 中构建这样的查询如下所示(未经测试):
// Build a phrase query: the three terms must appear consecutively, in this order.
// Each term text must equal the token produced by the analyzer at index time
// (lower-cased here), so the last term is spelled "riyadhh" like the indexed word.
var query = new PhraseQuery();
query.Add(new Term("propertyName", "tulip"));
query.Add(new Term("propertyName", "inn"));
query.Add(new Term("propertyName", "riyadhh"));
请注意,这些术语需要与分析器生成的术语相匹配(在本例中,它们都是小写的)。 QueryParser
会把查询的各个部分交给分析器处理,从而替您完成这项工作;但如果您不使用解析器,则必须自己完成。
现在,为什么 WildcardQuery
或 RegexQuery
在这种情况下不起作用?这些查询始终只匹配 单个 术语,但您需要匹配的是 有序的术语序列 。例如 WildcardQuery
配合词条 Riyadhh*
会找到 所有以 Riyadhh
开头的单词。
由一组 TermQuery
MUST
子句组成的 BooleanQuery
会匹配任何包含这 3 个术语的文本,而不论它们的出现顺序——这同样不是您想要的。
Lucas 的想法是正确的,但是还有一个更专门的 MultiPhraseQuery
,可以根据索引中已有的数据构建查询来实现前缀匹配,正如这个单元测试所演示的那样。 MultiPhraseQuery
的文档如下:
MultiPhraseQuery
is a generalized version of PhraseQuery
, with an added method Add(Term[])
. To use this class, to search for the phrase "Microsoft app*" first use Add(Term)
on the term "Microsoft", then find all terms that have "app" as prefix using IndexReader.GetTerms(Term)
, and use MultiPhraseQuery.Add(Term[] terms)
to add them to the query.
正如 Lucas 所指出的,形如 *something
的 WildcardQuery
是进行后缀匹配的方法,前提是您了解其对性能的影响。
然后可以将它们与 BooleanQuery
组合以获得您想要的结果。
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
namespace LuceneSQLLikeSearch
{
    /// <summary>
    /// Console demo: indexes a few sample strings in an in-memory directory and
    /// runs a "SQL LIKE"-style search built from a <c>WildcardQuery</c> and a
    /// <c>MultiPhraseQuery</c> combined in a <c>BooleanQuery</c>.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds the index, executes the combined query and prints every hit
        /// together with its score, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Prepare...
            // The directory, the writer and the reader are all disposable; wrap
            // them in using statements so they are released even if the search throws.
            using (var dir = new RAMDirectory())
            using (var writer = new IndexWriter(dir,
                new IndexWriterConfig(LuceneVersion.LUCENE_48,
                    new StandardAnalyzer(LuceneVersion.LUCENE_48))))
            {
                WriteIndex(writer);

                // Search...
                using (var reader = writer.GetReader(false))
                {
                    // Matches documents containing a term that ends with "tulip"
                    var wildCardQuery = new WildcardQuery(new Term("field", "*tulip"));

                    // Matches "inn" immediately followed by any indexed term
                    // that starts with "riyadhh"
                    var multiPhraseQuery = new MultiPhraseQuery();
                    multiPhraseQuery.Add(new Term("field", "inn"));
                    multiPhraseQuery.Add(GetPrefixTerms(reader, "field", "riyadhh"));

                    // Either clause is enough for a document to be returned
                    var query = new BooleanQuery();
                    query.Add(wildCardQuery, Occur.SHOULD);
                    query.Add(multiPhraseQuery, Occur.SHOULD);

                    var result = ExecuteSearch(writer, query);
                    foreach (var item in result)
                    {
                        Console.WriteLine("Match: {0} - Score: {1:0.0########}",
                            item.Value, item.Score);
                    }
                }
            }

            Console.ReadKey();
        }
    }
}
写入索引
/// <summary>
/// Adds the sample documents to the index, one document per sample string,
/// each stored in the analyzed text field named "field", then commits.
/// </summary>
/// <param name="writer">Writer that receives the documents and the commit.</param>
public static void WriteIndex(IndexWriter writer)
{
    // Sample values, written in exactly this order.
    string[] sampleValues =
    {
        "Tulip INN Riyadhh",
        "Tulip INN Riyadhh LUXURY",
        "Suites of Tulip INN RIYAHdhh",
        "Suites of Tulip INN RIYAHdhhll",
        "myTulip INN Riyadhh LUXURY",
        "some bogus data that should not match",
    };

    foreach (string text in sampleValues)
    {
        var entry = new Document();
        entry.Add(new TextField("field", text, Field.Store.YES));
        writer.AddDocument(entry);
    }

    writer.Commit();
}
GetPrefixTerms
这里我们扫描索引以查找以传入前缀开头的所有术语。然后将这些术语添加到 MultiPhraseQuery
.
/// <summary>
/// Scans the index for every term of <paramref name="field"/> whose text starts
/// with <paramref name="prefix"/> (ordinal comparison).
/// </summary>
/// <param name="reader">Reader over the index to scan.</param>
/// <param name="field">Name of the field whose terms are enumerated.</param>
/// <param name="prefix">Prefix the term text must start with.</param>
/// <returns>
/// The matching terms in enumeration order; an empty array when the field has
/// no terms or no term starts with the prefix.
/// </returns>
public static Term[] GetPrefixTerms(IndexReader reader, string field, string prefix)
{
    var result = new List<Term>();

    // Per the Lucene documentation, GetFields may return null for a reader
    // without postings and GetTerms returns null for an unknown field.
    var terms = MultiFields.GetFields(reader)?.GetTerms(field);
    if (terms == null)
    {
        return result.ToArray();
    }

    TermsEnum te = terms.GetIterator(null);

    // SeekCeil moves to the first term at or after the prefix. END means no
    // such term exists, in which case the enum is unpositioned and te.Term
    // must not be read.
    if (te.SeekCeil(new BytesRef(prefix)) == TermsEnum.SeekStatus.END)
    {
        return result.ToArray();
    }

    do
    {
        string s = te.Term.Utf8ToString();
        if (!s.StartsWith(prefix, StringComparison.Ordinal))
        {
            // Terms are enumerated in sorted order, so the first non-matching
            // term ends the run of prefix matches.
            break;
        }

        result.Add(new Term(field, s));
    } while (te.Next() != null);

    return result.ToArray();
}
执行搜索
/// <summary>
/// Runs <paramref name="query"/> against the index behind <paramref name="writer"/>
/// and returns at most 10 hits.
/// </summary>
/// <param name="writer">Writer whose index is searched.</param>
/// <param name="query">Query to execute.</param>
/// <returns>
/// One <see cref="SearchResult"/> per hit, holding the stored value of "field"
/// (null when the document has no such field) and the hit's score, in the order
/// the search returned them. If the search throws, the exception is written to
/// the console and the results collected so far are returned.
/// </returns>
public static IList<SearchResult> ExecuteSearch(IndexWriter writer, Query query)
{
    var result = new List<SearchResult>();

    // The manager is created per call, so it must also be disposed per call;
    // otherwise every search leaks the reader it holds.
    using (var searcherManager = new SearcherManager(writer, true, null))
    {
        // Execute the search with a fresh indexSearcher
        searcherManager.MaybeRefreshBlocking();
        var searcher = searcherManager.Acquire();
        try
        {
            var topDocs = searcher.Search(query, 10);
            foreach (var scoreDoc in topDocs.ScoreDocs)
            {
                var doc = searcher.Doc(scoreDoc.Doc);
                result.Add(new SearchResult
                {
                    Value = doc.GetField("field")?.GetStringValue(),
                    // Results are automatically sorted by relevance
                    Score = scoreDoc.Score,
                });
            }
        }
        catch (Exception e)
        {
            // Best-effort demo behaviour: report the failure and return what we have.
            Console.WriteLine(e.ToString());
        }
        finally
        {
            // Every acquired searcher must be released; it is not used after this point.
            searcherManager.Release(searcher);
        }
    }

    return result;
}
搜索结果
/// <summary>
/// One search hit: a text value paired with a score.
/// </summary>
public class SearchResult
{
    /// <summary>Score assigned to this hit.</summary>
    public float Score { get; set; }

    /// <summary>Text value of this hit; may be null.</summary>
    public string Value { get; set; }
}
如果这看起来很麻烦,请注意 QueryParser
可以模仿 "SQL LIKE" 查询。正如此处所指出的,QueryParser
上有一个 AllowLeadingWildcard
选项,可以轻松构建出正确的查询序列。不清楚您为什么会有不能使用它的限制,因为它无疑是完成这项工作最简单的方法。
我有一个搜索字符串,
Tulip INN Riyadhh
Tulip INN Riyadhh LUXURY
Suites of Tulip INN RIYAHdhh
我需要对它们进行搜索:如果我输入
*Tulip INN Riyadhh*
它必须返回以上全部三条记录。我有一个限制:必须在不使用 QueryParser 或 Analyzer 的情况下实现,只能使用 BooleanQuery/WildcardQuery 等查询类。
此致, 拉加文
这里需要的是PhraseQuery
。让我解释一下。
我不知道您使用的是哪种分析器,但为了简单起见,我假设您有一个非常基本的分析器,它只是将文本转换为小写。别说您没有使用分析器:Lucene 必须有分析器才能完成任何工作,至少在索引阶段如此——分析器定义的正是分词器(tokenizer)和词元过滤器(token filter)链。
以下是您的字符串在此示例中的标记化方式:
tulip
inn
riyadhh
tulip
inn
riyadhh
luxury
suites
of
tulip
inn
riyadhh
注意这些都包含标记序列 tulip
inn
riyadhh
。 PhraseQuery
查找的正是这样的标记序列。
在 Lucene.Net 中构建这样的查询如下所示(未经测试):
// Build a phrase query: the three terms must appear consecutively, in this order.
// Each term text must equal the token produced by the analyzer at index time
// (lower-cased here), so the last term is spelled "riyadhh" like the indexed word.
var query = new PhraseQuery();
query.Add(new Term("propertyName", "tulip"));
query.Add(new Term("propertyName", "inn"));
query.Add(new Term("propertyName", "riyadhh"));
请注意,这些术语需要与分析器生成的术语相匹配(在本例中,它们都是小写的)。 QueryParser
会把查询的各个部分交给分析器处理,从而替您完成这项工作;但如果您不使用解析器,则必须自己完成。
现在,为什么 WildcardQuery
或 RegexQuery
在这种情况下不起作用?这些查询始终只匹配 单个 术语,但您需要匹配的是 有序的术语序列 。例如 WildcardQuery
配合词条 Riyadhh*
会找到 所有以 Riyadhh
开头的单词。
由一组 TermQuery
MUST
子句组成的 BooleanQuery
会匹配任何包含这 3 个术语的文本,而不论它们的出现顺序——这同样不是您想要的。
Lucas 的想法是正确的,但是还有一个更专门的 MultiPhraseQuery
,可以根据索引中已有的数据构建查询来实现前缀匹配,正如这个单元测试所演示的那样。 MultiPhraseQuery
的文档如下:
MultiPhraseQuery
is a generalized version of PhraseQuery
, with an added method Add(Term[])
. To use this class, to search for the phrase "Microsoft app*" first use Add(Term)
on the term "Microsoft", then find all terms that have "app" as prefix using IndexReader.GetTerms(Term)
, and use MultiPhraseQuery.Add(Term[] terms)
to add them to the query.
正如 Lucas 所指出的,形如 *something
的 WildcardQuery
是进行后缀匹配的方法,前提是您了解其对性能的影响。
然后可以将它们与 BooleanQuery
组合以获得您想要的结果。
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
namespace LuceneSQLLikeSearch
{
    /// <summary>
    /// Console demo: indexes a few sample strings in an in-memory directory and
    /// runs a "SQL LIKE"-style search built from a <c>WildcardQuery</c> and a
    /// <c>MultiPhraseQuery</c> combined in a <c>BooleanQuery</c>.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds the index, executes the combined query and prints every hit
        /// together with its score, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Prepare...
            // The directory, the writer and the reader are all disposable; wrap
            // them in using statements so they are released even if the search throws.
            using (var dir = new RAMDirectory())
            using (var writer = new IndexWriter(dir,
                new IndexWriterConfig(LuceneVersion.LUCENE_48,
                    new StandardAnalyzer(LuceneVersion.LUCENE_48))))
            {
                WriteIndex(writer);

                // Search...
                using (var reader = writer.GetReader(false))
                {
                    // Matches documents containing a term that ends with "tulip"
                    var wildCardQuery = new WildcardQuery(new Term("field", "*tulip"));

                    // Matches "inn" immediately followed by any indexed term
                    // that starts with "riyadhh"
                    var multiPhraseQuery = new MultiPhraseQuery();
                    multiPhraseQuery.Add(new Term("field", "inn"));
                    multiPhraseQuery.Add(GetPrefixTerms(reader, "field", "riyadhh"));

                    // Either clause is enough for a document to be returned
                    var query = new BooleanQuery();
                    query.Add(wildCardQuery, Occur.SHOULD);
                    query.Add(multiPhraseQuery, Occur.SHOULD);

                    var result = ExecuteSearch(writer, query);
                    foreach (var item in result)
                    {
                        Console.WriteLine("Match: {0} - Score: {1:0.0########}",
                            item.Value, item.Score);
                    }
                }
            }

            Console.ReadKey();
        }
    }
}
写入索引
/// <summary>
/// Adds the sample documents to the index, one document per sample string,
/// each stored in the analyzed text field named "field", then commits.
/// </summary>
/// <param name="writer">Writer that receives the documents and the commit.</param>
public static void WriteIndex(IndexWriter writer)
{
    // Sample values, written in exactly this order.
    string[] sampleValues =
    {
        "Tulip INN Riyadhh",
        "Tulip INN Riyadhh LUXURY",
        "Suites of Tulip INN RIYAHdhh",
        "Suites of Tulip INN RIYAHdhhll",
        "myTulip INN Riyadhh LUXURY",
        "some bogus data that should not match",
    };

    foreach (string text in sampleValues)
    {
        var entry = new Document();
        entry.Add(new TextField("field", text, Field.Store.YES));
        writer.AddDocument(entry);
    }

    writer.Commit();
}
GetPrefixTerms
这里我们扫描索引以查找以传入前缀开头的所有术语。然后将这些术语添加到 MultiPhraseQuery
.
/// <summary>
/// Scans the index for every term of <paramref name="field"/> whose text starts
/// with <paramref name="prefix"/> (ordinal comparison).
/// </summary>
/// <param name="reader">Reader over the index to scan.</param>
/// <param name="field">Name of the field whose terms are enumerated.</param>
/// <param name="prefix">Prefix the term text must start with.</param>
/// <returns>
/// The matching terms in enumeration order; an empty array when the field has
/// no terms or no term starts with the prefix.
/// </returns>
public static Term[] GetPrefixTerms(IndexReader reader, string field, string prefix)
{
    var result = new List<Term>();

    // Per the Lucene documentation, GetFields may return null for a reader
    // without postings and GetTerms returns null for an unknown field.
    var terms = MultiFields.GetFields(reader)?.GetTerms(field);
    if (terms == null)
    {
        return result.ToArray();
    }

    TermsEnum te = terms.GetIterator(null);

    // SeekCeil moves to the first term at or after the prefix. END means no
    // such term exists, in which case the enum is unpositioned and te.Term
    // must not be read.
    if (te.SeekCeil(new BytesRef(prefix)) == TermsEnum.SeekStatus.END)
    {
        return result.ToArray();
    }

    do
    {
        string s = te.Term.Utf8ToString();
        if (!s.StartsWith(prefix, StringComparison.Ordinal))
        {
            // Terms are enumerated in sorted order, so the first non-matching
            // term ends the run of prefix matches.
            break;
        }

        result.Add(new Term(field, s));
    } while (te.Next() != null);

    return result.ToArray();
}
执行搜索
/// <summary>
/// Runs <paramref name="query"/> against the index behind <paramref name="writer"/>
/// and returns at most 10 hits.
/// </summary>
/// <param name="writer">Writer whose index is searched.</param>
/// <param name="query">Query to execute.</param>
/// <returns>
/// One <see cref="SearchResult"/> per hit, holding the stored value of "field"
/// (null when the document has no such field) and the hit's score, in the order
/// the search returned them. If the search throws, the exception is written to
/// the console and the results collected so far are returned.
/// </returns>
public static IList<SearchResult> ExecuteSearch(IndexWriter writer, Query query)
{
    var result = new List<SearchResult>();

    // The manager is created per call, so it must also be disposed per call;
    // otherwise every search leaks the reader it holds.
    using (var searcherManager = new SearcherManager(writer, true, null))
    {
        // Execute the search with a fresh indexSearcher
        searcherManager.MaybeRefreshBlocking();
        var searcher = searcherManager.Acquire();
        try
        {
            var topDocs = searcher.Search(query, 10);
            foreach (var scoreDoc in topDocs.ScoreDocs)
            {
                var doc = searcher.Doc(scoreDoc.Doc);
                result.Add(new SearchResult
                {
                    Value = doc.GetField("field")?.GetStringValue(),
                    // Results are automatically sorted by relevance
                    Score = scoreDoc.Score,
                });
            }
        }
        catch (Exception e)
        {
            // Best-effort demo behaviour: report the failure and return what we have.
            Console.WriteLine(e.ToString());
        }
        finally
        {
            // Every acquired searcher must be released; it is not used after this point.
            searcherManager.Release(searcher);
        }
    }

    return result;
}
搜索结果
/// <summary>
/// One search hit: a text value paired with a score.
/// </summary>
public class SearchResult
{
    /// <summary>Score assigned to this hit.</summary>
    public float Score { get; set; }

    /// <summary>Text value of this hit; may be null.</summary>
    public string Value { get; set; }
}
如果这看起来很麻烦,请注意 QueryParser
可以模仿 "SQL LIKE" 查询。正如此处所指出的,QueryParser
上有一个 AllowLeadingWildcard
选项,可以轻松构建出正确的查询序列。不清楚您为什么会有不能使用它的限制,因为它无疑是完成这项工作最简单的方法。