Elasticsearch Soundex 匹配查询 - NEST
Elasticsearch Soundex Match Query - NEST
有谁知道这为什么不起作用吗?我有两个字段,使用 soundex 分析器对其进行索引(配置见下文),但是当我用与索引中存储的名称发音相近的名称进行搜索时,搜索不起作用。
// Registers a custom analyzer named "soundex_analyzer": the "keyword" tokenizer followed by
// the "lowercase", "icu_folding" and "soundex_filter" token filters, in that order.
anz.Custom("soundex_analyzer", dma => dma
.Tokenizer("keyword")
.Filters("lowercase", "icu_folding", "soundex_filter"));
// Defines the phonetic token filter "soundex_filter" using the RefinedSoundex encoder.
// NOTE(review): Replace(false) presumably keeps the original token next to the encoded one — confirm against the phonetic plugin docs.
tk.Phonetic("soundex_filter", ph => ph.Encoder(PhoneticEncoder.RefinedSoundex).Replace(false));
// Surname copy mapped to the "surnameSoundex" field and analyzed with "soundex_analyzer".
[String(Name = "surnameSoundex", Index = FieldIndexOption.Analyzed, Analyzer = "soundex_analyzer")]
public string SurnameSoundex { get; set; }
// Forename copy mapped to the "forenameSoundex" field and analyzed with "soundex_analyzer".
[String(Name = "forenameSoundex", Index = FieldIndexOption.Analyzed, Analyzer = "soundex_analyzer")]
public string ForenameSoundex { get; set; }
// No usable forename (null, empty, or shorter than 3 characters): match on the surname only.
if (string.IsNullOrEmpty(oReq.person.ForenameSoundex) || oReq.person.ForenameSoundex.Length < 3)
{
_qc = _qd.Match(mt => mt.Field(fld => fld.SurnameSoundex).Query(oReq.person.SurnameSoundex));
_AndQueries.Add(_qc);
// Clear the scratch container once it has been added to the list.
_qc = null;
}
else
{
//search on surname and combination of forename and surname
// C# precedence (&& binds tighter than ||) makes this: surname OR (surname AND forename).
_qc = _qd.Match(mt => mt.Field(fld => fld.SurnameSoundex).Query(oReq.person.SurnameSoundex))
|| _qd.Match(mt => mt.Field(fld => fld.SurnameSoundex).Query(oReq.person.SurnameSoundex))
&& _qd.Match(mt => mt.Field(fld => fld.ForenameSoundex).Query(oReq.person.ForenameSoundex));
_AndQueries.Add(_qc);
_qc = null;
}
查询的构造本身没有问题(我已经检查过了);这些查询会被添加到一个 QueryContainer 列表中,再转换为数组传递给 bool 查询。
我不确定是不是不能在这个分析器中使用 keyword 分词器。
提前致谢!
编辑:
所以基本上我有一个 Person 类,其中定义了我的 POCO 属性:
/// <summary>
/// POCO mapped to the Elasticsearch type "person", with <c>id</c> as the document id property.
/// Each property carries its own field name, index option and analyzer via attributes;
/// the analyzers referenced by name are registered outside this class.
/// </summary>
[ElasticsearchType(Name = "person", IdProperty = "id")]
public class Person
{
// Document identifier; not analyzed.
[String(Name = "id", Index = FieldIndexOption.NotAnalyzed)]
public string id { get; set; }
// Name fields: the plain variants use "low_whit_analyzer", the *Soundex variants use
// "soundex_analyzer" for both indexing and searching.
[String(Name = "forename", Index = FieldIndexOption.Analyzed, Analyzer = "low_whit_analyzer", SearchAnalyzer = "low_whit_analyzer")]
public string forename { get; set; }
[String(Name = "forenameSoundex", Index = FieldIndexOption.Analyzed, Analyzer = "soundex_analyzer", SearchAnalyzer = "soundex_analyzer")]
public string forenameSoundex { get; set; }
[String(Name = "surname", Index = FieldIndexOption.Analyzed, Analyzer = "low_whit_analyzer", SearchAnalyzer = "low_whit_analyzer")]
public string surname { get; set; }
[String(Name = "surnameSoundex", Index = FieldIndexOption.Analyzed, Analyzer = "soundex_analyzer", SearchAnalyzer = "soundex_analyzer")]
public string surnameSoundex { get; set; }
// Date of birth, stored with the "date_optional_time" format.
[Date(Name = "dob", Index = NonStringIndexOption.NotAnalyzed, Format = "date_optional_time")]
public DateTime dob { get; set; }
// Post codes, analyzed with "keyword_analyzer".
[String(Name = "postCode1", Index = FieldIndexOption.Analyzed, Analyzer = "keyword_analyzer", NullValue = null)]
public string postCode1 { get; set; }
[String(Name = "postCode2", Index = FieldIndexOption.Analyzed, Analyzer = "keyword_analyzer", NullValue = null)]
public string postCode2 { get; set; }
// Identifier and contact details.
[String(Name = "identifier", Index = FieldIndexOption.Analyzed, Analyzer = "low_whit_analyzer", NullValue = null)]
public string identifier { get; set; }
[String(Name = "email", Index = FieldIndexOption.Analyzed, Analyzer = "keyword_analyzer", NullValue = null)]
public string email { get; set; }
[String(Name = "mobile", Index = FieldIndexOption.Analyzed, Analyzer = "low_whit_analyzer", NullValue = null)]
public string mobile { get; set; }
[String(Name = "gender", Index = FieldIndexOption.Analyzed, Analyzer = "keyword_analyzer")]
public string gender { get; set; }
// Free-text notes and address lines; none of these are analyzed.
[String(Name = "notes", Index = FieldIndexOption.NotAnalyzed)]
public string notes { get; set; }
[String(Name = "address1", Index = FieldIndexOption.NotAnalyzed, NullValue = null)]
public string address1 { get; set; }
[String(Name = "address2", Index = FieldIndexOption.NotAnalyzed, NullValue = null)]
public string address2 { get; set; }
[String(Name = "personalReferenceId", Index = FieldIndexOption.Analyzed, Analyzer = "low_whit_analyzer")]
public string personalReferenceId { get; set; }
}
然后我使用以下代码创建索引:
// Read the Elasticsearch endpoint from the "ElasticSearchUrl" app setting.
Uri eSAddress = new Uri(ConfigurationManager.AppSettings["ElasticSearchUrl"]);
// Person documents are routed to the "people" index by default.
_clientSettings = new ConnectionSettings(eSAddress)
.MapDefaultTypeIndices(i => i.Add(typeof(Person), "people"));
_client = new ElasticClient(_clientSettings);
// Only create the index when it does not exist yet.
var oRequest = new IndexExistsRequest("people");
var bIndexExists = _client.IndexExists(oRequest);
if (bIndexExists.Exists == false)
{
// One shard, no replicas; translog, token filters and analyzers come from the
// SetupTranslogSettings / SetUpFilters / SetUpAnalyzers helpers, which are not shown here.
// NOTE(review): a refresh interval of -1 normally disables automatic refresh in Elasticsearch —
// confirm a refresh is triggered after the import, otherwise new documents may not be searchable.
var oIndexResponse = _client.CreateIndex("people", c => c
.Settings(st => st
.RefreshInterval(-1)
.Translog((ts) => SetupTranslogSettings(ts))
.NumberOfShards(1)
.NumberOfReplicas(0)
.Analysis(an => an
.TokenFilters((tf) => SetUpFilters(tf))
.Analyzers((anz) => SetUpAnalyzers(anz)
)))
// The mapping is derived from the Person attributes via AutoMap, with the _all field disabled.
.Mappings(mp => mp.Map<Person>(m => m
.AutoMap()
.AllField(al => al.Enabled(false)))));
然后我通过 logstash 使用以下配置从数据库导入记录:
statement => "SELECT IGF_UID AS id, IGF_FORENAME AS forename, IGF_SURNAME AS surname, IGF_FORENAME AS forenameSoundex, IGF_SURNAME AS surnameSoundex,
IGF_DATE_OF_BIRTH AS dob, IGF_POSTCODE1 AS postCode1, IGF_POSTCODE2 AS postCode2, IGF_NHS_NUMBER AS identifier, IGF_EMAIL AS email,
IGF_MOBILE AS mobile, (CASE IGF_SEX
WHEN 'male' THEN 'm'
WHEN 'female' THEN 'f'
WHEN 'transgender' THEN 't'
WHEN 'unknown' THEN 'u'
WHEN '' THEN NULL
ELSE IGF_SEX
END) AS gender, IGF_ADDRESS1 AS address1, IGF_ADDRESS2 AS address2 FROM dbo.IGT_PEOPLE"
}
}
# Drop the @timestamp and @version fields from every event before output.
filter {
mutate {
remove_field => [ "@timestamp", "@version" ]
}
}
# Send every event to Elasticsearch on localhost: index "people", document type "person",
# with the event's id field used as the document id. Template management is switched off.
output {
elasticsearch {
hosts => "localhost"
index => "people"
document_type => "person"
document_id => "%{id}"
manage_template => false
template_overwrite => false
}
}
我的分析器定义如下 - 注意我已经改用 double metaphone 词元过滤器(token filter):
/// <summary>
/// Registers the custom analyzer "soundex_analyzer" on the supplied descriptor.
/// The analyzer uses the "keyword" tokenizer and the "soundex_filter" token filter.
/// </summary>
/// <param name="anz">Analyzers descriptor that receives the analyzer definition.</param>
public static void AddSoundexAnalyzer(ref AnalyzersDescriptor anz)
{
    const string analyzerName = "soundex_analyzer";
    const string tokenizerName = "keyword";
    const string phoneticFilterName = "soundex_filter";

    anz.Custom(analyzerName, custom => custom
        .Tokenizer(tokenizerName)
        .Filters(phoneticFilterName));
}
/// <summary>
/// Registers the phonetic token filter "soundex_filter" on the supplied descriptor,
/// configured with the DoubleMetaphone encoder and Replace(true).
/// </summary>
/// <param name="tk">Token filters descriptor that receives the filter definition.</param>
public static void AddSoundexFilter(ref TokenFiltersDescriptor tk)
{
    const string filterName = "soundex_filter";

    tk.Phonetic(filterName, phonetic => phonetic
        .Encoder(PhoneticEncoder.DoubleMetaphone)
        .Replace(true));
}
然后我使用 bool 查询进行查询,其中各个查询放在 must 或 should 子句中,因此 should 子句中至少应匹配一个查询。
/// <summary>
/// Builds the search descriptor for a person search: name clauses are collected by
/// <c>GetNameSearchClauses</c>, placed into a bool query (should / must) and the
/// results are sorted by descending score.
/// </summary>
/// <param name="oReq">Search request carrying the person data and search parameters.</param>
/// <returns>
/// The populated search descriptor, or <c>null</c> when no clause could be built.
/// </returns>
public SearchDescriptor<Person> FuzzySearch(PersonSearchRequest oReq)
{
    // Reset the per-search state held in fields.
    _s = new SearchDescriptor<Person>();
    _b = new BoolQueryDescriptor<Person>();
    _AndQueries = new List<QueryContainer>();
    _OrQueries = new List<QueryContainer>();

    GetNameSearchClauses(oReq, ref _OrQueries, ref _AndQueries);

    // Nothing to search on: callers receive null rather than an empty query.
    if (_OrQueries.Count == 0 && _AndQueries.Count == 0)
    {
        return null;
    }

    _b.Should(_OrQueries.ToArray());
    _b.Must(_AndQueries.ToArray());
    return _s.Query(qu => qu.Bool((z) => _b)).Sort(srt => srt.Descending(SortSpecialField.Score));
}
然后我的 forenameSoundex 和 surnameSoundex 查询构造在以下方法中:
/// <summary>
/// Adds the soundex name clause for the request to <paramref name="_AndQueries"/>.
/// A clause is only added when soundex searching is enabled and the surname is
/// longer than 3 characters; the forename takes part only when it has at least 3 characters.
/// </summary>
/// <param name="oReq">Search request carrying the person data and search parameters.</param>
/// <param name="_OrQueries">List of optional clauses; not modified by this method.</param>
/// <param name="_AndQueries">List of required clauses; receives the name clause.</param>
public void GetNameSearchClauses(PersonSearchRequest oReq, ref List<QueryContainer> _OrQueries, ref List<QueryContainer> _AndQueries)
{
    // Soundex clauses are only built when the caller asked for them.
    if (oReq.searchParams.useSoundex != true)
    {
        return;
    }

    var surname = oReq.person.surnameSoundex;
    var forename = oReq.person.forenameSoundex;

    // The null/empty test must come before the length test so that a missing surname
    // is skipped instead of throwing a NullReferenceException.
    // NOTE(review): the surname needs more than 3 characters while the forename needs
    // at least 3 — confirm the differing thresholds are intentional.
    if (string.IsNullOrEmpty(surname) || surname.Length <= 3)
    {
        return;
    }

    if (string.IsNullOrEmpty(forename) || forename.Length < 3)
    {
        // No usable forename: search on the surname only.
        _qc = _qd.Match(mt => mt.Field(fld => fld.surnameSoundex).Query(surname));
    }
    else
    {
        // Search on surname, or on the combination of surname and forename.
        // The parentheses only spell out the existing C# precedence (&& before ||).
        // NOTE(review): as a pure match condition this reduces to the surname clause;
        // the combined clause presumably only raises the score — confirm intent.
        _qc = _qd.Match(mt => mt.Field(fld => fld.surnameSoundex).Query(surname))
            || (_qd.Match(mt => mt.Field(fld => fld.surnameSoundex).Query(surname))
                && _qd.Match(mt => mt.Field(fld => fld.forenameSoundex).Query(forename)));
    }

    _AndQueries.Add(_qc);
    _qc = null;
}
问题在于,logstash 的 jdbc 插件默认会自动将列名转换为小写。因此,我在 SQL 语句中为了直接映射到 Elasticsearch 字段名而创建的别名(例如 forenameSoundex)在经过 logstash 时被转换成了小写(forenamesoundex),与映射中的字段名不再一致。
我的 jdbc 配置需要以下行:
lowercase_column_names => false
有谁知道这为什么不起作用吗?我有两个字段,使用 soundex 分析器对其进行索引(配置见下文),但是当我用与索引中存储的名称发音相近的名称进行搜索时,搜索不起作用。
// Registers a custom analyzer named "soundex_analyzer": the "keyword" tokenizer followed by
// the "lowercase", "icu_folding" and "soundex_filter" token filters, in that order.
anz.Custom("soundex_analyzer", dma => dma
.Tokenizer("keyword")
.Filters("lowercase", "icu_folding", "soundex_filter"));
// Defines the phonetic token filter "soundex_filter" using the RefinedSoundex encoder.
// NOTE(review): Replace(false) presumably keeps the original token next to the encoded one — confirm against the phonetic plugin docs.
tk.Phonetic("soundex_filter", ph => ph.Encoder(PhoneticEncoder.RefinedSoundex).Replace(false));
// Surname copy mapped to the "surnameSoundex" field and analyzed with "soundex_analyzer".
[String(Name = "surnameSoundex", Index = FieldIndexOption.Analyzed, Analyzer = "soundex_analyzer")]
public string SurnameSoundex { get; set; }
// Forename copy mapped to the "forenameSoundex" field and analyzed with "soundex_analyzer".
[String(Name = "forenameSoundex", Index = FieldIndexOption.Analyzed, Analyzer = "soundex_analyzer")]
public string ForenameSoundex { get; set; }
// No usable forename (null, empty, or shorter than 3 characters): match on the surname only.
if (string.IsNullOrEmpty(oReq.person.ForenameSoundex) || oReq.person.ForenameSoundex.Length < 3)
{
_qc = _qd.Match(mt => mt.Field(fld => fld.SurnameSoundex).Query(oReq.person.SurnameSoundex));
_AndQueries.Add(_qc);
// Clear the scratch container once it has been added to the list.
_qc = null;
}
else
{
//search on surname and combination of forename and surname
// C# precedence (&& binds tighter than ||) makes this: surname OR (surname AND forename).
_qc = _qd.Match(mt => mt.Field(fld => fld.SurnameSoundex).Query(oReq.person.SurnameSoundex))
|| _qd.Match(mt => mt.Field(fld => fld.SurnameSoundex).Query(oReq.person.SurnameSoundex))
&& _qd.Match(mt => mt.Field(fld => fld.ForenameSoundex).Query(oReq.person.ForenameSoundex));
_AndQueries.Add(_qc);
_qc = null;
}
查询的构造本身没有问题(我已经检查过了);这些查询会被添加到一个 QueryContainer 列表中,再转换为数组传递给 bool 查询。
我不确定是不是不能在这个分析器中使用 keyword 分词器。
提前致谢!
编辑:
所以基本上我有一个 Person 类,其中定义了我的 POCO 属性:
/// <summary>
/// POCO mapped to the Elasticsearch type "person", with <c>id</c> as the document id property.
/// Each property carries its own field name, index option and analyzer via attributes;
/// the analyzers referenced by name are registered outside this class.
/// </summary>
[ElasticsearchType(Name = "person", IdProperty = "id")]
public class Person
{
// Document identifier; not analyzed.
[String(Name = "id", Index = FieldIndexOption.NotAnalyzed)]
public string id { get; set; }
// Name fields: the plain variants use "low_whit_analyzer", the *Soundex variants use
// "soundex_analyzer" for both indexing and searching.
[String(Name = "forename", Index = FieldIndexOption.Analyzed, Analyzer = "low_whit_analyzer", SearchAnalyzer = "low_whit_analyzer")]
public string forename { get; set; }
[String(Name = "forenameSoundex", Index = FieldIndexOption.Analyzed, Analyzer = "soundex_analyzer", SearchAnalyzer = "soundex_analyzer")]
public string forenameSoundex { get; set; }
[String(Name = "surname", Index = FieldIndexOption.Analyzed, Analyzer = "low_whit_analyzer", SearchAnalyzer = "low_whit_analyzer")]
public string surname { get; set; }
[String(Name = "surnameSoundex", Index = FieldIndexOption.Analyzed, Analyzer = "soundex_analyzer", SearchAnalyzer = "soundex_analyzer")]
public string surnameSoundex { get; set; }
// Date of birth, stored with the "date_optional_time" format.
[Date(Name = "dob", Index = NonStringIndexOption.NotAnalyzed, Format = "date_optional_time")]
public DateTime dob { get; set; }
// Post codes, analyzed with "keyword_analyzer".
[String(Name = "postCode1", Index = FieldIndexOption.Analyzed, Analyzer = "keyword_analyzer", NullValue = null)]
public string postCode1 { get; set; }
[String(Name = "postCode2", Index = FieldIndexOption.Analyzed, Analyzer = "keyword_analyzer", NullValue = null)]
public string postCode2 { get; set; }
// Identifier and contact details.
[String(Name = "identifier", Index = FieldIndexOption.Analyzed, Analyzer = "low_whit_analyzer", NullValue = null)]
public string identifier { get; set; }
[String(Name = "email", Index = FieldIndexOption.Analyzed, Analyzer = "keyword_analyzer", NullValue = null)]
public string email { get; set; }
[String(Name = "mobile", Index = FieldIndexOption.Analyzed, Analyzer = "low_whit_analyzer", NullValue = null)]
public string mobile { get; set; }
[String(Name = "gender", Index = FieldIndexOption.Analyzed, Analyzer = "keyword_analyzer")]
public string gender { get; set; }
// Free-text notes and address lines; none of these are analyzed.
[String(Name = "notes", Index = FieldIndexOption.NotAnalyzed)]
public string notes { get; set; }
[String(Name = "address1", Index = FieldIndexOption.NotAnalyzed, NullValue = null)]
public string address1 { get; set; }
[String(Name = "address2", Index = FieldIndexOption.NotAnalyzed, NullValue = null)]
public string address2 { get; set; }
[String(Name = "personalReferenceId", Index = FieldIndexOption.Analyzed, Analyzer = "low_whit_analyzer")]
public string personalReferenceId { get; set; }
}
然后我使用以下代码创建索引:
// Read the Elasticsearch endpoint from the "ElasticSearchUrl" app setting.
Uri eSAddress = new Uri(ConfigurationManager.AppSettings["ElasticSearchUrl"]);
// Person documents are routed to the "people" index by default.
_clientSettings = new ConnectionSettings(eSAddress)
.MapDefaultTypeIndices(i => i.Add(typeof(Person), "people"));
_client = new ElasticClient(_clientSettings);
// Only create the index when it does not exist yet.
var oRequest = new IndexExistsRequest("people");
var bIndexExists = _client.IndexExists(oRequest);
if (bIndexExists.Exists == false)
{
// One shard, no replicas; translog, token filters and analyzers come from the
// SetupTranslogSettings / SetUpFilters / SetUpAnalyzers helpers, which are not shown here.
// NOTE(review): a refresh interval of -1 normally disables automatic refresh in Elasticsearch —
// confirm a refresh is triggered after the import, otherwise new documents may not be searchable.
var oIndexResponse = _client.CreateIndex("people", c => c
.Settings(st => st
.RefreshInterval(-1)
.Translog((ts) => SetupTranslogSettings(ts))
.NumberOfShards(1)
.NumberOfReplicas(0)
.Analysis(an => an
.TokenFilters((tf) => SetUpFilters(tf))
.Analyzers((anz) => SetUpAnalyzers(anz)
)))
// The mapping is derived from the Person attributes via AutoMap, with the _all field disabled.
.Mappings(mp => mp.Map<Person>(m => m
.AutoMap()
.AllField(al => al.Enabled(false)))));
然后我通过 logstash 使用以下配置从数据库导入记录:
statement => "SELECT IGF_UID AS id, IGF_FORENAME AS forename, IGF_SURNAME AS surname, IGF_FORENAME AS forenameSoundex, IGF_SURNAME AS surnameSoundex,
IGF_DATE_OF_BIRTH AS dob, IGF_POSTCODE1 AS postCode1, IGF_POSTCODE2 AS postCode2, IGF_NHS_NUMBER AS identifier, IGF_EMAIL AS email,
IGF_MOBILE AS mobile, (CASE IGF_SEX
WHEN 'male' THEN 'm'
WHEN 'female' THEN 'f'
WHEN 'transgender' THEN 't'
WHEN 'unknown' THEN 'u'
WHEN '' THEN NULL
ELSE IGF_SEX
END) AS gender, IGF_ADDRESS1 AS address1, IGF_ADDRESS2 AS address2 FROM dbo.IGT_PEOPLE"
}
}
# Drop the @timestamp and @version fields from every event before output.
filter {
mutate {
remove_field => [ "@timestamp", "@version" ]
}
}
# Send every event to Elasticsearch on localhost: index "people", document type "person",
# with the event's id field used as the document id. Template management is switched off.
output {
elasticsearch {
hosts => "localhost"
index => "people"
document_type => "person"
document_id => "%{id}"
manage_template => false
template_overwrite => false
}
}
我的分析器定义如下 - 注意我已经改用 double metaphone 词元过滤器(token filter):
/// <summary>
/// Registers the custom analyzer "soundex_analyzer" on the supplied descriptor.
/// The analyzer uses the "keyword" tokenizer and the "soundex_filter" token filter.
/// </summary>
/// <param name="anz">Analyzers descriptor that receives the analyzer definition.</param>
public static void AddSoundexAnalyzer(ref AnalyzersDescriptor anz)
{
    const string analyzerName = "soundex_analyzer";
    const string tokenizerName = "keyword";
    const string phoneticFilterName = "soundex_filter";

    anz.Custom(analyzerName, custom => custom
        .Tokenizer(tokenizerName)
        .Filters(phoneticFilterName));
}
/// <summary>
/// Registers the phonetic token filter "soundex_filter" on the supplied descriptor,
/// configured with the DoubleMetaphone encoder and Replace(true).
/// </summary>
/// <param name="tk">Token filters descriptor that receives the filter definition.</param>
public static void AddSoundexFilter(ref TokenFiltersDescriptor tk)
{
    const string filterName = "soundex_filter";

    tk.Phonetic(filterName, phonetic => phonetic
        .Encoder(PhoneticEncoder.DoubleMetaphone)
        .Replace(true));
}
然后我使用 bool 查询进行查询,其中各个查询放在 must 或 should 子句中,因此 should 子句中至少应匹配一个查询。
/// <summary>
/// Builds the search descriptor for a person search: name clauses are collected by
/// <c>GetNameSearchClauses</c>, placed into a bool query (should / must) and the
/// results are sorted by descending score.
/// </summary>
/// <param name="oReq">Search request carrying the person data and search parameters.</param>
/// <returns>
/// The populated search descriptor, or <c>null</c> when no clause could be built.
/// </returns>
public SearchDescriptor<Person> FuzzySearch(PersonSearchRequest oReq)
{
    // Reset the per-search state held in fields.
    _s = new SearchDescriptor<Person>();
    _b = new BoolQueryDescriptor<Person>();
    _AndQueries = new List<QueryContainer>();
    _OrQueries = new List<QueryContainer>();

    GetNameSearchClauses(oReq, ref _OrQueries, ref _AndQueries);

    // Nothing to search on: callers receive null rather than an empty query.
    if (_OrQueries.Count == 0 && _AndQueries.Count == 0)
    {
        return null;
    }

    _b.Should(_OrQueries.ToArray());
    _b.Must(_AndQueries.ToArray());
    return _s.Query(qu => qu.Bool((z) => _b)).Sort(srt => srt.Descending(SortSpecialField.Score));
}
然后我的 forenameSoundex 和 surnameSoundex 查询构造在以下方法中:
/// <summary>
/// Adds the soundex name clause for the request to <paramref name="_AndQueries"/>.
/// A clause is only added when soundex searching is enabled and the surname is
/// longer than 3 characters; the forename takes part only when it has at least 3 characters.
/// </summary>
/// <param name="oReq">Search request carrying the person data and search parameters.</param>
/// <param name="_OrQueries">List of optional clauses; not modified by this method.</param>
/// <param name="_AndQueries">List of required clauses; receives the name clause.</param>
public void GetNameSearchClauses(PersonSearchRequest oReq, ref List<QueryContainer> _OrQueries, ref List<QueryContainer> _AndQueries)
{
    // Soundex clauses are only built when the caller asked for them.
    if (oReq.searchParams.useSoundex != true)
    {
        return;
    }

    var surname = oReq.person.surnameSoundex;
    var forename = oReq.person.forenameSoundex;

    // The null/empty test must come before the length test so that a missing surname
    // is skipped instead of throwing a NullReferenceException.
    // NOTE(review): the surname needs more than 3 characters while the forename needs
    // at least 3 — confirm the differing thresholds are intentional.
    if (string.IsNullOrEmpty(surname) || surname.Length <= 3)
    {
        return;
    }

    if (string.IsNullOrEmpty(forename) || forename.Length < 3)
    {
        // No usable forename: search on the surname only.
        _qc = _qd.Match(mt => mt.Field(fld => fld.surnameSoundex).Query(surname));
    }
    else
    {
        // Search on surname, or on the combination of surname and forename.
        // The parentheses only spell out the existing C# precedence (&& before ||).
        // NOTE(review): as a pure match condition this reduces to the surname clause;
        // the combined clause presumably only raises the score — confirm intent.
        _qc = _qd.Match(mt => mt.Field(fld => fld.surnameSoundex).Query(surname))
            || (_qd.Match(mt => mt.Field(fld => fld.surnameSoundex).Query(surname))
                && _qd.Match(mt => mt.Field(fld => fld.forenameSoundex).Query(forename)));
    }

    _AndQueries.Add(_qc);
    _qc = null;
}
问题在于,logstash 的 jdbc 插件默认会自动将列名转换为小写。因此,我在 SQL 语句中为了直接映射到 Elasticsearch 字段名而创建的别名(例如 forenameSoundex)在经过 logstash 时被转换成了小写(forenamesoundex),与映射中的字段名不再一致。
我的 jdbc 配置需要以下行:
lowercase_column_names => false