使用 Nest 创建自定义分析器(用于电子邮件地址)
Creating a custom analyzer with Nest (for email address)
我有一个类,其中有一个包含电子邮件地址的 UserID
字段。我希望既能按完整地址在该字段上精确搜索,也能按地址的各个部分搜索(但有一些例外)。
例如,如果 UserID
包含 "john.doe@foo.com",我希望能够搜索 "john.doe@foo.com"、"john" 和 "doe",但是 "foo" 和 "com" 应该从索引中排除。
我尝试创建一个自定义分析器,它使用带有排除词列表的停用词过滤器(stop filter),然后对该属性应用多字段(multi-field)映射,如本例所示:
using Nest;
using System;
using System.Collections.Generic;
/// <summary>
/// Document type stored in the test index. Its only member is the user identifier,
/// which in the samples of this file holds an e-mail address.
/// </summary>
[ElasticType]
public class ElasticUser {
    /// <summary>
    /// Identifier of the user. The attribute requests a not-analyzed index for this
    /// property when the mapping is generated from attributes.
    /// </summary>
    [ElasticProperty(Index = FieldIndexOption.NotAnalyzed)]
    public string UserID { get; set; }
}
/// <summary>
/// Console repro: drops and recreates the test index with a custom stop-word
/// analyzer, maps <c>ElasticUser.UserID</c> as a multi-field (exact value plus an
/// analyzed "searchable" sub-field) and indexes two sample users.
/// </summary>
class Program {
    /// <summary>
    /// Entry point. Runs every step against the node at http://localhost:9200/.
    /// The responses of the index/mapping calls are captured but not inspected.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args) {
        const string indexName = "test_index";

        var node = new Uri("http://localhost:9200/");
        var connection = new ConnectionSettings(node, indexName);
        var elastic = new ElasticClient(connection);

        // Remove any index left over from a previous run of the test.
        var dropResponse = elastic.DeleteIndex(indexName);

        // Custom analyzer: "letter" tokenizer followed by the named stop filter.
        var emailAnalyzer = new CustomAnalyzer {
            Tokenizer = "letter",
            Filter = new List<string> { "user_id_stop_filter" }
        };

        // Stop filter listing the words that should be excluded from the index.
        var domainStopFilter = new StopTokenFilter {
            Stopwords = new[] { "foo", "bar", "com" }
        };

        // Create the index and register the filter and analyzer under their names.
        var createResponse = elastic.CreateIndex(indexName, idx => idx
            .Analysis(an => an
                .TokenFilters(filters => filters
                    .Add("user_id_stop_filter", domainStopFilter))
                .Analyzers(analyzers => analyzers
                    .Add("user_id_analyzer", emailAnalyzer))));

        // Map UserID as a multi-field: the not-analyzed value itself plus a
        // "searchable" sub-field that goes through the custom analyzer.
        var mappingResponse = elastic.Map<ElasticUser>(mapping => mapping
            .MapFromAttributes()
            .Properties(props => props
                .MultiField(mf => mf
                    .Name(u => u.UserID)
                    .Fields(sub => sub
                        .String(exact => exact
                            .Name(u => u.UserID)
                            .Index(FieldIndexOption.NotAnalyzed)
                        )
                        .String(parts => parts
                            .Name(u => u.UserID.Suffix("searchable"))
                            .Analyzer("user_id_analyzer")
                        )
                    )
                )
            )
        );

        // Index a couple of sample entries.
        foreach (var address in new[] { "some.one@foo.com", "another.one@bar.com" }) {
            elastic.Index(new ElasticUser { UserID = address });
        }
    }
}
但是,这似乎不起作用:我只能按完整地址进行精确匹配搜索,电子邮件地址并没有在非单词字符处被拆分成词元。
我怎样才能让这个多重索引按照描述的那样工作?
当我尝试运行这个查询时,没有得到任何结果:
GET /test_index/elasticuser/_search
{
"query": {
"query_string": {
"query": "one"
}
}
}
实现您想要的效果的最简单方法是在 searchable
字段上使用 simple
analyzer:
...
.String(s => s
.Name(p => p.UserID.Suffix("searchable"))
.Analyzer("simple") // <---- change this
)
...
电子邮件地址将在任何非字母(non-letter)字符处被切分成词元,这样您就能够搜索 john
和 doe
。
更新
如果你想保留你的排除列表,你绝对可以这样做。您可以保留现有的分析器,但您需要使用 lowercase
tokenizer(即与 simple
分析器中使用的相同)而不是 letter
。
// Same custom analyzer as in the question, but built on the "lowercase" tokenizer
// (the one the "simple" analyzer uses) instead of "letter".
var user_id_analyzer = new CustomAnalyzer {
    Filter = new List<string> {
        "user_id_stop_filter"
    },
    Tokenizer = "lowercase" // <--- change this
};
更新 2
如果只用纯 JSON,这是我所使用的配置:
# Create the "users" index. The settings define a custom "email_analyzer"
# ("lowercase" tokenizer followed by the "my_stop" stop filter, which lists
# "foo", "bar" and "com"). The mapping declares "email" on type "user" as a
# string with two sub-fields: "raw" (not_analyzed) and "parts" (email_analyzer).
curl -XPUT localhost:9200/users -d '{
"settings": {
"analysis": {
"analyzer": {
"email_analyzer": {
"type": "custom",
"tokenizer": "lowercase",
"filter": [
"my_stop"
]
}
},
"filter": {
"my_stop": {
"type": "stop",
"stopwords": [
"foo",
"bar",
"com"
]
}
}
}
},
"mappings": {
"user": {
"properties": {
"email": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
},
"parts": {
"type": "string",
"analyzer": "email_analyzer"
}
}
}
}
}
}
}'
然后当我分析 some.one@foo.com
时,这就是我得到的结果
$ curl -XGET 'localhost:9200/users/_analyze?field=email.parts&pretty' -d 'some.one@foo.com'
{
"tokens" : [ {
"token" : "some",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 1
}, {
"token" : "one",
"start_offset" : 5,
"end_offset" : 8,
"type" : "word",
"position" : 2
} ]
}
我有一个类,其中有一个包含电子邮件地址的 UserID
字段。我希望既能按完整地址在该字段上精确搜索,也能按地址的各个部分搜索(但有一些例外)。
例如,如果 UserID
包含 "john.doe@foo.com",我希望能够搜索 "john.doe@foo.com"、"john" 和 "doe",但是 "foo" 和 "com" 应该从索引中排除。
我尝试创建一个自定义分析器,它使用带有排除词列表的停用词过滤器(stop filter),然后对该属性应用多字段(multi-field)映射,如本例所示:
using Nest;
using System;
using System.Collections.Generic;
/// <summary>
/// Document type stored in the test index. Its only member is the user identifier,
/// which in the samples of this file holds an e-mail address.
/// </summary>
[ElasticType]
public class ElasticUser {
    /// <summary>
    /// Identifier of the user. The attribute requests a not-analyzed index for this
    /// property when the mapping is generated from attributes.
    /// </summary>
    [ElasticProperty(Index = FieldIndexOption.NotAnalyzed)]
    public string UserID { get; set; }
}
/// <summary>
/// Console repro: drops and recreates the test index with a custom stop-word
/// analyzer, maps <c>ElasticUser.UserID</c> as a multi-field (exact value plus an
/// analyzed "searchable" sub-field) and indexes two sample users.
/// </summary>
class Program {
    /// <summary>
    /// Entry point. Runs every step against the node at http://localhost:9200/.
    /// The responses of the index/mapping calls are captured but not inspected.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args) {
        const string indexName = "test_index";

        var node = new Uri("http://localhost:9200/");
        var connection = new ConnectionSettings(node, indexName);
        var elastic = new ElasticClient(connection);

        // Remove any index left over from a previous run of the test.
        var dropResponse = elastic.DeleteIndex(indexName);

        // Custom analyzer: "letter" tokenizer followed by the named stop filter.
        var emailAnalyzer = new CustomAnalyzer {
            Tokenizer = "letter",
            Filter = new List<string> { "user_id_stop_filter" }
        };

        // Stop filter listing the words that should be excluded from the index.
        var domainStopFilter = new StopTokenFilter {
            Stopwords = new[] { "foo", "bar", "com" }
        };

        // Create the index and register the filter and analyzer under their names.
        var createResponse = elastic.CreateIndex(indexName, idx => idx
            .Analysis(an => an
                .TokenFilters(filters => filters
                    .Add("user_id_stop_filter", domainStopFilter))
                .Analyzers(analyzers => analyzers
                    .Add("user_id_analyzer", emailAnalyzer))));

        // Map UserID as a multi-field: the not-analyzed value itself plus a
        // "searchable" sub-field that goes through the custom analyzer.
        var mappingResponse = elastic.Map<ElasticUser>(mapping => mapping
            .MapFromAttributes()
            .Properties(props => props
                .MultiField(mf => mf
                    .Name(u => u.UserID)
                    .Fields(sub => sub
                        .String(exact => exact
                            .Name(u => u.UserID)
                            .Index(FieldIndexOption.NotAnalyzed)
                        )
                        .String(parts => parts
                            .Name(u => u.UserID.Suffix("searchable"))
                            .Analyzer("user_id_analyzer")
                        )
                    )
                )
            )
        );

        // Index a couple of sample entries.
        foreach (var address in new[] { "some.one@foo.com", "another.one@bar.com" }) {
            elastic.Index(new ElasticUser { UserID = address });
        }
    }
}
但是,这似乎不起作用:我只能按完整地址进行精确匹配搜索,电子邮件地址并没有在非单词字符处被拆分成词元。
我怎样才能让这个多重索引按照描述的那样工作?
当我尝试运行这个查询时,没有得到任何结果:
GET /test_index/elasticuser/_search
{
"query": {
"query_string": {
"query": "one"
}
}
}
实现您想要的效果的最简单方法是在 searchable
字段上使用 simple
analyzer:
...
.String(s => s
.Name(p => p.UserID.Suffix("searchable"))
.Analyzer("simple") // <---- change this
)
...
电子邮件地址将在任何非字母(non-letter)字符处被切分成词元,这样您就能够搜索 john
和 doe
。
更新
如果你想保留你的排除列表,你绝对可以这样做。您可以保留现有的分析器,但您需要使用 lowercase
tokenizer(即与 simple
分析器中使用的相同)而不是 letter
。
// Same custom analyzer as in the question, but built on the "lowercase" tokenizer
// (the one the "simple" analyzer uses) instead of "letter".
var user_id_analyzer = new CustomAnalyzer {
    Filter = new List<string> {
        "user_id_stop_filter"
    },
    Tokenizer = "lowercase" // <--- change this
};
更新 2
如果只用纯 JSON,这是我所使用的配置:
# Create the "users" index. The settings define a custom "email_analyzer"
# ("lowercase" tokenizer followed by the "my_stop" stop filter, which lists
# "foo", "bar" and "com"). The mapping declares "email" on type "user" as a
# string with two sub-fields: "raw" (not_analyzed) and "parts" (email_analyzer).
curl -XPUT localhost:9200/users -d '{
"settings": {
"analysis": {
"analyzer": {
"email_analyzer": {
"type": "custom",
"tokenizer": "lowercase",
"filter": [
"my_stop"
]
}
},
"filter": {
"my_stop": {
"type": "stop",
"stopwords": [
"foo",
"bar",
"com"
]
}
}
}
},
"mappings": {
"user": {
"properties": {
"email": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
},
"parts": {
"type": "string",
"analyzer": "email_analyzer"
}
}
}
}
}
}
}'
然后当我分析 some.one@foo.com
时,这就是我得到的结果
$ curl -XGET 'localhost:9200/users/_analyze?field=email.parts&pretty' -d 'some.one@foo.com'
{
"tokens" : [ {
"token" : "some",
"start_offset" : 0,
"end_offset" : 4,
"type" : "word",
"position" : 1
}, {
"token" : "one",
"start_offset" : 5,
"end_offset" : 8,
"type" : "word",
"position" : 2
} ]
}