使用 Elasticsearch 后端在 Django Haystack 中启用同义词
Enabling Synonyms in Django Haystack with Elasticsearch Backend
我在 Haystack 中使用自定义的 Elasticsearch 后端时,同义词过滤器出现了问题。
目前我要做的只是创建一个用于测试的同义词:我想为单词 'lipstick' 添加一个同义词 'tricklenutz'。
我正在使用以下自定义 haystack 后端:
from django.conf import settings
from haystack.backends.elasticsearch_backend import (ElasticsearchSearchBackend,
ElasticsearchSearchEngine)
class SiteElasticBackend(ElasticsearchSearchBackend):
    """Elasticsearch backend that swaps in custom index analysis settings.

    Once the stock backend has been initialised, ``DEFAULT_SETTINGS`` is
    overridden on the instance with a settings dict declaring a ``synonym``
    token filter plus three custom analyzers that reference it.
    """

    def __init__(self, connection_alias, **connection_options):
        """Initialise the parent backend, then install the custom settings.

        Args:
            connection_alias: Forwarded unchanged to the parent backend.
            **connection_options: Forwarded unchanged to the parent backend.
        """
        super(SiteElasticBackend, self).__init__(
            connection_alias, **connection_options)

        def custom_analyzer(*token_filters):
            # Every analyzer here shares the "lowercase" tokenizer and only
            # differs in its token-filter chain.
            return {
                "type": "custom",
                "tokenizer": "lowercase",
                "filter": list(token_filters),
            }

        # NOTE(review): nothing in this class attaches these analyzers to a
        # field; confirm where (if anywhere) the field mappings reference
        # "synonym_analyzer".
        analyzers = {
            "synonym_analyzer": custom_analyzer("synonym"),
            "ngram_analyzer": custom_analyzer("haystack_ngram", "synonym"),
            "edgengram_analyzer": custom_analyzer(
                "haystack_edgengram", "synonym"),
        }

        tokenizers = {
            "haystack_ngram_tokenizer": {
                "type": "nGram",
                "min_gram": 3,
                "max_gram": 15,
            },
            "haystack_edgengram_tokenizer": {
                "type": "edgeNGram",
                "min_gram": 2,
                "max_gram": 15,
                "side": "front",
            },
        }

        token_filters = {
            "synonym": {
                "type": "synonym",
                # Single test pair: the two words are declared equivalent.
                "synonyms": ["tricklenutz, lipstick"],
            },
            "haystack_ngram": {
                "type": "nGram",
                "min_gram": 3,
                "max_gram": 15,
            },
            "haystack_edgengram": {
                "type": "edgeNGram",
                "min_gram": 5,
                "max_gram": 15,
            },
        }

        # Set on the instance (after the parent __init__) so this backend's
        # settings replace the inherited DEFAULT_SETTINGS.
        self.DEFAULT_SETTINGS = {
            'settings': {
                "analysis": {
                    "analyzer": analyzers,
                    "tokenizer": tokenizers,
                    "filter": token_filters,
                },
            },
        }
class ConfigurableElasticSearchEngine(ElasticsearchSearchEngine):
    """Search engine that uses ``SiteElasticBackend`` as its backend."""

    backend = SiteElasticBackend
如您所见,我只是想为 'lipstick' 创建一个 'tricklenutz' 的同义词(一个不会出现在任何搜索中的词)。
我的 settings.py 文件中有以下条目:
# Haystack connection settings: the 'default' connection uses the custom
# engine class by dotted path and targets a local Elasticsearch URL.
HAYSTACK_CONNECTIONS = {
'default': {
# Dotted path to the custom engine class.
'ENGINE': 'search.backends.site_elasticsearch_backend.ConfigurableElasticSearchEngine',
'URL': 'http://127.0.0.1:9200/',
# Same index name that is queried later via the _settings endpoint.
'INDEX_NAME': 'sitename'
},
}
这是我为 Brand 编写的 search_index.py:
class BrandIndex(indexes.SearchIndex, indexes.Indexable):
    """Search index definition for the ``Brand`` model."""

    # Primary document field, rendered from a template.
    text = indexes.CharField(document=True, use_template=True)
    # Filled in prepare() with a copy of the rendered ``text`` value.
    ngram_text = indexes.EdgeNgramField()
    # The model's ``name`` attribute is indexed twice: n-gram and plain.
    name = indexes.NgramField(model_attr='name')
    brand_name = indexes.CharField(model_attr='name')
    created_date = indexes.DateTimeField(model_attr='created_date')

    def get_model(self):
        """Return the model class this index covers."""
        return Brand

    def prepare(self, obj):
        """Return the prepared data with ``text`` mirrored into ``ngram_text``."""
        document = super(BrandIndex, self).prepare(obj)
        rendered_text = document['text']
        document['ngram_text'] = rendered_text
        return document

    def index_queryset(self, using=None):
        """Return the brands whose ``created_date`` is not in the future."""
        cutoff = datetime.datetime.now()
        return Brand.objects.filter(created_date__lte=cutoff)
这里是搜索的视图部分:
class BrandListSearchResults(ListSearchResultsViewMixin, BrandListBase):
    """Paginated list view of brands matching ``self.search_term``."""

    template_name = 'search/brand/search.html'
    page_template = 'search/brand/page.html'
    paginate_by = 50
    paginate_by_first = 50

    def get_queryset(self):
        """Return the search queryset filtered on ``text`` by the search term."""
        search_queryset = self.get_sqs()
        return search_queryset.filter(text=self.search_term)

    def get_context_data(self, **kwargs):
        """Extend the parent context with meta, pagination and size data."""
        context = super(BrandListSearchResults, self).get_context_data(**kwargs)
        term = self.search_term
        context['meta'] = Meta(
            title='All brands matching the search term %s' % term,
            description='Brand search results for %s' % term
        )
        context.update({
            'paginate_by': self.paginate_by,
            'paginate_by_first': self.paginate_by_first,
            'size_list': ["90", "110", "185"],
        })
        return context
我重新运行我的索引,但同义词似乎不起作用。
有什么方法可以查询 Elasticsearch 以查看同义词是否确实存在? haystack 管理命令并没有很详细地说明它对自定义过滤器等的作用
更新
我已经能够直接从 elasticsearch 查询我的设置,我看到那里有同义词:
curl -XGET 'http://localhost:9200/sitename/_settings?pretty'
{
"sitename" : {
"settings" : {
"index" : {
"creation_date" : "1427470212556",
"uuid" : "6eznekoORQKqwswTq1G24w",
"analysis" : {
"analyzer" : {
"synonym_analyzer" : {
"type" : "custom",
"filter" : [ "synonym" ],
"tokenizer" : "lowercase"
},
"ngram_analyzer" : {
"type" : "custom",
"filter" : [ "haystack_ngram", "synonym" ],
"tokenizer" : "lowercase"
},
"edgengram_analyzer" : {
"type" : "custom",
"filter" : [ "haystack_edgengram", "synonym" ],
"tokenizer" : "lowercase"
}
},
"filter" : {
"haystack_ngram" : {
"type" : "nGram",
"min_gram" : "3",
"max_gram" : "15"
},
"haystack_edgengram" : {
"type" : "edgeNGram",
"min_gram" : "5",
"max_gram" : "15"
},
"synonym" : {
"type" : "synonym",
"synonyms" : [ "tricklenutz, lipstick" ]
}
},
"tokenizer" : {
"haystack_edgengram_tokenizer" : {
"max_gram" : "15",
"min_gram" : "2",
"type" : "edgeNGram",
"side" : "front"
},
"haystack_ngram_tokenizer" : {
"type" : "nGram",
"min_gram" : "3",
"max_gram" : "15"
}
}
},
"number_of_replicas" : "1",
"number_of_shards" : "5",
"version" : {
"created" : "1040399"
}
}
}
}
}
我注意到的第一件事是:您配置了 synonym_analyzer 分析器,但并没有使用它!您需要把它设置为默认分析器,或者逐个字段地指定分析器(后者需要在您的自定义后端中做额外更改,并扩展字段类;这里有一个示例)。
我在理解文档从 Django 到 Elasticsearch 的实际处理方式时,也遇到过类似的挫折。您可以结合使用 Elasticsearch 的 HTTP API 和通过 Haystack 进行的一些额外自省。我在上面链接的 elasticstack 包中写了一个名为 show_mapping 的命令,它会显示用于创建映射的 JSON。这样您至少可以看到您的字段是否配置为使用您设置的分析器。
简短免责声明 - 我没有跟上 Haystack 的最新变化(2.0 或 2.1 之后),所以这些建议中的一些可能需要更新。
我在 Haystack 中使用自定义的 Elasticsearch 后端时,同义词过滤器出现了问题。
目前我要做的只是创建一个用于测试的同义词:我想为单词 'lipstick' 添加一个同义词 'tricklenutz'。
我正在使用以下自定义 haystack 后端:
from django.conf import settings
from haystack.backends.elasticsearch_backend import (ElasticsearchSearchBackend,
ElasticsearchSearchEngine)
class SiteElasticBackend(ElasticsearchSearchBackend):
    """Elasticsearch backend that swaps in custom index analysis settings.

    Once the stock backend has been initialised, ``DEFAULT_SETTINGS`` is
    overridden on the instance with a settings dict declaring a ``synonym``
    token filter plus three custom analyzers that reference it.
    """

    def __init__(self, connection_alias, **connection_options):
        """Initialise the parent backend, then install the custom settings.

        Args:
            connection_alias: Forwarded unchanged to the parent backend.
            **connection_options: Forwarded unchanged to the parent backend.
        """
        super(SiteElasticBackend, self).__init__(
            connection_alias, **connection_options)

        def custom_analyzer(*token_filters):
            # Every analyzer here shares the "lowercase" tokenizer and only
            # differs in its token-filter chain.
            return {
                "type": "custom",
                "tokenizer": "lowercase",
                "filter": list(token_filters),
            }

        # NOTE(review): nothing in this class attaches these analyzers to a
        # field; confirm where (if anywhere) the field mappings reference
        # "synonym_analyzer".
        analyzers = {
            "synonym_analyzer": custom_analyzer("synonym"),
            "ngram_analyzer": custom_analyzer("haystack_ngram", "synonym"),
            "edgengram_analyzer": custom_analyzer(
                "haystack_edgengram", "synonym"),
        }

        tokenizers = {
            "haystack_ngram_tokenizer": {
                "type": "nGram",
                "min_gram": 3,
                "max_gram": 15,
            },
            "haystack_edgengram_tokenizer": {
                "type": "edgeNGram",
                "min_gram": 2,
                "max_gram": 15,
                "side": "front",
            },
        }

        token_filters = {
            "synonym": {
                "type": "synonym",
                # Single test pair: the two words are declared equivalent.
                "synonyms": ["tricklenutz, lipstick"],
            },
            "haystack_ngram": {
                "type": "nGram",
                "min_gram": 3,
                "max_gram": 15,
            },
            "haystack_edgengram": {
                "type": "edgeNGram",
                "min_gram": 5,
                "max_gram": 15,
            },
        }

        # Set on the instance (after the parent __init__) so this backend's
        # settings replace the inherited DEFAULT_SETTINGS.
        self.DEFAULT_SETTINGS = {
            'settings': {
                "analysis": {
                    "analyzer": analyzers,
                    "tokenizer": tokenizers,
                    "filter": token_filters,
                },
            },
        }
class ConfigurableElasticSearchEngine(ElasticsearchSearchEngine):
    """Search engine that uses ``SiteElasticBackend`` as its backend."""

    backend = SiteElasticBackend
如您所见,我只是想为 'lipstick' 创建一个 'tricklenutz' 的同义词(一个不会出现在任何搜索中的词)。
我的 settings.py 文件中有以下条目:
# Haystack connection settings: the 'default' connection uses the custom
# engine class by dotted path and targets a local Elasticsearch URL.
HAYSTACK_CONNECTIONS = {
'default': {
# Dotted path to the custom engine class.
'ENGINE': 'search.backends.site_elasticsearch_backend.ConfigurableElasticSearchEngine',
'URL': 'http://127.0.0.1:9200/',
# Same index name that is queried later via the _settings endpoint.
'INDEX_NAME': 'sitename'
},
}
这是我为 Brand 编写的 search_index.py:
class BrandIndex(indexes.SearchIndex, indexes.Indexable):
    """Search index definition for the ``Brand`` model."""

    # Primary document field, rendered from a template.
    text = indexes.CharField(document=True, use_template=True)
    # Filled in prepare() with a copy of the rendered ``text`` value.
    ngram_text = indexes.EdgeNgramField()
    # The model's ``name`` attribute is indexed twice: n-gram and plain.
    name = indexes.NgramField(model_attr='name')
    brand_name = indexes.CharField(model_attr='name')
    created_date = indexes.DateTimeField(model_attr='created_date')

    def get_model(self):
        """Return the model class this index covers."""
        return Brand

    def prepare(self, obj):
        """Return the prepared data with ``text`` mirrored into ``ngram_text``."""
        document = super(BrandIndex, self).prepare(obj)
        rendered_text = document['text']
        document['ngram_text'] = rendered_text
        return document

    def index_queryset(self, using=None):
        """Return the brands whose ``created_date`` is not in the future."""
        cutoff = datetime.datetime.now()
        return Brand.objects.filter(created_date__lte=cutoff)
这里是搜索的视图部分:
class BrandListSearchResults(ListSearchResultsViewMixin, BrandListBase):
    """Paginated list view of brands matching ``self.search_term``."""

    template_name = 'search/brand/search.html'
    page_template = 'search/brand/page.html'
    paginate_by = 50
    paginate_by_first = 50

    def get_queryset(self):
        """Return the search queryset filtered on ``text`` by the search term."""
        search_queryset = self.get_sqs()
        return search_queryset.filter(text=self.search_term)

    def get_context_data(self, **kwargs):
        """Extend the parent context with meta, pagination and size data."""
        context = super(BrandListSearchResults, self).get_context_data(**kwargs)
        term = self.search_term
        context['meta'] = Meta(
            title='All brands matching the search term %s' % term,
            description='Brand search results for %s' % term
        )
        context.update({
            'paginate_by': self.paginate_by,
            'paginate_by_first': self.paginate_by_first,
            'size_list': ["90", "110", "185"],
        })
        return context
我重新运行我的索引,但同义词似乎不起作用。
有什么方法可以查询 Elasticsearch 以查看同义词是否确实存在? haystack 管理命令并没有很详细地说明它对自定义过滤器等的作用
更新
我已经能够直接从 elasticsearch 查询我的设置,我看到那里有同义词:
curl -XGET 'http://localhost:9200/sitename/_settings?pretty'
{
"sitename" : {
"settings" : {
"index" : {
"creation_date" : "1427470212556",
"uuid" : "6eznekoORQKqwswTq1G24w",
"analysis" : {
"analyzer" : {
"synonym_analyzer" : {
"type" : "custom",
"filter" : [ "synonym" ],
"tokenizer" : "lowercase"
},
"ngram_analyzer" : {
"type" : "custom",
"filter" : [ "haystack_ngram", "synonym" ],
"tokenizer" : "lowercase"
},
"edgengram_analyzer" : {
"type" : "custom",
"filter" : [ "haystack_edgengram", "synonym" ],
"tokenizer" : "lowercase"
}
},
"filter" : {
"haystack_ngram" : {
"type" : "nGram",
"min_gram" : "3",
"max_gram" : "15"
},
"haystack_edgengram" : {
"type" : "edgeNGram",
"min_gram" : "5",
"max_gram" : "15"
},
"synonym" : {
"type" : "synonym",
"synonyms" : [ "tricklenutz, lipstick" ]
}
},
"tokenizer" : {
"haystack_edgengram_tokenizer" : {
"max_gram" : "15",
"min_gram" : "2",
"type" : "edgeNGram",
"side" : "front"
},
"haystack_ngram_tokenizer" : {
"type" : "nGram",
"min_gram" : "3",
"max_gram" : "15"
}
}
},
"number_of_replicas" : "1",
"number_of_shards" : "5",
"version" : {
"created" : "1040399"
}
}
}
}
}
我注意到的第一件事是:您配置了 synonym_analyzer 分析器,但并没有使用它!您需要把它设置为默认分析器,或者逐个字段地指定分析器(后者需要在您的自定义后端中做额外更改,并扩展字段类;这里有一个示例)。
我在理解文档从 Django 到 Elasticsearch 的实际处理方式时,也遇到过类似的挫折。您可以结合使用 Elasticsearch 的 HTTP API 和通过 Haystack 进行的一些额外自省。我在上面链接的 elasticstack 包中写了一个名为 show_mapping 的命令,它会显示用于创建映射的 JSON。这样您至少可以看到您的字段是否配置为使用您设置的分析器。
简短免责声明 - 我没有跟上 Haystack 的最新变化(2.0 或 2.1 之后),所以这些建议中的一些可能需要更新。