使用 Elasticsearch 后端在 Django Haystack 中启用同义词

Enabling Synonyms in Django Haystack with Elasticsearch Backend

我在使用带有 Elasticsearch 自定义后端的 Haystack 时,无法让同义词过滤器正常工作。

此时我要做的只是创建一个用于测试的同义词:我想把 'tricklenutz' 设为 'lipstick' 的同义词。

我正在使用以下自定义 haystack 后端:

from django.conf import settings
from haystack.backends.elasticsearch_backend import (ElasticsearchSearchBackend,
    ElasticsearchSearchEngine)

class SiteElasticBackend(ElasticsearchSearchBackend):
    """Elasticsearch backend carrying custom analysis settings with a synonym filter.

    Once the parent initializer has run, ``DEFAULT_SETTINGS`` is set on the
    instance to an analysis configuration that declares three custom
    analyzers, two tokenizers and three token filters. This class only
    declares them; nothing here assigns an analyzer to a field.
    """

    def __init__(self, connection_alias, **connection_options):
        """Initialize the parent backend, then install the custom settings.

        Args:
            connection_alias: forwarded unchanged to the parent initializer.
            **connection_options: forwarded unchanged to the parent initializer.
        """
        super(SiteElasticBackend, self).__init__(
            connection_alias, **connection_options)

        def custom_analyzer(*filter_names):
            # All three analyzers share the "lowercase" tokenizer and differ
            # only in the chain of token filters applied afterwards.
            return {
                "type": "custom",
                "tokenizer": "lowercase",
                "filter": list(filter_names),
            }

        analyzers = {
            "synonym_analyzer": custom_analyzer("synonym"),
            "ngram_analyzer": custom_analyzer("haystack_ngram", "synonym"),
            "edgengram_analyzer": custom_analyzer(
                "haystack_edgengram", "synonym"),
        }

        tokenizers = {
            "haystack_ngram_tokenizer": {
                "type": "nGram",
                "min_gram": 3,
                "max_gram": 15,
            },
            "haystack_edgengram_tokenizer": {
                "type": "edgeNGram",
                "min_gram": 2,
                "max_gram": 15,
                "side": "front",
            },
        }

        token_filters = {
            # Single test synonym pair.
            "synonym": {
                "type": "synonym",
                "synonyms": ["tricklenutz, lipstick"],
            },
            "haystack_ngram": {
                "type": "nGram",
                "min_gram": 3,
                "max_gram": 15,
            },
            "haystack_edgengram": {
                "type": "edgeNGram",
                "min_gram": 5,
                "max_gram": 15,
            },
        }

        # Assigned on the instance, after the parent initializer has finished.
        self.DEFAULT_SETTINGS = {
            'settings': {
                "analysis": {
                    "analyzer": analyzers,
                    "tokenizer": tokenizers,
                    "filter": token_filters,
                },
            },
        }


class ConfigurableElasticSearchEngine(ElasticsearchSearchEngine):
    """Search engine that uses ``SiteElasticBackend`` as its backend class."""

    backend = SiteElasticBackend

如您所见,我只是想为 'lipstick' 创建一个 'tricklenutz' 的同义词(一个不会出现在任何搜索中的词)。

我的 settings.py 文件中有以下条目:

# Haystack connection settings: one 'default' connection using the custom
# Elasticsearch engine class.
HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': (
            'search.backends.site_elasticsearch_backend.'
            'ConfigurableElasticSearchEngine'
        ),
        'URL': 'http://127.0.0.1:9200/',
        'INDEX_NAME': 'sitename',
    },
}

这是我针对 Brand 的 search_index.py:

class BrandIndex(indexes.SearchIndex, indexes.Indexable):
    """Search index definition for the ``Brand`` model."""

    # Document field, rendered from a template.
    text = indexes.CharField(document=True, use_template=True)
    # Edge n-gram field; filled in prepare() with the prepared ``text`` value.
    ngram_text = indexes.EdgeNgramField()
    # The model's ``name`` attribute, indexed both as n-grams and as plain text.
    name = indexes.NgramField(model_attr='name')
    brand_name = indexes.CharField(model_attr='name')
    created_date = indexes.DateTimeField(model_attr='created_date')

    def get_model(self):
        """Return the model class this index covers."""
        return Brand

    def prepare(self, obj):
        """Return the parent's prepared data with ``text`` mirrored into ``ngram_text``."""
        document = super(BrandIndex, self).prepare(obj)
        document['ngram_text'] = document['text']
        return document

    def index_queryset(self, using=None):
        """Return brands whose ``created_date`` is not later than the current time."""
        cutoff = datetime.datetime.now()
        return Brand.objects.filter(created_date__lte=cutoff)

这里是搜索的视图部分:

class BrandListSearchResults(ListSearchResultsViewMixin, BrandListBase):
    """Brand search-results list view."""

    template_name = 'search/brand/search.html'
    page_template = 'search/brand/page.html'
    paginate_by = 50
    paginate_by_first = 50

    def get_queryset(self):
        """Return the search queryset filtered on ``text`` by the search term."""
        search_queryset = self.get_sqs()
        return search_queryset.filter(text=self.search_term)

    def get_context_data(self, **kwargs):
        """Extend the parent context with page meta, pagination sizes and a size list."""
        context = super(BrandListSearchResults, self).get_context_data(**kwargs)
        page_title = 'All brands matching the search term %s' % self.search_term
        page_description = 'Brand search results for %s' % self.search_term
        context.update({
            'meta': Meta(title=page_title, description=page_description),
            'paginate_by': self.paginate_by,
            'paginate_by_first': self.paginate_by_first,
            'size_list': ["90", "110", "185"],
        })
        return context

我重新运行我的索引,但同义词似乎不起作用。

有什么方法可以查询 Elasticsearch 以查看同义词是否确实存在? haystack 管理命令并没有很详细地说明它对自定义过滤器等的作用

更新

我已经能够直接从 elasticsearch 查询我的设置,我看到那里有同义词:

curl -XGET 'http://localhost:9200/sitename/_settings?pretty'
{
  "sitename" : {
    "settings" : {
      "index" : {
        "creation_date" : "1427470212556",
        "uuid" : "6eznekoORQKqwswTq1G24w",
        "analysis" : {
          "analyzer" : {
            "synonym_analyzer" : {
              "type" : "custom",
              "filter" : [ "synonym" ],
              "tokenizer" : "lowercase"
            },
            "ngram_analyzer" : {
              "type" : "custom",
              "filter" : [ "haystack_ngram", "synonym" ],
              "tokenizer" : "lowercase"
            },
            "edgengram_analyzer" : {
              "type" : "custom",
              "filter" : [ "haystack_edgengram", "synonym" ],
              "tokenizer" : "lowercase"
            }
          },
          "filter" : {
            "haystack_ngram" : {
              "type" : "nGram",
              "min_gram" : "3",
              "max_gram" : "15"
            },
            "haystack_edgengram" : {
              "type" : "edgeNGram",
              "min_gram" : "5",
              "max_gram" : "15"
            },
            "synonym" : {
              "type" : "synonym",
              "synonyms" : [ "tricklenutz, lipstick" ]
            }
          },
          "tokenizer" : {
            "haystack_edgengram_tokenizer" : {
              "max_gram" : "15",
              "min_gram" : "2",
              "type" : "edgeNGram",
              "side" : "front"
            },
            "haystack_ngram_tokenizer" : {
              "type" : "nGram",
              "min_gram" : "3",
              "max_gram" : "15"
            }
          }
        },
        "number_of_replicas" : "1",
        "number_of_shards" : "5",
        "version" : {
          "created" : "1040399"
        }
      }
    }
  }
}

我注意到的第一件事是:您配置了 synonym_analyzer 分析器,但并没有使用它!您需要把它设置为默认分析器,或者逐个字段地指定分析器(后者需要在您的自定义后端和扩展的字段类中进行额外更改;这里有一个示例)。

我在理解文档从 Django 到 ElasticSearch 的实际处理方式时遇到过类似的挫败感。您可以结合使用 ElasticSearch 的 HTTP API 和通过 Haystack 进行的一些额外自省。我在链接的 elasticstack 包中写了一个名为 show_mapping 的命令,它显示了用于创建映射的 JSON。这样您至少可以看到您的字段是否配置为使用您设置的分析器。

简短免责声明 - 我没有跟上 Haystack 的最新变化(2.0 或 2.1 之后),所以这些建议中的一些可能需要更新。