ES 建议,搜索索引项中的所有单词(不仅是第一个单词)

ES suggest, search all words in index item (not only the first word)

基于这个答案(第一个选项),我创建了这个索引:

    'settings' => array(
        'analysis' => array(
            'analyzer' => array(
                'stop_analyzer' => array( 
                    'type' => 'custom',
                    'tokenizer' => 'standard',
                    'filter' => array(
                        'lowercase',
                        'english_stop'
                    )
                )
            ),
            "filter" => array(
                "english_stop" => array(
                    "type" => "stop",
                    "stopwords" => "_english_"
                )
            )
        )
    ),
    'mappings' => array(
        'properties' => array(
            'texts' => array(
                'type' => 'completion',
                "analyzer" => "stop_analyzer",
                "search_analyzer" => "stop_analyzer", 
                'preserve_position_increments' => false
            ),
        ),
    )

无论建议搜索的输入是否以停用词开头,这种方式都运行良好。但是,例如,当我的索引中有 "This is the text",而我搜索 "text" 时,却得不到任何结果。那么正确的做法是什么?我不想使用 n-gram。

我的搜索查询:

'suggest' => array(
    'suggestion' => array(
        'prefix'=> 'text',
        'completion' => array(
            'field' => 'texts'
        )
    )
)

The best way to make the completion suggester match in the middle of a field is to use an n-gram filter.

但由于您不想使用 n-gram,您可以尝试以下方法:

您可以在同一个请求中使用多个建议器(suggester):一个基于前缀匹配;而对于字段中间的匹配,则使用正则表达式。

添加具有索引映射、数据、搜索查询和搜索结果的工作示例

索引映射:

{
  "settings": {
    "analysis": {
      "filter": {
        "my_custom_stop_words_filter": {
          "type": "stop",
          "ignore_case": true,
          "stopwords": [ "and", "is", "the" ]
        }
      },
      "analyzer": {
        "autocomplete": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "my_custom_stop_words_filter"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "keyword"
      },
      "suggest": {
        "type": "completion",
        "analyzer": "autocomplete",
        "search_analyzer": "standard"
      }
    }
  }
}

索引数据:

{
  "suggest": [
    {
      "input": "This is the text"
    }
  ]
}
{
  "suggest": [
    {
      "input": "Software Manager"
    }
  ]
}

搜索查询:

{
    "suggest": {
        "suggest-exact": {
            "prefix": "text",
            "completion": {
                "field": "suggest",
                "skip_duplicates": true
            }
        },
        "suggest-regex": {
            "regex": ".*text.*",
            "completion": {
                "field": "suggest",
                "skip_duplicates": true
            }
        }
    }
}

搜索结果:

"suggest": {
    "suggest-exact": [
      {
        "text": "text",
        "offset": 0,
        "length": 4,
        "options": []
      }
    ],
    "suggest-regex": [
      {
        "text": ".*text.*",
        "offset": 0,
        "length": 8,
        "options": [
          {
            "text": "This is the text",
            "_index": "test",
            "_type": "_doc",
            "_id": "1",
            "_score": 1.0,
            "_source": {
              "suggest": [
                {
                  "input": "This is the text"
                }
              ]
            }
          }
        ]
      }
    ]
  }

Based on the user's comment, I am adding another answer that searches all the words using n-grams. The previous method works, but regex matching is quite expensive.

添加具有索引映射、索引数据、搜索查询和搜索结果的工作示例

索引映射:

{
  "settings": {
    "analysis": {
      "filter": {
        "my_custom_stop_words_filter": {
          "type": "stop",
          "ignore_case": true,
          "stopwords": [
            "and",
            "is",
            "the"
          ]
        },
        "ngram_filter": {
          "type": "ngram",
          "min_gram": 4,
          "max_gram": 20
        }
      },
      "analyzer": {
        "ngram_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "ngram_filter",
            "my_custom_stop_words_filter"
          ]
        }
      }
    },
    "max_ngram_diff": 50
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "analyzer": "ngram_analyzer",
        "search_analyzer": "standard"
      }
    }
  }
}

分析 API(Analyze API):

POST /stof_29753971/_analyze
{
  "analyzer" : "ngram_analyzer",
  "text" : "This is the text"
}

生成了以下词元(token):

{
    "tokens": [
        {
            "token": "this",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "text",
            "start_offset": 12,
            "end_offset": 16,
            "type": "<ALPHANUM>",
            "position": 3
        }
    ]
}

索引数据:

{
  "title": [
    "This is the text"
  ]
}

搜索查询:

{
    "query": {
        "match": {
           "title": "text"
        }
    }
}

搜索结果:

"hits": [
            {
                "_index": "stof_29753971",
                "_type": "_doc",
                "_id": "1",
                "_score": 0.41978103,
                "_source": {
                    "title": [
                        "This is the text"
                    ]
                }
            }
        ]