在 Painless 中循环遍历 ElasticSearch 文档源数组

Loop through ElasticSearch documents source array in painless

我的网上商店产品有以下 ElasticSearch 数据结构:

{
  "_index": "vue_storefront_catalog_1_product_1617378559",
  "_type": "_doc",
  "_source": {
    "configurable_children": [
      {
        "price": 49.99,
        "special_price": 34.99,
        "special_from_date": "2020-11-27 00:00:00",
        "special_to_date": "2020-11-30 23:59:59",
        "stock": {
          "qty": 0,
          "is_in_stock": false,
          "stock_status": 0
        }
      },
      {
        "price": 49.99,
        "special_price": null,
        "special_from_date": null,
        "special_to_date": null,
        "stock": {
          "qty": 0,
          "is_in_stock": false,
          "stock_status": 0
        }
      }
    ]
  }
}

使用以下映射:

{
  "vue_storefront_catalog_1_product_1614928276" : {
    "mappings" : {
      "properties" : {
        "configurable_children" : {
          "properties" : {
            "price" : {
              "type" : "double"
            },
            "special_from_date" : {
              "type" : "date",
              "format" : "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
            },
            "special_price" : {
              "type" : "double"
            },
            "special_to_date" : {
              "type" : "date",
              "format" : "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
            }
          }
        }
      }
    }
  }
}

我创建了一个 Elasticsearch 查询,用于只筛选出正在促销的产品,这意味着:special_price 必须低于 price,并且当前日期必须介于 special_from_date 和 special_to_date 之间。

这是我创建的 Painless 脚本:

  // Filter script: returns true when the product is considered to be on sale.
  // Starts from "not on sale"; only the last branch below can flip the flag.
  boolean hasSale = false;

  // Current time in epoch milliseconds.
  long timestampNow = new Date().getTime();
  // NOTE(review): the checks below are chained with else-if, so only the first
  // branch whose condition holds is evaluated and all later checks are skipped.
  // Each doc[...] lookup also reads a single .value for the field, not one
  // value per entry of configurable_children.
  if (doc.containsKey('configurable_children.special_from_date') && !doc['configurable_children.special_from_date'].empty) {
    long timestampSpecialFromDate = doc['configurable_children.special_from_date'].value.toInstant().toEpochMilli();
    // The special price starts in the future.
    if (timestampSpecialFromDate > timestampNow) {
      hasSale = false;
    }
  } else if (doc.containsKey('configurable_children.special_to_date') && !doc['configurable_children.special_to_date'].empty) {
    long timestampSpecialToDate = doc['configurable_children.special_to_date'].value.toInstant().toEpochMilli();
    // The special price has already ended.
    if (timestampSpecialToDate < timestampNow) {
      hasSale = false;
    }
  } else if (doc.containsKey('configurable_children.stock.is_in_stock') && doc['configurable_children.stock.is_in_stock'].value == false) {
      // Out of stock.
      hasSale = false;
  } else if (1 - (doc['configurable_children.special_price'].value / doc['configurable_children.price'].value) > params.fraction) {
    // Relative discount (1 - special_price / price) exceeds params.fraction.
    hasSale = true;
  }

  return hasSale

只要 configurable_children 中有一个符合促销条件,这个脚本就会返回该产品。这是不正确的,因为我需要遍历整个 configurable_children 集合,才能确定它是否是促销产品。如何确保所有 children 都被计算在内?需要用循环吗?


这是乔在回答中建议的新查询:

GET vue_storefront_catalog_1_product/_search
{
  "query": {
    "function_score": {
      "query": {
        "match_all": {}
      },
      "functions": [
        {
          "script_score": {
            "script": {
              "source": """
                // Helper: returns 1 when every entry of the list is true, otherwise 0.
                int allEntriesAreTrue(def arrayList) {
                  return arrayList.stream().allMatch(Boolean::valueOf) == true ? 1 : 0
                } 
                
                // One boolean per child: true when the child passed every check below.
                ArrayList childrenAreMatching = [];
                
                // Reference time in epoch milliseconds, supplied through params.
                long timestampNow = params.timestampNow;
                
                // The children are read from the original document source.
                ArrayList children = params._source['configurable_children'];
                
                // No children: the helper is evaluated on the still-empty list.
                if (children == null || children.size() == 0) {
                  return allEntriesAreTrue(childrenAreMatching);
                }
                
                for (config in children) {
                  // A child without a stock object cannot match.
                  if (!config.containsKey('stock')) {
                    childrenAreMatching.add(false);
                    continue;
                  } else if (!config['stock']['is_in_stock']
                      || config['special_price'] == null
                      || config['special_from_date'] == null 
                      || config['special_to_date'] == null) {
                    // Out of stock, or no special price / date window defined.
                    childrenAreMatching.add(false);
                    continue;
                  } 
                  
                  // NOTE(review): both dates were already checked for null above,
                  // so this condition always holds when it is reached.
                  if (config['special_from_date'] != null && config['special_to_date'] != null) {
                    // A new formatter is created on every loop iteration.
                    SimpleDateFormat sf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                    def from_millis = sf.parse(config['special_from_date']).getTime();
                    def to_millis = sf.parse(config['special_to_date']).getTime();
                    
                    // The reference time lies outside the special-price window.
                    if (!(timestampNow >= from_millis && timestampNow <= to_millis)) {
                      childrenAreMatching.add(false);
                      continue;
                    }
                  }
                  
                  // Relative discount: 1 - special_price / price.
                  def sale_fraction = 1 - (config['special_price'] / config['price']);
                  // The discount must be strictly greater than params.fraction.
                  if (sale_fraction <= params.fraction) {
                    childrenAreMatching.add(false);
                    continue;
                  }
                  
                  childrenAreMatching.add(true);
                }
                // The score is 1 only when every child matched; min_score then filters on it.
                return allEntriesAreTrue(childrenAreMatching);
              """,
              "params": {
                "timestampNow": 1617393889567,
                "fraction": 0.1
              }
            }
          }
        }
      ],
      "min_score": 1
    }
  }
}

回复如下:

{
  "took" : 15155,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2936,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [... hits here ...]
  }
}

知道为什么查询需要大约 15 秒吗?

你的直觉是正确的——如果你想检查数组列表中的所有 objects,就需要使用 for 循环。

现在,在进入迭代部分之前,关于 Elasticsearch 中的数组,有一件重要的事情需要了解:当它们没有被定义为 nested 时,其内容会被扁平化(flattened),各个 key/value 对之间的关系将会丢失。因此,您绝对应该像这样调整您的映射:

{
  "vue_storefront_catalog_1_product_1614928276" : {
    "mappings" : {
      "properties" : {
        "configurable_children" : {
          "type": "nested",        <---
          "properties" : {
            "price" : {
              "type" : "double"
            },
            ...
          }
        }
      }
    }
  }
}

并重新索引您的数据以确保 configurable_children 被视为单独的独立实体。

一旦它们被映射为 nested,您就可以只检索那些确实符合脚本条件的 children:

POST vue_storefront_catalog_1_product_1614928276/_search
{
  "_source": "configurable_children_that_match", 
  "query": {
    "nested": {
      "path": "configurable_children",
      "inner_hits": {
        "name": "configurable_children_that_match"
      }, 
      "query": {
        "bool": {
          "must": [
            {
              "script": {
                "script": {
                  "source": """
                    // Evaluated in the context of a single nested child;
                    // returns true when that child is on sale.
                    long timestampNow = new Date().getTime();
                    
                    // Not on sale when the special price starts in the future.
                    if (doc.containsKey('configurable_children.special_from_date') && !doc['configurable_children.special_from_date'].empty) {
                      long timestampSpecialFromDate = doc['configurable_children.special_from_date'].value.toInstant().toEpochMilli();
                      if (timestampSpecialFromDate > timestampNow) {
                        return false;
                      }
                    }
                    
                    // Not on sale when the special price has already ended.
                    if (doc.containsKey('configurable_children.special_to_date') && !doc['configurable_children.special_to_date'].empty) {
                      long timestampSpecialToDate = doc['configurable_children.special_to_date'].value.toInstant().toEpochMilli();
                      if (timestampSpecialToDate < timestampNow) {
                        return false;
                      }
                    }
                    
                    // Not on sale when the child is out of stock. The emptiness
                    // check keeps .value from failing on a child without the flag.
                    if (doc.containsKey('configurable_children.stock.is_in_stock')
                        && !doc['configurable_children.stock.is_in_stock'].empty
                        && doc['configurable_children.stock.is_in_stock'].value == false) {
                      return false;
                    }
                    
                    // Without both prices no discount can be computed. Reading
                    // .value from a field that has no value raises an error
                    // instead of simply not matching, so bail out first.
                    if (!doc.containsKey('configurable_children.special_price')
                        || doc['configurable_children.special_price'].empty
                        || !doc.containsKey('configurable_children.price')
                        || doc['configurable_children.price'].empty) {
                      return false;
                    }
                    
                    // On sale when the relative discount exceeds params.fraction.
                    return 1 - (doc['configurable_children.special_price'].value / doc['configurable_children.price'].value) > params.fraction;
                  """,
                  "params": {
                    "fraction": 0.1
                  }
                }
              }
            }
          ]
        }
      }
    }
  }
}

这里要注意两点:

  1. 嵌套查询的 inner_hits 属性可以让 Elasticsearch 知道您只对真正匹配的 children 感兴趣,否则将返回所有的 configurable_children。在 _source 参数中指定该名称时,将跳过原始的完整 JSON 文档源,仅返回命名的 inner_hits。
  2. 由于 ES 的分布式特性,不推荐使用 Java 的 new Date()。我已经在 "How to get current time as unix timestamp for script use" 这个问题的回答中解释了背后的原因。您会看到,我在本回答底部的查询中使用了参数化的 now。

接下来,需要重点说明的是:嵌套 objects 在内部表示为单独的子文档。

这一事实的一个副作用是,一旦您处于 nested 查询的上下文中,您将无法访问同一文档的其他嵌套 children。

为了缓解这种情况,通常的做法是让嵌套 children 定期保持同步:当您把 objects 的某个属性展平到 top-level 使用时,只需简单地迭代相应的文档值(doc values)即可。这种扁平化通常通过 copy_to 功能完成,我在另一个回答中对此作了说明。

在您的特定用例中,这意味着您可以例如在字段 stock.is_in_stock 上使用 copy_to,这样会得到一个 top-level 的布尔数组列表,它比 objects 的数组列表更容易处理。

到目前为止一切顺利,但您仍然缺少一种基于 special_dates 进行过滤的方法。

现在,无论您处理的是 nested 还是常规 object 字段类型,自 v6.4 起,在常规脚本查询中访问 params._source 在 ES 中都已不再可用。

但是,仍然有一种查询类型支持迭代 _source — 输入 function_score 查询。

如您的问题所述,您

..need to loop through the whole set of configurable_children to determine if it's a sale product..

话虽如此,下面是我的查询的工作原理:

  1. function_score 查询通常用于生成自定义计算的分数,但在 min_score 的帮助下,它可以用作布尔 yes/no 过滤器,以排除不满足某个条件的文档。
  2. 在迭代 configurable_children 时,每次循环都会向 childrenAreMatching 追加一个布尔值,然后将该列表传递给 allEntriesAreTrue 辅助函数:如果所有条目都为 true,则返回 1,否则返回 0。
  3. 解析日期并与参数化的 now 进行比较,fraction 也会进行比较。如果在任何时候某个条件不满足,则循环跳到下一次迭代。
POST vue_storefront_catalog_1_product_1614928276/_search
{
  "query": {
    "function_score": {
      "query": {
        "match_all": {}
      },
      "functions": [
        {
          "script_score": {
            "script": {
              "source": """
                // Scores a product 1 when every entry of configurable_children is in
                // stock, inside its special-price date window and discounted by more
                // than params.fraction; scores it 0 otherwise. Combined with
                // min_score this acts as a yes/no filter.

                // casting helper: 1 if every entry of the list is true, else 0
                int allEntriesAreTrue(def arrayList) {
                  return arrayList.stream().allMatch(Boolean::valueOf) == true ? 1 : 0
                } 
                
                // one boolean per child
                ArrayList childrenAreMatching = [];
                
                // reference time in epoch milliseconds, supplied by the caller
                long timestampNow = params.timestampNow;
                
                ArrayList children = params._source['configurable_children'];
                
                // NOTE(review): allMatch on an empty list is true, so a product
                // without children scores 1 and passes min_score -- confirm that
                // this is the intended outcome.
                if (children == null || children.size() == 0) {
                  return allEntriesAreTrue(childrenAreMatching);
                }
                
                // the date format never changes, so build the parser once
                // instead of once per child
                SimpleDateFormat sf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                
                for (config in children) {
                  def stock = config['stock'];
                  
                  // a missing stock object, stock flag or price would otherwise
                  // fail the whole request with a null dereference
                  if (stock == null
                      || stock['is_in_stock'] == null
                      || !stock['is_in_stock']
                      || config['price'] == null
                      || config['special_price'] == null
                      || config['special_from_date'] == null 
                      || config['special_to_date'] == null) {
                    // nothing to do here...
                    childrenAreMatching.add(false);
                    continue;
                  } 
                  
                  def from_millis = sf.parse(config['special_from_date']).getTime();
                  def to_millis = sf.parse(config['special_to_date']).getTime();
                  
                  if (!(timestampNow >= from_millis && timestampNow <= to_millis)) {
                    // not in date range
                    childrenAreMatching.add(false);
                    continue;
                  }
                  
                  // _source numbers without a decimal part arrive as integers;
                  // convert explicitly so the division below is never truncated
                  double price = ((Number) config['price']).doubleValue();
                  double special_price = ((Number) config['special_price']).doubleValue();
                  
                  if (price <= 0) {
                    // no meaningful discount can be derived from this price
                    childrenAreMatching.add(false);
                    continue;
                  }
                  
                  double sale_fraction = 1 - (special_price / price);
                  if (sale_fraction <= params.fraction) {
                    // fraction condition not met
                    childrenAreMatching.add(false);
                    continue;
                  }
                  
                  childrenAreMatching.add(true);
                }
                
                // need to return a number because it's a script score query
                return allEntriesAreTrue(childrenAreMatching);
              """,
              "params": {
                "timestampNow": 1617393889567,
                "fraction": 0.1
              }
            }
          }
        }
      ],
      "min_score": 1
    }
  }
}

总之,只有所有configurable_children满足指定条件的文档才会被返回。


P.S. 如果你从这个答案中学到了一些东西并想了解更多,我在我的 Elasticsearch Handbook 中专门用了一整章来讲解 ES 脚本。