Optimize query in ArangoDB

I am running the following query in ArangoDB:

LET catalogDatasets = []
LET openDatasets = ( FOR d IN datasets FILTER d.visibility == "open"  RETURN d._id )
LET myDatasets = []
LET myPurchasedDatasets = []
LET searchTarget = UNIQUE( UNION( catalogDatasets, openDatasets, myDatasets, myPurchasedDatasets ) )

LET unorderedDatasetsIds = (
    FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
     FILTER dataset._id IN searchTarget  RETURN dataset._id
)

LET ordered = (
    FOR wl IN wordLinks
    FILTER wl._from IN unorderedDatasetsIds
    FOR x IN words
        FILTER x._id == wl._to
        COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
        SORT score
        LIMIT 0, 20
       RETURN did
)
RETURN {
    dids: ordered,
    number_of_items: LENGTH(unorderedDatasetsIds)
}

My search words all use prefixes, like this:

prefix:banana,|prefix:chocollate

Basically, I want to optimize this query because it takes about 2 seconds to return. One idea I had was to limit the fulltext search to 1000 items, but if I do that, the returned datasets are effectively random, because they depend on the order in which ArangoDB returns the fulltext matches.
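
For reference, this is roughly what that limiting idea would look like (just a sketch; the optional fourth argument of FULLTEXT caps the number of matches the fulltext index returns, here the 1000 mentioned above):

LET unorderedDatasetsIds = (
    FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords, 1000)
     FILTER dataset._id IN searchTarget  RETURN dataset._id
)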

What kind of optimizations can I apply to this query to make it faster?

P.S.: I have a union of empty arrays here, but sometimes they are not empty. They just happen to be empty in this particular query.

EDIT: here is the explain output of my query:

Query string:
 LET catalogDatasets = []
 LET openDatasets = ( FOR d IN datasets FILTER d.visibility == "open"  RETURN d._id )
 LET myDatasets = []
 LET myPurchasedDatasets = []
 LET searchTarget = UNIQUE( UNION( catalogDatasets, openDatasets, myDatasets, myPurchasedDatasets ) )
 LET unorderedDatasetsIds = (
     FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
      FILTER dataset._id IN searchTarget  RETURN dataset._id
 )
 LET ordered = (
     FOR wl IN wordLinks
     FILTER wl._from IN unorderedDatasetsIds
     FOR x IN words
         FILTER x._id == wl._to
         COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
         SORT score
         LIMIT 0, 20
        RETURN did
 )
 RETURN {
     dids: ordered,
     number_of_items: LENGTH(unorderedDatasetsIds)
 }

Execution plan:
 Id   NodeType                  Est.   Comment
  1   SingletonNode                1   * ROOT
  9   SubqueryNode                 1     - LET openDatasets = ...   /* const subquery */
  3   SingletonNode                1       * ROOT
  4   EnumerateCollectionNode   9752         - FOR d IN datasets   /* full collection scan */
  5   CalculationNode           9752           - LET #19 = (d.`visibility` == "open")   /* simple expression */   /* collections used: d : datasets */
  6   FilterNode                9752           - FILTER #19
  7   CalculationNode           9752           - LET #21 = d.`_id`   /* attribute expression */   /* collections used: d : datasets */
  8   ReturnNode                9752           - RETURN #21
 41   CalculationNode              1     - LET #39 = SORTED_UNIQUE(UNIQUE(UNION([ ], openDatasets, [ ], [ ])))   /* simple expression */
 20   SubqueryNode                 1     - LET unorderedDatasetsIds = ...   /* subquery */
 13   SingletonNode                1       * ROOT
 38   IndexNode                 9752         - FOR dataset IN datasets   /* fulltext index scan */
 16   CalculationNode           9752           - LET #25 = (dataset.`_id` in /* sorted */ #39)   /* simple expression */   /* collections used: dataset : datasets */
 17   FilterNode                9752           - FILTER #25
 18   CalculationNode           9752           - LET #27 = dataset.`_id`   /* attribute expression */   /* collections used: dataset : datasets */
 19   ReturnNode                9752           - RETURN #27
 34   SubqueryNode                 1     - LET ordered = ...   /* subquery */
 21   SingletonNode                1       * ROOT
 40   IndexNode                  410         - FOR wl IN wordLinks   /* edge index scan */
 28   CalculationNode            410           - LET #33 = wl.`_from`   /* attribute expression */   /* collections used: wl : wordLinks */
 39   IndexNode                  410           - FOR x IN words   /* primary index scan */
 37   SortNode                   410             - SORT #33 ASC
 29   CalculationNode            410             - LET #35 = (wl.`invFq` / (x.`numEdges` + 0.1))   /* simple expression */   /* collections used: wl : wordLinks, x : words */
 30   CollectNode                328             - COLLECT did = #33 INTO score = #35   /* sorted */
 31   SortNode                   328             - SORT score ASC
 32   LimitNode                   20             - LIMIT 0, 20
 33   ReturnNode                  20             - RETURN did
 35   CalculationNode              1     - LET #37 = { "dids" : ordered, "number_of_items" : LENGTH(unorderedDatasetsIds) }   /* simple expression */
 36   ReturnNode                   1     - RETURN #37

Indexes used:
 By   Type       Collection   Unique   Sparse   Selectivity   Fields               Ranges
 38   fulltext   datasets     false    true             n/a   [ `word_list` ]      FULLTEXT(datasets   /* all collection documents */, "word_list", "'prefix:トウ,|prefix:とう'")
 40   edge       wordLinks    false    false         3.05 %   [ `_from`, `_to` ]   (wl.`_from` in unorderedDatasetsIds)
 39   primary    words        true     false       100.00 %   [ `_key` ]           (x.`_id` == wl.`_to`)

Optimization rules applied:
 Id   RuleName
  1   move-calculations-up
  2   move-filters-up
  3   remove-redundant-calculations
  4   remove-unnecessary-calculations
  5   move-calculations-up-2
  6   move-filters-up-2
  7   fulltext-index-optimizer
  8   use-indexes
  9   remove-filter-covered-by-index
 10   sort-in-values
 11   remove-unnecessary-calculations-2
 12   move-calculations-down

OK. Kind of a tough one. The query is quite expensive. But I tried something:

LET catalogDatasets = []
LET myDatasets = []
LET myPurchasedDatasets = []
LET searchTarget = UNIQUE( UNION( catalogDatasets, myDatasets, myPurchasedDatasets ) )
LET unorderedDatasetsIds = (
  FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
    FILTER dataset._id IN searchTarget || dataset.visibility == "open" RETURN dataset._id
 )
 LET ordered = (
   FOR wl IN wordLinks
     FILTER wl._from IN unorderedDatasetsIds
     FOR x IN words
       FILTER x._id == wl._to
       COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
       SORT score
       LIMIT 0, 20
       RETURN did
 )
 RETURN {
     dids: ordered,
     number_of_items: LENGTH(unorderedDatasetsIds)
}

I can't see anything else obvious here. But making the openDatasets subquery go away should clearly matter a lot, provided there aren't too many "open" datasets.
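
One further minor variant (an untested sketch on top of the rewrite above): since the other three lists happen to be empty for this call, the membership check can be short-circuited so the IN lookup against searchTarget is only evaluated when the array actually has entries. AQL evaluates && and || with short-circuit semantics, so this is safe:

LET unorderedDatasetsIds = (
  FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
    FILTER dataset.visibility == "open"
        || ( LENGTH(searchTarget) > 0 && dataset._id IN searchTarget )
    RETURN dataset._id
)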