在 arangodb 中优化查询
Optimize Query in arangodb
我正在 ArangoDB 中运行以下查询:
// Candidate dataset id lists; catalog/my/purchased happen to be empty in
// this particular query but are not always empty (see P.S. below).
LET catalogDatasets = []
// Collects the ids of all "open" datasets. Per the execution plan below,
// this is a full collection scan (EnumerateCollectionNode) — there is no
// index on `visibility`.
LET openDatasets = ( FOR d IN datasets FILTER d.visibility == "open" RETURN d._id )
LET myDatasets = []
LET myPurchasedDatasets = []
// Deduplicated union of all dataset ids the caller is allowed to search.
LET searchTarget = UNIQUE( UNION( catalogDatasets, openDatasets, myDatasets, myPurchasedDatasets ) )
// Fulltext prefix search over `word_list`, restricted to permitted datasets.
// @searchWords is a prefix expression like "prefix:...,|prefix:..." (see the
// Ranges section of the plan).
LET unorderedDatasetsIds = (
FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
FILTER dataset._id IN searchTarget RETURN dataset._id
)
// Rank the matched datasets via their word links: per-edge weight is
// wl.invFq / (x.numEdges + @epsilon), grouped by dataset id.
// NOTE(review): `COLLECT ... INTO score = <expr>` makes `score` an ARRAY of
// per-edge values for each group, so `SORT score` compares arrays, not a
// summed number — confirm this ordering is intended.
LET ordered = (
FOR wl IN wordLinks
FILTER wl._from IN unorderedDatasetsIds
FOR x IN words
FILTER x._id == wl._to
COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
SORT score
LIMIT 0, 20
RETURN did
)
// Top-20 ranked dataset ids plus the total number of matches (for paging).
RETURN {
dids: ordered,
number_of_items: LENGTH(unorderedDatasetsIds)
}
我的搜索词都使用如下前缀:
pref:banana,|pref:chocollate
基本上我想优化这个查询,因为它大约需要 2 秒才能返回结果。我的一个想法是把全文搜索的结果数量限制为 1000,但这样一来返回的数据集就会是随机的,因为结果取决于 ArangoDB 返回它们的顺序。
我可以对此查询应用什么样的优化以使其更快?
P.S.: 联合(UNION)中的一些数据集列表是空的,但它们并不总是空的,只是在这个查询的情况下恰好为空。
编辑
我的查询解释:
Query string:
LET catalogDatasets = []
LET openDatasets = ( FOR d IN datasets FILTER d.visibility == "open" RETURN d._id )
LET myDatasets = []
LET myPurchasedDatasets = []
LET searchTarget = UNIQUE( UNION( catalogDatasets, openDatasets, myDatasets, myPurchasedDatasets ) )
LET unorderedDatasetsIds = (
FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
FILTER dataset._id IN searchTarget RETURN dataset._id
)
LET ordered = (
FOR wl IN wordLinks
FILTER wl._from IN unorderedDatasetsIds
FOR x IN words
FILTER x._id == wl._to
COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
SORT score
LIMIT 0, 20
RETURN did
)
RETURN {
dids: ordered,
number_of_items: LENGTH(unorderedDatasetsIds)
}
Execution plan:
Id NodeType Est. Comment
1 SingletonNode 1 * ROOT
9 SubqueryNode 1 - LET openDatasets = ... /* const subquery */
3 SingletonNode 1 * ROOT
4 EnumerateCollectionNode 9752 - FOR d IN datasets /* full collection scan */
5 CalculationNode 9752 - LET #19 = (d.`visibility` == "open") /* simple expression */ /* collections used: d : datasets */
6 FilterNode 9752 - FILTER #19
7 CalculationNode 9752 - LET #21 = d.`_id` /* attribute expression */ /* collections used: d : datasets */
8 ReturnNode 9752 - RETURN #21
41 CalculationNode 1 - LET #39 = SORTED_UNIQUE(UNIQUE(UNION([ ], openDatasets, [ ], [ ]))) /* simple expression */
20 SubqueryNode 1 - LET unorderedDatasetsIds = ... /* subquery */
13 SingletonNode 1 * ROOT
38 IndexNode 9752 - FOR dataset IN datasets /* fulltext index scan */
16 CalculationNode 9752 - LET #25 = (dataset.`_id` in /* sorted */ #39) /* simple expression */ /* collections used: dataset : datasets */
17 FilterNode 9752 - FILTER #25
18 CalculationNode 9752 - LET #27 = dataset.`_id` /* attribute expression */ /* collections used: dataset : datasets */
19 ReturnNode 9752 - RETURN #27
34 SubqueryNode 1 - LET ordered = ... /* subquery */
21 SingletonNode 1 * ROOT
40 IndexNode 410 - FOR wl IN wordLinks /* edge index scan */
28 CalculationNode 410 - LET #33 = wl.`_from` /* attribute expression */ /* collections used: wl : wordLinks */
39 IndexNode 410 - FOR x IN words /* primary index scan */
37 SortNode 410 - SORT #33 ASC
29 CalculationNode 410 - LET #35 = (wl.`invFq` / (x.`numEdges` + 0.1)) /* simple expression */ /* collections used: wl : wordLinks, x : words */
30 CollectNode 328 - COLLECT did = #33 INTO score = #35 /* sorted */
31 SortNode 328 - SORT score ASC
32 LimitNode 20 - LIMIT 0, 20
33 ReturnNode 20 - RETURN did
35 CalculationNode 1 - LET #37 = { "dids" : ordered, "number_of_items" : LENGTH(unorderedDatasetsIds) } /* simple expression */
36 ReturnNode 1 - RETURN #37
Indexes used:
By Type Collection Unique Sparse Selectivity Fields Ranges
38 fulltext datasets false true n/a [ `word_list` ] FULLTEXT(datasets /* all collection documents */, "word_list", "'prefix:トウ,|prefix:とう'")
40 edge wordLinks false false 3.05 % [ `_from`, `_to` ] (wl.`_from` in unorderedDatasetsIds)
39 primary words true false 100.00 % [ `_key` ] (x.`_id` == wl.`_to`)
Optimization rules applied:
Id RuleName
1 move-calculations-up
2 move-filters-up
3 remove-redundant-calculations
4 remove-unnecessary-calculations
5 move-calculations-up-2
6 move-filters-up-2
7 fulltext-index-optimizer
8 use-indexes
9 remove-filter-covered-by-index
10 sort-in-values
11 remove-unnecessary-calculations-2
12 move-calculations-down
好的。有点艰难的提议。查询非常昂贵。但我尝试了一些东西:
// Merged variant: instead of materializing `openDatasets` with a separate
// full collection scan, the visibility check is done inline while iterating
// the fulltext matches — this removes one full pass over `datasets`.
LET catalogDatasets = []
LET myDatasets = []
LET myPurchasedDatasets = []
// Deduplicated union of the remaining (non-"open") permitted dataset ids.
LET searchTarget = UNIQUE( UNION( catalogDatasets, myDatasets, myPurchasedDatasets ) )
LET unorderedDatasetsIds = (
FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
// BUG FIX: the loop variable here is `dataset`, not `d` — `d` no longer
// exists once the openDatasets subquery is removed, so `d.visibility`
// would raise "unknown variable". Check visibility on `dataset` instead.
FILTER dataset._id IN searchTarget || dataset.visibility == "open" RETURN dataset._id
)
// Ranking stage, unchanged from the original query.
// NOTE(review): `INTO score = <expr>` collects an ARRAY of per-edge values
// per dataset, so `SORT score` compares arrays — confirm intended.
LET ordered = (
FOR wl IN wordLinks
FILTER wl._from IN unorderedDatasetsIds
FOR x IN words
FILTER x._id == wl._to
COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
SORT score
LIMIT 0, 20
RETURN did
)
RETURN {
dids: ordered,
number_of_items: LENGTH(unorderedDatasetsIds)
}
这里看不到其他明显可改的地方。不过,如果 "open" 数据集的数量不多,让 openDatasets 这个子查询(即对 datasets 的全表扫描)消失,应该会带来明显的提升。
我正在 ArangoDB 中运行以下查询:
// Candidate dataset id lists; catalog/my/purchased happen to be empty in
// this particular query but are not always empty (see P.S. below).
LET catalogDatasets = []
// Collects the ids of all "open" datasets. Per the execution plan below,
// this is a full collection scan (EnumerateCollectionNode) — there is no
// index on `visibility`.
LET openDatasets = ( FOR d IN datasets FILTER d.visibility == "open" RETURN d._id )
LET myDatasets = []
LET myPurchasedDatasets = []
// Deduplicated union of all dataset ids the caller is allowed to search.
LET searchTarget = UNIQUE( UNION( catalogDatasets, openDatasets, myDatasets, myPurchasedDatasets ) )
// Fulltext prefix search over `word_list`, restricted to permitted datasets.
// @searchWords is a prefix expression like "prefix:...,|prefix:..." (see the
// Ranges section of the plan).
LET unorderedDatasetsIds = (
FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
FILTER dataset._id IN searchTarget RETURN dataset._id
)
// Rank the matched datasets via their word links: per-edge weight is
// wl.invFq / (x.numEdges + @epsilon), grouped by dataset id.
// NOTE(review): `COLLECT ... INTO score = <expr>` makes `score` an ARRAY of
// per-edge values for each group, so `SORT score` compares arrays, not a
// summed number — confirm this ordering is intended.
LET ordered = (
FOR wl IN wordLinks
FILTER wl._from IN unorderedDatasetsIds
FOR x IN words
FILTER x._id == wl._to
COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
SORT score
LIMIT 0, 20
RETURN did
)
// Top-20 ranked dataset ids plus the total number of matches (for paging).
RETURN {
dids: ordered,
number_of_items: LENGTH(unorderedDatasetsIds)
}
我的搜索词都使用如下前缀:
pref:banana,|pref:chocollate
基本上我想优化这个查询,因为它大约需要 2 秒才能返回结果。我的一个想法是把全文搜索的结果数量限制为 1000,但这样一来返回的数据集就会是随机的,因为结果取决于 ArangoDB 返回它们的顺序。
我可以对此查询应用什么样的优化以使其更快?
P.S.: 联合(UNION)中的一些数据集列表是空的,但它们并不总是空的,只是在这个查询的情况下恰好为空。
编辑 我的查询解释:
Query string:
LET catalogDatasets = []
LET openDatasets = ( FOR d IN datasets FILTER d.visibility == "open" RETURN d._id )
LET myDatasets = []
LET myPurchasedDatasets = []
LET searchTarget = UNIQUE( UNION( catalogDatasets, openDatasets, myDatasets, myPurchasedDatasets ) )
LET unorderedDatasetsIds = (
FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
FILTER dataset._id IN searchTarget RETURN dataset._id
)
LET ordered = (
FOR wl IN wordLinks
FILTER wl._from IN unorderedDatasetsIds
FOR x IN words
FILTER x._id == wl._to
COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
SORT score
LIMIT 0, 20
RETURN did
)
RETURN {
dids: ordered,
number_of_items: LENGTH(unorderedDatasetsIds)
}
Execution plan:
Id NodeType Est. Comment
1 SingletonNode 1 * ROOT
9 SubqueryNode 1 - LET openDatasets = ... /* const subquery */
3 SingletonNode 1 * ROOT
4 EnumerateCollectionNode 9752 - FOR d IN datasets /* full collection scan */
5 CalculationNode 9752 - LET #19 = (d.`visibility` == "open") /* simple expression */ /* collections used: d : datasets */
6 FilterNode 9752 - FILTER #19
7 CalculationNode 9752 - LET #21 = d.`_id` /* attribute expression */ /* collections used: d : datasets */
8 ReturnNode 9752 - RETURN #21
41 CalculationNode 1 - LET #39 = SORTED_UNIQUE(UNIQUE(UNION([ ], openDatasets, [ ], [ ]))) /* simple expression */
20 SubqueryNode 1 - LET unorderedDatasetsIds = ... /* subquery */
13 SingletonNode 1 * ROOT
38 IndexNode 9752 - FOR dataset IN datasets /* fulltext index scan */
16 CalculationNode 9752 - LET #25 = (dataset.`_id` in /* sorted */ #39) /* simple expression */ /* collections used: dataset : datasets */
17 FilterNode 9752 - FILTER #25
18 CalculationNode 9752 - LET #27 = dataset.`_id` /* attribute expression */ /* collections used: dataset : datasets */
19 ReturnNode 9752 - RETURN #27
34 SubqueryNode 1 - LET ordered = ... /* subquery */
21 SingletonNode 1 * ROOT
40 IndexNode 410 - FOR wl IN wordLinks /* edge index scan */
28 CalculationNode 410 - LET #33 = wl.`_from` /* attribute expression */ /* collections used: wl : wordLinks */
39 IndexNode 410 - FOR x IN words /* primary index scan */
37 SortNode 410 - SORT #33 ASC
29 CalculationNode 410 - LET #35 = (wl.`invFq` / (x.`numEdges` + 0.1)) /* simple expression */ /* collections used: wl : wordLinks, x : words */
30 CollectNode 328 - COLLECT did = #33 INTO score = #35 /* sorted */
31 SortNode 328 - SORT score ASC
32 LimitNode 20 - LIMIT 0, 20
33 ReturnNode 20 - RETURN did
35 CalculationNode 1 - LET #37 = { "dids" : ordered, "number_of_items" : LENGTH(unorderedDatasetsIds) } /* simple expression */
36 ReturnNode 1 - RETURN #37
Indexes used:
By Type Collection Unique Sparse Selectivity Fields Ranges
38 fulltext datasets false true n/a [ `word_list` ] FULLTEXT(datasets /* all collection documents */, "word_list", "'prefix:トウ,|prefix:とう'")
40 edge wordLinks false false 3.05 % [ `_from`, `_to` ] (wl.`_from` in unorderedDatasetsIds)
39 primary words true false 100.00 % [ `_key` ] (x.`_id` == wl.`_to`)
Optimization rules applied:
Id RuleName
1 move-calculations-up
2 move-filters-up
3 remove-redundant-calculations
4 remove-unnecessary-calculations
5 move-calculations-up-2
6 move-filters-up-2
7 fulltext-index-optimizer
8 use-indexes
9 remove-filter-covered-by-index
10 sort-in-values
11 remove-unnecessary-calculations-2
12 move-calculations-down
好的。有点艰难的提议。查询非常昂贵。但我尝试了一些东西:
// Merged variant: instead of materializing `openDatasets` with a separate
// full collection scan, the visibility check is done inline while iterating
// the fulltext matches — this removes one full pass over `datasets`.
LET catalogDatasets = []
LET myDatasets = []
LET myPurchasedDatasets = []
// Deduplicated union of the remaining (non-"open") permitted dataset ids.
LET searchTarget = UNIQUE( UNION( catalogDatasets, myDatasets, myPurchasedDatasets ) )
LET unorderedDatasetsIds = (
FOR dataset IN FULLTEXT(datasets, "word_list", @searchWords)
// BUG FIX: the loop variable here is `dataset`, not `d` — `d` no longer
// exists once the openDatasets subquery is removed, so `d.visibility`
// would raise "unknown variable". Check visibility on `dataset` instead.
FILTER dataset._id IN searchTarget || dataset.visibility == "open" RETURN dataset._id
)
// Ranking stage, unchanged from the original query.
// NOTE(review): `INTO score = <expr>` collects an ARRAY of per-edge values
// per dataset, so `SORT score` compares arrays — confirm intended.
LET ordered = (
FOR wl IN wordLinks
FILTER wl._from IN unorderedDatasetsIds
FOR x IN words
FILTER x._id == wl._to
COLLECT did = wl._from INTO score = wl.invFq/(x.numEdges+@epsilon)
SORT score
LIMIT 0, 20
RETURN did
)
RETURN {
dids: ordered,
number_of_items: LENGTH(unorderedDatasetsIds)
}
这里看不到其他明显可改的地方。不过,如果 "open" 数据集的数量不多,让 openDatasets 这个子查询(即对 datasets 的全表扫描)消失,应该会带来明显的提升。