mongodb 子文档数组中的多键索引范围
mongodb multikey index bounds in arrays of subdocs
作为背景,文档中与我的问题最相关的页面,是关于多键索引边界(multikey index bounds)和索引交集(index intersection)的那一页。
我遇到一个问题,似乎 mongo 没有正确组合子文档数组中某个字段的 $elemMatch 查询中的索引边界。
文档中似乎并没有说明我的这种用法不受支持,而且其中有很多示例与我正在做的事情非常接近。从技术角度来看,我也想不出它不能正常工作的理由。是我遗漏了什么吗?或者有人能向我解释一下为什么会这样吗?
我从这样的文档集开始:
mongos> db.test.findOne()
{
"_id" : ObjectId("54c7fdaa9a9950e75fa616b9"),
"data" : [
{
"point" : 1,
"other" : "what"
}
]
}
我有这样的索引:
mongos> db.test.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "temp.test"
},
{
"v" : 1,
"key" : {
"data.point" : 1
},
"name" : "data.point_1",
"ns" : "temp.test"
}
]
当数据数组中只有一个子文档时:
mongos> db.test.find()
{ "_id" : ObjectId("54c7fdaa9a9950e75fa616b9"), "data" : [ { "point" : 1, "other" : "what" } ] }
{ "_id" : ObjectId("54c7fdaf9a9950e75fa616ba"), "data" : [ { "point" : 2, "other" : "who" } ] }
{ "_id" : ObjectId("54c7fdb59a9950e75fa616bb"), "data" : [ { "point" : 3, "other" : "where" } ] }
对数据的 $elemMatch 查询工作正常:
mongos> db.test.find({data: {$elemMatch: {point: {$gte: 2, $lte: 2}}}})
{ "_id" : ObjectId("54c7fdaf9a9950e75fa616ba"), "data" : [ { "point" : 2, "other" : "who" } ] }
mongos> db.test.find({data: {$elemMatch: {point: {$gte: 2, $lte: 2}}}}).explain(true)
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nChunkSkips" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
}
}
],
"server" : "XXXXXX",
"filterSet" : false,
"stats" : {
"type" : "KEEP_MUTATIONS",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"children" : [
{
"type" : "FETCH",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"alreadyHasObj" : 0,
"forcedFetches" : 0,
"matchTested" : 1,
"children" : [
{
"type" : "IXSCAN",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"keyPattern" : "{ data.point: 1.0 }",
"isMultiKey" : 0,
"boundsVerbose" : "field #0['data.point']: [2.0, 2.0]",
"yieldMovedCursor" : 0,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0,
"keysExamined" : 1,
"children" : [ ]
}
]
}
]
},
"millis" : 0
}
但是,一旦我添加了任何在 data 数组中包含多个子文档的文档,如下所示:
mongos> db.test.insert({data: [{point: 3, other: 'where'}, {point:4, other:"huh"}]})
WriteResult({ "nInserted" : 1 })
mongos> db.test.find()
{ "_id" : ObjectId("54c7fdaa9a9950e75fa616b9"), "data" : [ { "point" : 1, "other" : "what" } ] }
{ "_id" : ObjectId("54c7fdaf9a9950e75fa616ba"), "data" : [ { "point" : 2, "other" : "who" } ] }
{ "_id" : ObjectId("54c7fdb59a9950e75fa616bb"), "data" : [ { "point" : 3, "other" : "where" } ] }
{ "_id" : ObjectId("54c806c39a9950e75fa616bc"), "data" : [ { "point" : 3, "other" : "where" }, { "point" : 4, "other" : "huh" } ] }
查询耗时会增加几个数量级(在非平凡的测试用例中),并且 explain 输出中的索引边界从正确的 [2, 2] 变成了 [-Infinity, 2],同时 isMultiKey 标志变为 true:
mongos> db.test.find({data: {$elemMatch: {point: {$gte: 2, $lte: 2}}}}).explain(true)
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 2,
"nscanned" : 2,
"nscannedObjectsAllPlans" : 2,
"nscannedAllPlans" : 2,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"data.point" : [
[
-Infinity,
2
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 2,
"nscanned" : 2,
"scanAndOrder" : false,
"indexOnly" : false,
"nChunkSkips" : 0,
"indexBounds" : {
"data.point" : [
[
-Infinity,
2
]
]
}
}
],
"server" : "XXXXXX",
"filterSet" : false,
"stats" : {
"type" : "KEEP_MUTATIONS",
"works" : 3,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 1,
"needFetch" : 0,
"isEOF" : 1,
"children" : [
{
"type" : "FETCH",
"works" : 3,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 1,
"needFetch" : 0,
"isEOF" : 1,
"alreadyHasObj" : 0,
"forcedFetches" : 0,
"matchTested" : 1,
"children" : [
{
"type" : "IXSCAN",
"works" : 3,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 2,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"keyPattern" : "{ data.point: 1.0 }",
"isMultiKey" : 1,
"boundsVerbose" : "field #0['data.point']: [-inf.0, 2.0]",
"yieldMovedCursor" : 0,
"dupsTested" : 2,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0,
"keysExamined" : 2,
"children" : [ ]
}
]
}
]
},
"millis" : 0
}
它仍然使用了正确的 btree 索引,也返回了正确的结果,但在大型数据集上,这个问题决定了查询究竟是可用还是不可用。
我知道我正在使用的查询也等同于此:
db.test.find({data: {$elemMatch: {point: 2}}})
但我这样写只是为了让示例保持简单——使用 $gt、$gte、$lt、$lte 指定任何边界时,都会观察到相同的行为,因为索引边界没有被正确设置(至少在我看来是这样)。
作为参考,当我执行上述查询时,我确实得到了我所期望的索引边界,所以并不是说 mongo 无法针对包含多个子文档的数组生成符合我预期的查询计划:
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nChunkSkips" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
}
}
],
"server" : "XXXXXX",
"filterSet" : false,
"stats" : {
"type" : "KEEP_MUTATIONS",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"children" : [
{
"type" : "FETCH",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"alreadyHasObj" : 0,
"forcedFetches" : 0,
"matchTested" : 1,
"children" : [
{
"type" : "IXSCAN",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"keyPattern" : "{ data.point: 1.0 }",
"isMultiKey" : 1,
"boundsVerbose" : "field #0['data.point']: [2.0, 2.0]",
"yieldMovedCursor" : 0,
"dupsTested" : 1,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0,
"keysExamined" : 1,
"children" : [ ]
}
]
}
]
},
"millis" : 0
}
……所以我再问一次——是我遗漏了什么吗?是我的做法有问题吗?有没有解决办法?这是一个 bug,还是一个已知问题?
我在分片复制集群中使用 mongodb v2.6.5。
搜索 mongodb 的 jira 后,我发现了一个解释此行为的已知问题。
作为背景,文档中与我的问题最相关的页面,是关于多键索引边界(multikey index bounds)和索引交集(index intersection)的那一页。
我遇到一个问题,似乎 mongo 没有正确组合子文档数组中某个字段的 $elemMatch 查询中的索引边界。
文档中似乎并没有说明我的这种用法不受支持,而且其中有很多示例与我正在做的事情非常接近。从技术角度来看,我也想不出它不能正常工作的理由。是我遗漏了什么吗?或者有人能向我解释一下为什么会这样吗?
我从这样的文档集开始:
mongos> db.test.findOne()
{
"_id" : ObjectId("54c7fdaa9a9950e75fa616b9"),
"data" : [
{
"point" : 1,
"other" : "what"
}
]
}
我有这样的索引:
mongos> db.test.getIndexes()
[
{
"v" : 1,
"key" : {
"_id" : 1
},
"name" : "_id_",
"ns" : "temp.test"
},
{
"v" : 1,
"key" : {
"data.point" : 1
},
"name" : "data.point_1",
"ns" : "temp.test"
}
]
当数据数组中只有一个子文档时:
mongos> db.test.find()
{ "_id" : ObjectId("54c7fdaa9a9950e75fa616b9"), "data" : [ { "point" : 1, "other" : "what" } ] }
{ "_id" : ObjectId("54c7fdaf9a9950e75fa616ba"), "data" : [ { "point" : 2, "other" : "who" } ] }
{ "_id" : ObjectId("54c7fdb59a9950e75fa616bb"), "data" : [ { "point" : 3, "other" : "where" } ] }
对数据的 $elemMatch 查询工作正常:
mongos> db.test.find({data: {$elemMatch: {point: {$gte: 2, $lte: 2}}}})
{ "_id" : ObjectId("54c7fdaf9a9950e75fa616ba"), "data" : [ { "point" : 2, "other" : "who" } ] }
mongos> db.test.find({data: {$elemMatch: {point: {$gte: 2, $lte: 2}}}}).explain(true)
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : false,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nChunkSkips" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
}
}
],
"server" : "XXXXXX",
"filterSet" : false,
"stats" : {
"type" : "KEEP_MUTATIONS",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"children" : [
{
"type" : "FETCH",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"alreadyHasObj" : 0,
"forcedFetches" : 0,
"matchTested" : 1,
"children" : [
{
"type" : "IXSCAN",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"keyPattern" : "{ data.point: 1.0 }",
"isMultiKey" : 0,
"boundsVerbose" : "field #0['data.point']: [2.0, 2.0]",
"yieldMovedCursor" : 0,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0,
"keysExamined" : 1,
"children" : [ ]
}
]
}
]
},
"millis" : 0
}
但是,一旦我添加了任何在 data 数组中包含多个子文档的文档,如下所示:
mongos> db.test.insert({data: [{point: 3, other: 'where'}, {point:4, other:"huh"}]})
WriteResult({ "nInserted" : 1 })
mongos> db.test.find()
{ "_id" : ObjectId("54c7fdaa9a9950e75fa616b9"), "data" : [ { "point" : 1, "other" : "what" } ] }
{ "_id" : ObjectId("54c7fdaf9a9950e75fa616ba"), "data" : [ { "point" : 2, "other" : "who" } ] }
{ "_id" : ObjectId("54c7fdb59a9950e75fa616bb"), "data" : [ { "point" : 3, "other" : "where" } ] }
{ "_id" : ObjectId("54c806c39a9950e75fa616bc"), "data" : [ { "point" : 3, "other" : "where" }, { "point" : 4, "other" : "huh" } ] }
查询耗时会增加几个数量级(在非平凡的测试用例中),并且 explain 输出中的索引边界从正确的 [2, 2] 变成了 [-Infinity, 2],同时 isMultiKey 标志变为 true:
mongos> db.test.find({data: {$elemMatch: {point: {$gte: 2, $lte: 2}}}}).explain(true)
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 2,
"nscanned" : 2,
"nscannedObjectsAllPlans" : 2,
"nscannedAllPlans" : 2,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"data.point" : [
[
-Infinity,
2
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 2,
"nscanned" : 2,
"scanAndOrder" : false,
"indexOnly" : false,
"nChunkSkips" : 0,
"indexBounds" : {
"data.point" : [
[
-Infinity,
2
]
]
}
}
],
"server" : "XXXXXX",
"filterSet" : false,
"stats" : {
"type" : "KEEP_MUTATIONS",
"works" : 3,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 1,
"needFetch" : 0,
"isEOF" : 1,
"children" : [
{
"type" : "FETCH",
"works" : 3,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 1,
"needFetch" : 0,
"isEOF" : 1,
"alreadyHasObj" : 0,
"forcedFetches" : 0,
"matchTested" : 1,
"children" : [
{
"type" : "IXSCAN",
"works" : 3,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 2,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"keyPattern" : "{ data.point: 1.0 }",
"isMultiKey" : 1,
"boundsVerbose" : "field #0['data.point']: [-inf.0, 2.0]",
"yieldMovedCursor" : 0,
"dupsTested" : 2,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0,
"keysExamined" : 2,
"children" : [ ]
}
]
}
]
},
"millis" : 0
}
它仍然使用了正确的 btree 索引,也返回了正确的结果,但在大型数据集上,这个问题决定了查询究竟是可用还是不可用。
我知道我正在使用的查询也等同于此:
db.test.find({data: {$elemMatch: {point: 2}}})
但我这样写只是为了让示例保持简单——使用 $gt、$gte、$lt、$lte 指定任何边界时,都会观察到相同的行为,因为索引边界没有被正确设置(至少在我看来是这样)。
作为参考,当我执行上述查询时,我确实得到了我所期望的索引边界,所以并不是说 mongo 无法针对包含多个子文档的数组生成符合我预期的查询计划:
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"nscannedObjectsAllPlans" : 1,
"nscannedAllPlans" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nYields" : 0,
"nChunkSkips" : 0,
"millis" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
},
"allPlans" : [
{
"cursor" : "BtreeCursor data.point_1",
"isMultiKey" : true,
"n" : 1,
"nscannedObjects" : 1,
"nscanned" : 1,
"scanAndOrder" : false,
"indexOnly" : false,
"nChunkSkips" : 0,
"indexBounds" : {
"data.point" : [
[
2,
2
]
]
}
}
],
"server" : "XXXXXX",
"filterSet" : false,
"stats" : {
"type" : "KEEP_MUTATIONS",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"children" : [
{
"type" : "FETCH",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"alreadyHasObj" : 0,
"forcedFetches" : 0,
"matchTested" : 1,
"children" : [
{
"type" : "IXSCAN",
"works" : 2,
"yields" : 0,
"unyields" : 0,
"invalidates" : 0,
"advanced" : 1,
"needTime" : 0,
"needFetch" : 0,
"isEOF" : 1,
"keyPattern" : "{ data.point: 1.0 }",
"isMultiKey" : 1,
"boundsVerbose" : "field #0['data.point']: [2.0, 2.0]",
"yieldMovedCursor" : 0,
"dupsTested" : 1,
"dupsDropped" : 0,
"seenInvalidated" : 0,
"matchTested" : 0,
"keysExamined" : 1,
"children" : [ ]
}
]
}
]
},
"millis" : 0
}
……所以我再问一次——是我遗漏了什么吗?是我的做法有问题吗?有没有解决办法?这是一个 bug,还是一个已知问题?
我在分片复制集群中使用 mongodb v2.6.5。
搜索 mongodb 的 jira 后,我发现了一个解释此行为的已知问题。