MongoDB currentOp() and Cursor.explain() show different results for the same query
I have been using MongoDB 3.2 for years, and I noticed that currentOp() and Cursor.explain() show different results for the same query.

I found several queries that had been running for a very long time (20+ seconds), which I thought was impossible because I had tested them thoroughly and the fields involved are indexed. As far as I can tell, the queries are usually identical. I suspect they end up locking the whole database, because when some queries become slow, almost 40-50 queries get stuck in currentOp().

However, when I run the same read operation in the shell, it is very fast, just as I would expect. I took the exact query from currentOp and executed it myself.

When the database is locked (or so I believe), CPU utilization stays at 100% for hours and my application becomes extremely slow. I monitor currentOp every minute, and when an operation keeps running for several seconds without finishing, I have to restart the application, after which everything goes back to normal.
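For reference, this is roughly how such operations can be listed and, if necessary, killed from the shell. This is only a hedged sketch using the standard db.currentOp() filter form and db.killOp(); the 5-second threshold is arbitrary, and the opid comes from the sample operation shown below:

// List read operations that have been running for more than 5 seconds
db.currentOp({
    "active": true,
    "op": "query",
    "secs_running": { "$gt": 5 }
})

// A single stuck operation can be terminated by its opid instead of
// restarting the whole application (use with care):
db.killOp(1374027609)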
Here is one of the queries that took a long time. Once this happens, 40-50 other, similar queries also get stuck in currentOp.
{
"desc" : "conn32882",
"threadId" : "140677207643904",
"connectionId" : 32882,
"client" : "client",
"active" : true,
"opid" : 1374027609,
"secs_running" : 20,
"microsecs_running" : NumberLong(20560351),
"op" : "query",
"ns" : "db.collection",
"query" : {
"find" : "collection",
"filter" : {
"p" : {
"$gt" : 0
},
"type" : "canvas",
"id" : {
"$in" : [
576391,
570391,
767422
]
}
},
"sort" : {
"_id" : -1
},
"projection" : {
},
"limit" : 5000,
"returnKey" : false,
"showRecordId" : false
},
"numYields" : 2761,
"locks" : {
"Global" : "r",
"Database" : "r",
"Collection" : "r"
},
"waitingForLock" : false,
"lockStats" : {
"Global" : {
"acquireCount" : {
"r" : NumberLong(5524)
},
"acquireWaitCount" : {
"r" : NumberLong(349)
},
"timeAcquiringMicros" : {
"r" : NumberLong(6613952)
}
},
"Database" : {
"acquireCount" : {
"r" : NumberLong(2762)
}
},
"Collection" : {
"acquireCount" : {
"r" : NumberLong(2762)
}
}
}
}
Here is the output of the same query run in the shell with the executionStats option.

Command:
db.canvasdatas.find({
"p" : {
"$gt": 0
},
"type": "canvas",
"id" : {
"$in": [
576391,
570391,
767422
]
}
}, {}).sort({ _id: -1 }).limit(5000).explain('executionStats');
Output:
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "db.collection",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"type" : {
"$eq" : "canvas"
}
},
{
"p" : {
"$gt" : 0
}
},
{
"id" : {
"$in" : [
570391,
576391,
767422
]
}
}
]
},
"winningPlan" : {
"stage" : "SORT",
"sortPattern" : {
"_id" : -1
},
"limitAmount" : 5000,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"type" : {
"$eq" : "canvas"
}
},
{
"p" : {
"$gt" : 0
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"id" : 1
},
"indexName" : "id_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"id" : [
"[570391.0, 570391.0]",
"[576391.0, 576391.0]",
"[767422.0, 767422.0]"
]
}
}
}
}
},
"rejectedPlans" : [
{
"stage" : "SORT",
"sortPattern" : {
"_id" : -1
},
"limitAmount" : 5000,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"id" : {
"$in" : [
570391,
576391,
767422
]
}
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"p" : 1,
"type" : 1
},
"indexName" : "p_1_type_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"p" : [
"(0.0, inf.0]"
],
"type" : [
"[\"canvas\", \"canvas\"]"
]
}
}
}
}
},
{
"stage" : "SORT",
"sortPattern" : {
"_id" : -1
},
"limitAmount" : 5000,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"type" : {
"$eq" : "canvas"
}
},
{
"id" : {
"$in" : [
570391,
576391,
767422
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"p" : 1
},
"indexName" : "p_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"p" : [
"(0.0, inf.0]"
]
}
}
}
}
},
{
"stage" : "SORT",
"sortPattern" : {
"_id" : -1
},
"limitAmount" : 5000,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"p" : {
"$gt" : 0
}
},
{
"id" : {
"$in" : [
570391,
576391,
767422
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"type" : 1
},
"indexName" : "type_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"type" : [
"[\"canvas\", \"canvas\"]"
]
}
}
}
}
},
{
"stage" : "LIMIT",
"limitAmount" : 5000,
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"type" : {
"$eq" : "canvas"
}
},
{
"p" : {
"$gt" : 0
}
},
{
"id" : {
"$in" : [
570391,
576391,
767422
]
}
}
]
},
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"_id" : 1
},
"indexName" : "_id_",
"isMultiKey" : false,
"isUnique" : true,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "backward",
"indexBounds" : {
"_id" : [
"[MaxKey, MinKey]"
]
}
}
}
}
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : 2,
"executionTimeMillis" : 0,
"totalKeysExamined" : 5,
"totalDocsExamined" : 2,
"executionStages" : {
"stage" : "SORT",
"nReturned" : 2,
"executionTimeMillisEstimate" : 0,
"works" : 10,
"advanced" : 2,
"needTime" : 6,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"sortPattern" : {
"_id" : -1
},
"memUsage" : 906,
"memLimit" : 33554432,
"limitAmount" : 5000,
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"nReturned" : 0,
"executionTimeMillisEstimate" : 0,
"works" : 6,
"advanced" : 0,
"needTime" : 3,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"$and" : [
{
"type" : {
"$eq" : "canvas"
}
},
{
"p" : {
"$gt" : 0
}
}
]
},
"nReturned" : 2,
"executionTimeMillisEstimate" : 0,
"works" : 5,
"advanced" : 2,
"needTime" : 2,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"docsExamined" : 2,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 2,
"executionTimeMillisEstimate" : 0,
"works" : 5,
"advanced" : 2,
"needTime" : 2,
"needYield" : 0,
"saveState" : 0,
"restoreState" : 0,
"isEOF" : 1,
"invalidates" : 0,
"keyPattern" : {
"id" : 1
},
"indexName" : "id_1",
"isMultiKey" : false,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"id" : [
"[570391.0, 570391.0]",
"[576391.0, 576391.0]",
"[767422.0, 767422.0]"
]
},
"keysExamined" : 5,
"dupsTested" : 0,
"dupsDropped" : 0,
"seenInvalidated" : 0
}
}
}
}
},
"serverInfo" : {
"host" : "host",
"port" : 27017,
"version" : "3.2.21",
"gitVersion" : ""
},
"ok" : 1
}
I googled this unexpected behavior but could not find any solution, so I have to restart the server whenever it hangs.

To help understand the situation, here is my setup:

- I am using MongoDB Cloud Manager, and the database instances are hosted on AWS EC2
- I am using a replica set, and my read preference is secondaryPreferred, so all read operations go to the secondaries (a connection sketch follows this list)
- The MongoDB version is 3.2
- I created an index for each field used in the query (one single-field index per field)
- I executed the same query on both the primary and a secondary (slaveOk)
- The collection has 20M documents.
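The read preference itself is configured on the client side; the snippet below is only a hedged sketch of how secondaryPreferred is typically set, with placeholder host and database names rather than my real ones:

// Legacy mongo shell (3.2): allow reads on a secondary and set the read preference
rs.slaveOk()
db.getMongo().setReadPref("secondaryPreferred")

// Driver connection string equivalent (hosts and replica set name are placeholders):
// mongodb://host1:27017,host2:27017,host3:27017/db?replicaSet=rs0&readPreference=secondaryPreferred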
It does not happen every time for the same query, so I think other factors affect performance as well (replication, for example?). But I do not know how to debug this case. Does anyone have a better idea about this problem, or about how to debug it?

Thanks,
Edit: I still don't know the reason, but I tried to work around it by changing things. I removed $gt and it seems to work. However, $gt caused no problems in my earlier runs, and I suspect it only looks fine now because there are few users at the moment.
Edit: Since I have MongoDB Cloud Manager I can look at the metrics, and I think the Query Targeting metric has increased, though I still don't know why. Normally it is about 100 per document returned, but today it went over 2K. Could that be related?

Query Targeting over two months
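The same "examined per returned" ratio can be estimated for a single query from its explain output. This is only a rough sketch using the standard executionStats fields and the query from above; it is not the exact formula Cloud Manager uses:

var stats = db.canvasdatas.find({
    "p": { "$gt": 0 },
    "type": "canvas",
    "id": { "$in": [576391, 570391, 767422] }
}).sort({ _id: -1 }).limit(5000).explain("executionStats").executionStats;

// Keys and documents examined per document returned (guard against zero results)
var returned = Math.max(stats.nReturned, 1);
print("keys examined per returned doc:", stats.totalKeysExamined / returned);
print("docs examined per returned doc:", stats.totalDocsExamined / returned);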
I think this explains what you are seeing:
"lockStats" : {
"Global" : {
"acquireCount" : {
"r" : NumberLong(5524)
},
"acquireWaitCount" : {
"r" : NumberLong(349)
},
"timeAcquiringMicros" : {
"r" : NumberLong(6613952)
}
},
It looks like the stalled reads are spending their time trying to acquire the intent read lock; in your sample operation, timeAcquiringMicros is about 6.6 seconds out of roughly 20.5 seconds of total running time. This is expected in older MongoDB versions (before 4.0), because reads on a secondary have to wait while oplog application is in progress. This is done so that secondary reads do not see data in an inconsistent state while the oplog batch is being applied.

This is long-standing secondary read behavior that has existed since the earliest MongoDB versions, and I'm guessing you are seeing it now because your database has become busy enough for it to be a problem.

This was improved in MongoDB 4.0 and newer via SERVER-34192, which allows secondary reads to proceed while oplog application is in progress.
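To confirm this on a 3.2 replica set, one option is to look at how busy replication is on the secondary that serves the slow reads while the stalls happen. The helpers below exist in the 3.2 mongo shell; this is only a hedged suggestion, not part of the original answer:

// How far each secondary's oplog is behind the primary:
rs.printSlaveReplicationInfo()

// Size and time window of the oplog on the member you are connected to:
rs.printReplicationInfo()

// Replication-related counters (apply batches, buffers, network) on this member:
db.serverStatus().metrics.repl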