复杂时间序列查询:按操作类型计算时间差
Complex time series query time difference per action type
我在 MongoDB 中使用如下文档数据结构(扁平的数据日志),用于从各个角度对不同操作的时间序列进行复杂的数据分析。我发现很难用 mongo 查询提取每个文档中特定类型的变更之间所花费的时间,然后再应用 $graphLookup 函数(如下所示)。我是 MongoDB 的初学者,希望有人能帮我写出获取所需数据的查询。
单个文档的数据结构(示例):
// Sample document: one record with a snapshot of its current state and its change history.
{
"_id":NumberInt(1),
"Creation": ISODate("2018-11-19T06:30:42Z"),
"Creator": NumberInt(1),
"Replies": NumberInt(10),
//... other aggregated properties
"CurrentProperties":{ // a copy of the last update signifying the current state
// NOTE(review): StatusId (8) and TimeDelta (3600000) below differ from the last
// ChangeHistory entry (StatusId 2, TimeDelta 60000) - presumably sample-data typos; confirm.
"StatusId": NumberInt(8),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(5),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:17:20Z"),
"TimeDelta": NumberLong(3600000), // time span since the previous change, in milliseconds
"ChangeType": NumberInt(4),
"UserId": NumberInt(1)
},
"ChangeHistory":[ // time series of changes, oldest first in this sample
{
"StatusId": NumberInt(8),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(1),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:14:20Z"),
"TimeDelta": NumberLong(0), // time span since the previous change, in milliseconds
"ChangeType": NumberInt(0), // identifier of the changed property (0 = creation)
"UserId": NumberInt(1)
},
{
"StatusId": NumberInt(8),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(2),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:15:50Z"),
"TimeDelta": NumberLong(90000), // time span since the previous change, in milliseconds
"ChangeType": NumberInt(4), // identifier of the changed property (4 = department)
"UserId": NumberInt(1)
},
{
"StatusId": NumberInt(2),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(2),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:16:20Z"),
"TimeDelta": NumberLong(30000), // time span since the previous change, in milliseconds
"ChangeType": NumberInt(2), // identifier of the changed property (2 = status)
"UserId": NumberInt(1)
},
{
"StatusId": NumberInt(2),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(5),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:17:20Z"),
"TimeDelta": NumberLong(60000), // time span since the previous change, in milliseconds
"ChangeType": NumberInt(4), // identifier of the changed property (4 = department)
"UserId": NumberInt(1)
}
]
}
部门变更时间预期结果:
// Expected result for department changes: one entry per department the sample
// record has been in. ChangeTime is the moment the record left that department
// (or the current time for the department it is still in); TimeSpent is in milliseconds.
[
    {
        RecordID: 1,
        Department: 1,
        ChangeTime: ISODate("2018-11-19T10:15:50Z"),
        TimeSpent: 90000
    },
    {
        RecordID: 1,
        Department: 2,
        ChangeTime: ISODate("2018-11-19T10:17:20Z"),
        TimeSpent: 90000
    },
    {
        RecordID: 1,
        Department: 5,
        ChangeTime: ISODate("2018-11-21T09:47:47Z"), // current time
        TimeSpent: 171027000 // difference between now and the last department change
    }
]
状态:
// Expected result for status changes: one entry per status the sample record
// has been in. TimeDelta is in milliseconds.
[
    {
        RecordID: 1,
        Status: 8,
        ChangeTime: ISODate("2018-11-19T10:16:20Z"),
        TimeDelta: 120000
    },
    {
        RecordID: 1,
        Status: 2,
        ChangeTime: ISODate("2018-11-21T09:47:47Z"), // current time
        TimeDelta: 171087000 // difference between now and the last status change
    }
]
到目前为止我尝试了什么
到目前为止,我得到的最好结果是先用以下聚合创建一个视图,然后在该视图上应用 $graphLookup 函数:
// Flatten each record's creation/department entries into one row per entry,
// numbered by position, so that consecutive entries can be paired afterwards.
db.test.aggregate([
    {
        $project: {
            _id: 0,
            RecordID: "$_id",
            // keep only creation (ChangeType 0) and department (ChangeType 4) entries
            history: {
                $filter: {
                    input: "$ChangeHistory",
                    as: "entry",
                    cond: {
                        $or: [
                            { $eq: ["$$entry.ChangeType", 0] },
                            { $eq: ["$$entry.ChangeType", 4] }
                        ]
                    }
                }
            }
        }
    },
    {
        // one row per remaining entry; "order" receives its index in the filtered array
        $unwind: {
            path: "$history",
            includeArrayIndex: "order"
        }
    },
    {
        $project: {
            _id: "$RecordID",
            "RecordID": "$RecordID",
            "departmentID": "$history.DepartmentId",
            "actionOrder": "$order",
            // index of the entry that follows this one
            "nextAction": { $add: ["$order", 1] },
            "time": "$history.ChangeTime"
        }
    }
])
然后应用以下内容:
// Pair every row of the view with the rows whose actionOrder equals its nextAction,
// then report the time difference for each pair.
// The lookup joins on actionOrder alone; RecordID is not part of the join condition.
db.TestView.aggregate([
    {
        $graphLookup: {
            from: "TestView",
            startWith: "$nextAction",
            connectFromField: "nextAction",
            connectToField: "actionOrder",
            as: "pair"
        }
    },
    // one output row per matched partner
    { $unwind: "$pair" },
    {
        $project: {
            _id: 0,
            RecordID: "$_id",
            Department: "$departmentID",
            ChangeTime: "$pair.time",
            TimeSpent: { $subtract: ["$pair.time", "$time"] }
        }
    }
])
问题在于:它会把不同文档的动作混在一起配对,没有包含从最后一次变更到当前时间所花费的时间,而且还要在中间借助一个视图,整个流程过于繁琐。
如果需要,可以稍微修改数据结构。
在发帖提出这个问题之前,我其实已经花了 2 天时间尝试寻找解决方案;发帖几个小时后,我自己解决了这个问题。
只是想分享一下我的解决方案;如果有人能优化它的性能或其他方面,欢迎随时发布你的答案。
解决方案
它利用 $zip 函数来构造动作对:过滤之后,把事件数组本身和同一数组去掉第一个元素后的副本一起传入,这样第一个元素与第二个元素配对,第二个元素与第三个元素配对,依此类推。我还把当前时间设为默认值,用来计算最后一个元素到当前时间的时间差。
// Time spent in each status, per record: pair every creation/status entry with
// the entry that follows it (or with "now" for the last entry) and subtract
// their ChangeTime values. Emits one row per entry with the fields
// _id, RecordID, Status, TimeDeltaMS and ChangeTime.
db.test.aggregate([
    {
        $project: {
            // keep only creation (ChangeType 0) and status (ChangeType 2) entries
            history: {
                $filter: {
                    input: "$ChangeHistory",
                    as: "changeHistory",
                    cond: {
                        $or: [
                            { $eq: ["$$changeHistory.ChangeType", 0] },
                            { $eq: ["$$changeHistory.ChangeType", 2] }
                        ]
                    }
                }
            }
        }
    },
    {
        // Skip records without any matching entry. Otherwise $size fails on a
        // missing ChangeHistory and $slice rejects a count of 0, and a single
        // such record aborts the whole aggregation.
        $match: { "history.0": { $exists: true } }
    },
    {
        $addFields: {
            pairs: {
                // The trick: zip the array with itself shifted by one element,
                // so element i is paired with element i + 1.
                $zip: {
                    inputs: [
                        "$history",
                        { $slice: ["$history", 1, { $size: "$history" }] }
                    ],
                    useLongestLength: true,
                    // The shifted array is one element shorter, so the last entry
                    // is paired with this default. new Date() is evaluated once by
                    // the shell when the pipeline is built.
                    defaults: [0, { ChangeTime: new Date() }]
                }
            }
        }
    },
    {
        $unwind: { path: "$pairs" }
    },
    {
        $project: {
            old: { $arrayElemAt: ["$pairs", 0] },
            new: { $arrayElemAt: ["$pairs", 1] }
        }
    },
    {
        $project: {
            RecordID: "$_id",
            Status: "$old.StatusId",
            TimeDeltaMS: { $subtract: ["$new.ChangeTime", "$old.ChangeTime"] },
            ChangeTime: "$new.ChangeTime"
        }
    }
])
我在 MongoDB 中使用如下文档数据结构(扁平的数据日志),用于从各个角度对不同操作的时间序列进行复杂的数据分析。我发现很难用 mongo 查询提取每个文档中特定类型的变更之间所花费的时间,然后再应用 $graphLookup 函数(如下所示)。我是 MongoDB 的初学者,希望有人能帮我写出获取所需数据的查询。
单个文档的数据结构(示例):
// Sample document: one record with a snapshot of its current state and its change history.
{
"_id":NumberInt(1),
"Creation": ISODate("2018-11-19T06:30:42Z"),
"Creator": NumberInt(1),
"Replies": NumberInt(10),
//... other aggregated properties
"CurrentProperties":{ // a copy of the last update signifying the current state
// NOTE(review): StatusId (8) and TimeDelta (3600000) below differ from the last
// ChangeHistory entry (StatusId 2, TimeDelta 60000) - presumably sample-data typos; confirm.
"StatusId": NumberInt(8),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(5),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:17:20Z"),
"TimeDelta": NumberLong(3600000), // time span since the previous change, in milliseconds
"ChangeType": NumberInt(4),
"UserId": NumberInt(1)
},
"ChangeHistory":[ // time series of changes, oldest first in this sample
{
"StatusId": NumberInt(8),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(1),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:14:20Z"),
"TimeDelta": NumberLong(0), // time span since the previous change, in milliseconds
"ChangeType": NumberInt(0), // identifier of the changed property (0 = creation)
"UserId": NumberInt(1)
},
{
"StatusId": NumberInt(8),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(2),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:15:50Z"),
"TimeDelta": NumberLong(90000), // time span since the previous change, in milliseconds
"ChangeType": NumberInt(4), // identifier of the changed property (4 = department)
"UserId": NumberInt(1)
},
{
"StatusId": NumberInt(2),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(2),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:16:20Z"),
"TimeDelta": NumberLong(30000), // time span since the previous change, in milliseconds
"ChangeType": NumberInt(2), // identifier of the changed property (2 = status)
"UserId": NumberInt(1)
},
{
"StatusId": NumberInt(2),
"PriorityId": NumberInt(6),
"DepartmentId": NumberInt(5),
"TypeId": NumberInt(4),
"CategoryId": NumberInt(2),
"SubcategoryId": NumberInt(333),
"ChangeTime": ISODate("2018-11-19T10:17:20Z"),
"TimeDelta": NumberLong(60000), // time span since the previous change, in milliseconds
"ChangeType": NumberInt(4), // identifier of the changed property (4 = department)
"UserId": NumberInt(1)
}
]
}
部门变更时间预期结果:
// Expected result for department changes: one entry per department the sample
// record has been in. ChangeTime is the moment the record left that department
// (or the current time for the department it is still in); TimeSpent is in milliseconds.
[
    {
        RecordID: 1,
        Department: 1,
        ChangeTime: ISODate("2018-11-19T10:15:50Z"),
        TimeSpent: 90000
    },
    {
        RecordID: 1,
        Department: 2,
        ChangeTime: ISODate("2018-11-19T10:17:20Z"),
        TimeSpent: 90000
    },
    {
        RecordID: 1,
        Department: 5,
        ChangeTime: ISODate("2018-11-21T09:47:47Z"), // current time
        TimeSpent: 171027000 // difference between now and the last department change
    }
]
状态:
// Expected result for status changes: one entry per status the sample record
// has been in. TimeDelta is in milliseconds.
[
    {
        RecordID: 1,
        Status: 8,
        ChangeTime: ISODate("2018-11-19T10:16:20Z"),
        TimeDelta: 120000
    },
    {
        RecordID: 1,
        Status: 2,
        ChangeTime: ISODate("2018-11-21T09:47:47Z"), // current time
        TimeDelta: 171087000 // difference between now and the last status change
    }
]
到目前为止我尝试了什么
到目前为止,我得到的最好结果是先用以下聚合创建一个视图,然后在该视图上应用 $graphLookup 函数:
// Flatten each record's creation/department entries into one row per entry,
// numbered by position, so that consecutive entries can be paired afterwards.
db.test.aggregate([
    {
        $project: {
            _id: 0,
            RecordID: "$_id",
            // keep only creation (ChangeType 0) and department (ChangeType 4) entries
            history: {
                $filter: {
                    input: "$ChangeHistory",
                    as: "entry",
                    cond: {
                        $or: [
                            { $eq: ["$$entry.ChangeType", 0] },
                            { $eq: ["$$entry.ChangeType", 4] }
                        ]
                    }
                }
            }
        }
    },
    {
        // one row per remaining entry; "order" receives its index in the filtered array
        $unwind: {
            path: "$history",
            includeArrayIndex: "order"
        }
    },
    {
        $project: {
            _id: "$RecordID",
            "RecordID": "$RecordID",
            "departmentID": "$history.DepartmentId",
            "actionOrder": "$order",
            // index of the entry that follows this one
            "nextAction": { $add: ["$order", 1] },
            "time": "$history.ChangeTime"
        }
    }
])
然后应用以下内容:
// Pair every row of the view with the rows whose actionOrder equals its nextAction,
// then report the time difference for each pair.
// The lookup joins on actionOrder alone; RecordID is not part of the join condition.
db.TestView.aggregate([
    {
        $graphLookup: {
            from: "TestView",
            startWith: "$nextAction",
            connectFromField: "nextAction",
            connectToField: "actionOrder",
            as: "pair"
        }
    },
    // one output row per matched partner
    { $unwind: "$pair" },
    {
        $project: {
            _id: 0,
            RecordID: "$_id",
            Department: "$departmentID",
            ChangeTime: "$pair.time",
            TimeSpent: { $subtract: ["$pair.time", "$time"] }
        }
    }
])
问题在于:它会把不同文档的动作混在一起配对,没有包含从最后一次变更到当前时间所花费的时间,而且还要在中间借助一个视图,整个流程过于繁琐。
如果需要,可以稍微修改数据结构。
在发帖提出这个问题之前,我其实已经花了 2 天时间尝试寻找解决方案;发帖几个小时后,我自己解决了这个问题。
只是想分享一下我的解决方案;如果有人能优化它的性能或其他方面,欢迎随时发布你的答案。
解决方案
它利用 $zip 函数来构造动作对:过滤之后,把事件数组本身和同一数组去掉第一个元素后的副本一起传入,这样第一个元素与第二个元素配对,第二个元素与第三个元素配对,依此类推。我还把当前时间设为默认值,用来计算最后一个元素到当前时间的时间差。
// Time spent in each status, per record: pair every creation/status entry with
// the entry that follows it (or with "now" for the last entry) and subtract
// their ChangeTime values. Emits one row per entry with the fields
// _id, RecordID, Status, TimeDeltaMS and ChangeTime.
db.test.aggregate([
    {
        $project: {
            // keep only creation (ChangeType 0) and status (ChangeType 2) entries
            history: {
                $filter: {
                    input: "$ChangeHistory",
                    as: "changeHistory",
                    cond: {
                        $or: [
                            { $eq: ["$$changeHistory.ChangeType", 0] },
                            { $eq: ["$$changeHistory.ChangeType", 2] }
                        ]
                    }
                }
            }
        }
    },
    {
        // Skip records without any matching entry. Otherwise $size fails on a
        // missing ChangeHistory and $slice rejects a count of 0, and a single
        // such record aborts the whole aggregation.
        $match: { "history.0": { $exists: true } }
    },
    {
        $addFields: {
            pairs: {
                // The trick: zip the array with itself shifted by one element,
                // so element i is paired with element i + 1.
                $zip: {
                    inputs: [
                        "$history",
                        { $slice: ["$history", 1, { $size: "$history" }] }
                    ],
                    useLongestLength: true,
                    // The shifted array is one element shorter, so the last entry
                    // is paired with this default. new Date() is evaluated once by
                    // the shell when the pipeline is built.
                    defaults: [0, { ChangeTime: new Date() }]
                }
            }
        }
    },
    {
        $unwind: { path: "$pairs" }
    },
    {
        $project: {
            old: { $arrayElemAt: ["$pairs", 0] },
            new: { $arrayElemAt: ["$pairs", 1] }
        }
    },
    {
        $project: {
            RecordID: "$_id",
            Status: "$old.StatusId",
            TimeDeltaMS: { $subtract: ["$new.ChangeTime", "$old.ChangeTime"] },
            ChangeTime: "$new.ChangeTime"
        }
    }
])