从连续值中提取范围
Extract ranges from sequential values
任务 1:
我在 MongoDB 的一个 collection 中有一些文档,其值取自若干连续区间,如下所示:
{x:1}
{x:2}
{x:3}
{x:5}
{x:6}
{x:7}
{x:8}
{x:20}
{x:21}
我需要以如下形式提取连续范围的列表(count 不是必需的,但至少需要每个范围的第一个值和最后一个值):
{x:[1,3] , count:3}
{x:[5,8], count:4}
{x:[20,21],count:2}
或
{ min:1 , max:3 , count:3}
{ min:5 , max:8 , count:4}
{ min:20 , max:21 , count:2}
请推荐合适的解决方案。该 collection 约有 1 亿(~100M)个文档,其中一些值是 10 位数,另一些是 15 位数,但它们在各自的范围内都是按顺序递增的。
任务 2:
与任务 1 的思路相同,但基于自定义的序列步长,
例如,当序列步长为 3 时:
{y:1}
{y:3}
{y:5}
{y:20}
{y:22}
需要制作:
{y:[1,5] ,count:3}
{y:[20,22] , count:2}
谢谢!
P.S.
我通过按数字位数分组统计分布,对这些范围有了一个大致的了解,但这种结果似乎过于粗略:
// Bucket the documents by the number of characters in the string form of x,
// then report, per bucket, the smallest and largest x (as strings) and the
// number of documents. Buckets are returned in ascending length order.
db.collection.aggregate([
  // range = character length of x rendered as a string.
  { $addFields: { range: { $strLenCP: { $toString: "$x" } } } },

  // One result document per distinct length.
  {
    $group: {
      _id: "$range",
      minValue: { $min: "$x" },
      maxValue: { $max: "$x" },
      Count: { $sum: 1 }
    }
  },

  // x = [ min, max ], both converted to strings.
  { $addFields: { x: [ { $toString: "$minValue" }, { $toString: "$maxValue" } ] } },

  // Expose the group key as "range" and keep only range, x and Count.
  { $project: { range: "$_id", "_id": 0, x: 1, Count: 1 } },

  { $sort: { range: 1 } }
])
使用 $reduce。
如果我没有理解错任务 2,只需将 $cond 条件里 $gt 比较中标注为 sequence step 的 1 更改为您想要的序列步长即可。
// Collapse the sorted x values into runs in which neighbouring values differ
// by at most the sequence step. Returns a single document of the form
//   { ranges: [ { max, min, count }, ... ] }
//
// NOTE: ranges[0] is a placeholder produced by the -999999 seed of the fold
// ({ max: -999999, min: -999999, count: 0 }); drop it when reading the result.
// Assumes every document has a numeric x greater than -999998.
db.collection.aggregate([
  // The fold below walks the values in ascending order.
  { "$sort": { x: 1 } },

  // Collect only the x values. Pushing whole documents ($$ROOT) makes the
  // array far larger than needed and hits the $push memory limit sooner.
  { $group: { _id: null, temp: { $push: "$x" } } },

  {
    "$project": {
      _id: 0,
      "temp_field": {
        "$reduce": {
          "input": "$temp",
          // prev  - previously visited value
          // min   - first value of the run currently being built
          // count - number of values in the current run
          // ranges - runs that have already been closed
          "initialValue": { "prev": -999999, "min": -999999, "count": 0, "ranges": [] },
          "in": {
            "$let": {
              "vars": {
                // True when the distance to the previous value exceeds the
                // step, i.e. the current value opens a new run. Computed once
                // so the step only has to be changed in a single place.
                "gap": {
                  $gt: [
                    { "$subtract": [ "$$this", "$$value.prev" ] },
                    1 // sequence step
                  ]
                }
              },
              "in": {
                "prev": "$$this",
                // Restart the counter on a gap, otherwise extend the run.
                "count": { "$cond": [ "$$gap", 1, { "$add": [ "$$value.count", 1 ] } ] },
                "min": { "$cond": [ "$$gap", "$$this", "$$value.min" ] },
                // On a gap the run that just ended is appended to the result.
                "ranges": {
                  "$concatArrays": [
                    "$$value.ranges",
                    {
                      "$cond": [
                        "$$gap",
                        [ { max: "$$value.prev", min: "$$value.min", count: "$$value.count" } ],
                        []
                      ]
                    }
                  ]
                }
              }
            }
          }
        }
      }
    }
  },

  // The last run is still open when the fold finishes; append it here.
  {
    "$project": {
      ranges: {
        "$concatArrays": [
          "$temp_field.ranges",
          [ { max: "$temp_field.prev", min: "$temp_field.min", count: "$temp_field.count" } ]
        ]
      }
    }
  }
])
最后,从结果数组中移除第一个元素(它是由初始值 -999999 产生的占位范围)。
R2D2 的评论:在实际用例中测试后,即使使用了 allowDiskUse: true,我仍然达到了内存限制:
2022-02-14T09:38:27.575+0100 E QUERY [js] Error: command failed: {
"ok" : 0,
"errmsg" : "$push used too much memory and cannot spill to disk. Memory limit: 104857600 bytes",
"code" : 146,
"codeName" : "ExceededMemoryLimit",
将内存增加到 2GB(允许的最大值):
// Raise the server parameter that caps the memory used by $push (value in bytes).
db.adminCommand({setParameter:1 , internalQueryMaxPushBytes: 2048576000 })
但仍然达到了上限,于是我决定把 collection 拆分成若干较小的部分,最终得到了结果,再次感谢你!
这是另一种查询方式,生成格式为 [ { min: 1 , max: 3 , count: 3 }, ... ] 的结果:
// Fold the sorted x values into runs of consecutive integers (difference of
// exactly 1) and return a single document of the form
//   { output: [ { min, max, count }, ... ] }
db.collection.aggregate([
  // The fold below walks the values in ascending order.
  { $sort: { x: 1 } },

  // Collect all x values plus the smallest and largest one.
  {
    $group: {
      _id: null,
      docs: { $push: "$x" },
      firstVal: { $first: "$x" },
      lastVal: { $last: "$x" }
    }
  },

  {
    $project: {
      _id: 0,
      output: {
        $let: {
          vars: {
            result: {
              $reduce: {
                input: "$docs",
                initialValue: {
                  // Seed prev with firstVal - 1 so the first value extends the seed run.
                  prev: { $add: [ "$firstVal", -1 ] },
                  // val is the run still being built. Whichever run is still
                  // open when the fold ends necessarily ends at lastVal, so
                  // the seed must carry lastVal as well: with max: 0 a
                  // collection made of one unbroken run reported max: 0.
                  val: { min: "$firstVal", max: "$lastVal", count: 0 },
                  // vals holds the runs that have already been closed.
                  vals: [ ]
                },
                in: {
                  $cond: [
                    { $eq: [ { $subtract: [ "$$this", "$$value.prev" ] }, 1 ] },
                    // Consecutive value: extend the open run.
                    {
                      prev: "$$this",
                      val: {
                        min : "$$value.val.min",
                        max: "$$value.val.max",
                        count: { $add: [ "$$value.val.count", 1 ] }
                      },
                      vals: "$$value.vals"
                    },
                    // Gap: close the open run at prev and start a new one.
                    {
                      vals: {
                        $concatArrays: [
                          "$$value.vals",
                          [ { min : "$$value.val.min", max: "$$value.prev", count: "$$value.val.count" } ]
                        ]
                      },
                      val: { min: "$$this", max: "$lastVal", count: 1 },
                      prev: "$$this"
                    },
                  ]
                }
              }
            }
          },
          // Closed runs followed by the run that was still open at the end.
          in: {
            $concatArrays: [ "$$result.vals", [ "$$result.val" ] ]
          }
        }
      }
    }
  },
])
使用 $setWindowFields,而不是用 $group 把所有数据收集到一个文档中:
// Group x values into runs with $setWindowFields and emit one document per
// run: { _id: <run number>, count, max, min }.
db.collection.aggregate([
  // For every document, collect into c the x values of the documents whose
  // x lies within [x - 3, x] of the current one.
  {
    $setWindowFields: {
      partitionBy: "",
      sortBy: { x: 1 },
      output: { c: { $push: "$x", window: { range: [ -3, 0 ] } } }
    }
  },

  // Turn c into a flag: 0 when the window held more than one value,
  // otherwise 1 (the document opens a new run).
  { $set: { c: { $cond: [ { $gt: [ { $size: "$c" }, 1 ] }, 0, 1 ] } } },

  // g = running total of the flags up to the current document; it is used
  // as the group key below.
  {
    $setWindowFields: {
      partitionBy: "",
      sortBy: { x: 1 },
      output: { g: { $sum: "$c", window: { documents: [ "unbounded", "current" ] } } }
    }
  },

  // Summarise every run.
  { $group: { _id: "$g", count: { $sum: 1 }, max: { $max: "$x" }, min: { $min: "$x" } } }
])
在 PostgreSQL 中
-- Sample table holding the example values.
CREATE TABLE test (
    id INT,
    x  INT
);

INSERT INTO test VALUES
    (1, 1),
    (2, 3),
    (3, 5),
    (4, 20),
    (5, 22);

-- flagged: inc is 1 on rows that open a new run (no previous row, or a
--          difference of 4 or more to the previous x), NULL otherwise.
-- grouped: grp is the running sum of inc in x order, so all rows of one run
--          share the same grp value.
WITH flagged AS (
    SELECT *,
           CASE WHEN x - LAG(x) OVER (ORDER BY x) < 4 THEN NULL ELSE 1 END AS inc
    FROM test
),
grouped AS (
    SELECT *,
           SUM(inc) OVER (ORDER BY x) AS grp
    FROM flagged
)
SELECT MAX(x) AS max, MIN(x) AS min, COUNT(*) AS count
FROM grouped
GROUP BY grp
任务 1:
我在 MongoDB 的一个 collection 中有一些文档,其值取自若干连续区间,如下所示:
{x:1}
{x:2}
{x:3}
{x:5}
{x:6}
{x:7}
{x:8}
{x:20}
{x:21}
我需要以如下形式提取连续范围的列表(count 不是必需的,但至少需要每个范围的第一个值和最后一个值):
{x:[1,3] , count:3}
{x:[5,8], count:4}
{x:[20,21],count:2}
或
{ min:1 , max:3 , count:3}
{ min:5 , max:8 , count:4}
{ min:20 , max:21 , count:2}
请推荐合适的解决方案。该 collection 约有 1 亿(~100M)个文档,其中一些值是 10 位数,另一些是 15 位数,但它们在各自的范围内都是按顺序递增的。
任务 2:
与任务 1 的思路相同,但基于自定义的序列步长,例如,当序列步长为 3 时:
{y:1}
{y:3}
{y:5}
{y:20}
{y:22}
需要制作:
{y:[1,5] ,count:3}
{y:[20,22] , count:2}
谢谢!
P.S. 我通过按数字位数分组统计分布,对这些范围有了一个大致的了解,但这种结果似乎过于粗略:
// Bucket the documents by the number of characters in the string form of x,
// then report, per bucket, the smallest and largest x (as strings) and the
// number of documents. Buckets are returned in ascending length order.
db.collection.aggregate([
  // range = character length of x rendered as a string.
  { $addFields: { range: { $strLenCP: { $toString: "$x" } } } },

  // One result document per distinct length.
  {
    $group: {
      _id: "$range",
      minValue: { $min: "$x" },
      maxValue: { $max: "$x" },
      Count: { $sum: 1 }
    }
  },

  // x = [ min, max ], both converted to strings.
  { $addFields: { x: [ { $toString: "$minValue" }, { $toString: "$maxValue" } ] } },

  // Expose the group key as "range" and keep only range, x and Count.
  { $project: { range: "$_id", "_id": 0, x: 1, Count: 1 } },

  { $sort: { range: 1 } }
])
使用 $reduce。
如果我没有理解错任务 2,只需将 $cond 条件里 $gt 比较中标注为 sequence step 的 1 更改为您想要的序列步长即可。
// Collapse the sorted x values into runs in which neighbouring values differ
// by at most the sequence step. Returns a single document of the form
//   { ranges: [ { max, min, count }, ... ] }
//
// NOTE: ranges[0] is a placeholder produced by the -999999 seed of the fold
// ({ max: -999999, min: -999999, count: 0 }); drop it when reading the result.
// Assumes every document has a numeric x greater than -999998.
db.collection.aggregate([
  // The fold below walks the values in ascending order.
  { "$sort": { x: 1 } },

  // Collect only the x values. Pushing whole documents ($$ROOT) makes the
  // array far larger than needed and hits the $push memory limit sooner.
  { $group: { _id: null, temp: { $push: "$x" } } },

  {
    "$project": {
      _id: 0,
      "temp_field": {
        "$reduce": {
          "input": "$temp",
          // prev  - previously visited value
          // min   - first value of the run currently being built
          // count - number of values in the current run
          // ranges - runs that have already been closed
          "initialValue": { "prev": -999999, "min": -999999, "count": 0, "ranges": [] },
          "in": {
            "$let": {
              "vars": {
                // True when the distance to the previous value exceeds the
                // step, i.e. the current value opens a new run. Computed once
                // so the step only has to be changed in a single place.
                "gap": {
                  $gt: [
                    { "$subtract": [ "$$this", "$$value.prev" ] },
                    1 // sequence step
                  ]
                }
              },
              "in": {
                "prev": "$$this",
                // Restart the counter on a gap, otherwise extend the run.
                "count": { "$cond": [ "$$gap", 1, { "$add": [ "$$value.count", 1 ] } ] },
                "min": { "$cond": [ "$$gap", "$$this", "$$value.min" ] },
                // On a gap the run that just ended is appended to the result.
                "ranges": {
                  "$concatArrays": [
                    "$$value.ranges",
                    {
                      "$cond": [
                        "$$gap",
                        [ { max: "$$value.prev", min: "$$value.min", count: "$$value.count" } ],
                        []
                      ]
                    }
                  ]
                }
              }
            }
          }
        }
      }
    }
  },

  // The last run is still open when the fold finishes; append it here.
  {
    "$project": {
      ranges: {
        "$concatArrays": [
          "$temp_field.ranges",
          [ { max: "$temp_field.prev", min: "$temp_field.min", count: "$temp_field.count" } ]
        ]
      }
    }
  }
])
最后,从结果数组中移除第一个元素(它是由初始值 -999999 产生的占位范围)。
R2D2 的评论:在实际用例中测试后,即使使用了 allowDiskUse: true,我仍然达到了内存限制:
2022-02-14T09:38:27.575+0100 E QUERY [js] Error: command failed: {
"ok" : 0,
"errmsg" : "$push used too much memory and cannot spill to disk. Memory limit: 104857600 bytes",
"code" : 146,
"codeName" : "ExceededMemoryLimit",
将内存增加到 2GB(允许的最大值):
// Raise the server parameter that caps the memory used by $push (value in bytes).
db.adminCommand({setParameter:1 , internalQueryMaxPushBytes: 2048576000 })
但仍然达到了上限,于是我决定把 collection 拆分成若干较小的部分,最终得到了结果,再次感谢你!
这是另一种查询方式,生成格式为 [ { min: 1 , max: 3 , count: 3 }, ... ] 的结果:
// Fold the sorted x values into runs of consecutive integers (difference of
// exactly 1) and return a single document of the form
//   { output: [ { min, max, count }, ... ] }
db.collection.aggregate([
  // The fold below walks the values in ascending order.
  { $sort: { x: 1 } },

  // Collect all x values plus the smallest and largest one.
  {
    $group: {
      _id: null,
      docs: { $push: "$x" },
      firstVal: { $first: "$x" },
      lastVal: { $last: "$x" }
    }
  },

  {
    $project: {
      _id: 0,
      output: {
        $let: {
          vars: {
            result: {
              $reduce: {
                input: "$docs",
                initialValue: {
                  // Seed prev with firstVal - 1 so the first value extends the seed run.
                  prev: { $add: [ "$firstVal", -1 ] },
                  // val is the run still being built. Whichever run is still
                  // open when the fold ends necessarily ends at lastVal, so
                  // the seed must carry lastVal as well: with max: 0 a
                  // collection made of one unbroken run reported max: 0.
                  val: { min: "$firstVal", max: "$lastVal", count: 0 },
                  // vals holds the runs that have already been closed.
                  vals: [ ]
                },
                in: {
                  $cond: [
                    { $eq: [ { $subtract: [ "$$this", "$$value.prev" ] }, 1 ] },
                    // Consecutive value: extend the open run.
                    {
                      prev: "$$this",
                      val: {
                        min : "$$value.val.min",
                        max: "$$value.val.max",
                        count: { $add: [ "$$value.val.count", 1 ] }
                      },
                      vals: "$$value.vals"
                    },
                    // Gap: close the open run at prev and start a new one.
                    {
                      vals: {
                        $concatArrays: [
                          "$$value.vals",
                          [ { min : "$$value.val.min", max: "$$value.prev", count: "$$value.val.count" } ]
                        ]
                      },
                      val: { min: "$$this", max: "$lastVal", count: 1 },
                      prev: "$$this"
                    },
                  ]
                }
              }
            }
          },
          // Closed runs followed by the run that was still open at the end.
          in: {
            $concatArrays: [ "$$result.vals", [ "$$result.val" ] ]
          }
        }
      }
    }
  },
])
使用 $setWindowFields,而不是用 $group 把所有数据收集到一个文档中:
// Group x values into runs with $setWindowFields and emit one document per
// run: { _id: <run number>, count, max, min }.
db.collection.aggregate([
  // For every document, collect into c the x values of the documents whose
  // x lies within [x - 3, x] of the current one.
  {
    $setWindowFields: {
      partitionBy: "",
      sortBy: { x: 1 },
      output: { c: { $push: "$x", window: { range: [ -3, 0 ] } } }
    }
  },

  // Turn c into a flag: 0 when the window held more than one value,
  // otherwise 1 (the document opens a new run).
  { $set: { c: { $cond: [ { $gt: [ { $size: "$c" }, 1 ] }, 0, 1 ] } } },

  // g = running total of the flags up to the current document; it is used
  // as the group key below.
  {
    $setWindowFields: {
      partitionBy: "",
      sortBy: { x: 1 },
      output: { g: { $sum: "$c", window: { documents: [ "unbounded", "current" ] } } }
    }
  },

  // Summarise every run.
  { $group: { _id: "$g", count: { $sum: 1 }, max: { $max: "$x" }, min: { $min: "$x" } } }
])
在 PostgreSQL 中
-- Sample table holding the example values.
CREATE TABLE test (
    id INT,
    x  INT
);

INSERT INTO test VALUES
    (1, 1),
    (2, 3),
    (3, 5),
    (4, 20),
    (5, 22);

-- flagged: inc is 1 on rows that open a new run (no previous row, or a
--          difference of 4 or more to the previous x), NULL otherwise.
-- grouped: grp is the running sum of inc in x order, so all rows of one run
--          share the same grp value.
WITH flagged AS (
    SELECT *,
           CASE WHEN x - LAG(x) OVER (ORDER BY x) < 4 THEN NULL ELSE 1 END AS inc
    FROM test
),
grouped AS (
    SELECT *,
           SUM(inc) OVER (ORDER BY x) AS grp
    FROM flagged
)
SELECT MAX(x) AS max, MIN(x) AS min, COUNT(*) AS count
FROM grouped
GROUP BY grp