Big Query SQL - 对所有可能的聚合级别进行分组
Big Query SQL - group over all possible levels of aggregation
假设我有一个包含 value, dim1, ..., dim10 列的 table,我想获得 dim1、...、dim10 列的所有可能分组的中位数。
即在所有子组上计算:按全部 10 个维度分组的子组、按任意 9 个维度分组的子组、按任意 8 个维度分组的子组,依此类推。
我可以
-- Brute-force approach: one sub-query per subset of dimensions, stitched
-- together with UNION ALL. Dimensions that are aggregated away are emitted
-- as NULL so every branch has the same column list.
-- APPROX_QUANTILES(value, 100) returns 101 boundaries (min, 1%, ..., 99%, max),
-- so the median is the zero-based element 50, i.e. SAFE_OFFSET(50).
SELECT * FROM
(
  SELECT
    APPROX_QUANTILES(value, 100)[SAFE_OFFSET(50)] AS median,
    dim1, ..., dim10
  FROM table
  GROUP BY dim1, ..., dim10
)
UNION ALL
(
  SELECT
    APPROX_QUANTILES(value, 100)[SAFE_OFFSET(50)] AS median,
    dim1, ..., dim9,
    NULL AS dim10
  FROM table
  GROUP BY dim1, ..., dim9
)
UNION ALL
... --2^10 subtables
但这很长,特别是如果 table
是即时计算的。有没有更好的方法?
我使用的是 BigQuery,但答案不一定要局限于 BigQuery
我相信你想要的是 GROUP BY ROLLUP:
-- Median of value per ROLLUP grouping set.
-- NOTE: ROLLUP(dim1, ..., dim10) only produces the prefix grouping sets
-- (dim1..dim10), (dim1..dim9), ..., (dim1), (); it does not cover every
-- subset of the dimensions. Use CUBE where all subsets are required.
-- APPROX_QUANTILES(value, 100) returns 101 boundaries, so the median is the
-- zero-based element 50, i.e. SAFE_OFFSET(50).
SELECT
  APPROX_QUANTILES(value, 100)[SAFE_OFFSET(50)] AS median,
  dim1, ..., dim10
FROM table
GROUP BY ROLLUP(dim1, ..., dim10);
这在标准 SQL 和旧版 SQL 中都可用(这可能解释了为什么没有采用更常见的 GROUPING SETS 语法)。
这是我的临时解决方案,似乎可以做 CUBE 会做的事情:
-- Builds one label per subset of the input positions (2^n labels for n inputs).
-- In each label a kept position shows its value (falsy values such as NULL or
-- the empty string are rendered as 'null') and a dropped position shows 'any';
-- positions are joined with ':::'.
CREATE TEMPORARY FUNCTION makeGroups(group_names ARRAY<STRING>)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
  var total = 1 << group_names.length;
  var labels = [];
  for (var mask = 0; mask < total; mask++) {
    var parts = [];
    for (var idx = 0; idx < group_names.length; idx++) {
      // Bit idx of the mask decides whether position idx is kept.
      var keep = ((mask >> idx) & 1) === 1;
      parts.push(keep ? (group_names[idx] || 'null') : 'any');
    }
    labels.push(parts.join(':::'));
  }
  return labels;
""";
-- Attach to every row the list of group labels it belongs to (one label per
-- subset of dimensions, produced by makeGroups), then fan the rows out per
-- label and aggregate.
WITH data AS (
  SELECT
    value,
    makeGroups([dim1, ..., dim10]) AS group_names
  FROM table
)
SELECT
  COUNT(*) AS counts,
  -- APPROX_QUANTILES(value, 100) returns 101 boundaries; zero-based element 50
  -- is the median.
  APPROX_QUANTILES(value, 100)[SAFE_OFFSET(50)] AS median,
  group_name
FROM data
CROSS JOIN UNNEST(data.group_names) AS group_name
GROUP BY group_name
我可以将形如 dim1_value:::...:::dim10_value 的组名称映射回各列,但我比较喜欢这种写法能够区分原始值中的 NULL(此处为 'null')和被聚合掉的维度(此处为 'any')。
在 BigQuery 支持 GROUP BY CUBE 之前,我发现这是为每一种分组组合获取所需聚合的最紧凑、最易读的方法:
-- Emulates GROUP BY CUBE: each dimension is cross-joined with a two-element
-- array holding its own value and the sentinel 'ALL', so every row is emitted
-- once per subset of dimensions and the plain GROUP BY then covers them all.
-- NOTE(review): the sentinel is indistinguishable from a real dimension value
-- 'ALL', and the array literal presumably requires STRING dimensions - confirm.
-- APPROX_QUANTILES(value, 100) returns 101 boundaries, so the median is the
-- zero-based element 50, i.e. SAFE_OFFSET(50).
SELECT
  APPROX_QUANTILES(value, 100)[SAFE_OFFSET(50)] AS median,
  dim1, ..., dim10
FROM table
CROSS JOIN UNNEST([dim1, 'ALL']) AS dim1
...
CROSS JOIN UNNEST([dim10, 'ALL']) AS dim10
GROUP BY dim1, ..., dim10
假设我有一个包含 value, dim1, ..., dim10 列的 table,我想获得 dim1、...、dim10 列的所有可能分组的中位数。
即在所有子组上计算:按全部 10 个维度分组的子组、按任意 9 个维度分组的子组、按任意 8 个维度分组的子组,依此类推。
我可以
-- Brute-force approach: one sub-query per subset of dimensions, stitched
-- together with UNION ALL. Dimensions that are aggregated away are emitted
-- as NULL so every branch has the same column list.
-- APPROX_QUANTILES(value, 100) returns 101 boundaries (min, 1%, ..., 99%, max),
-- so the median is the zero-based element 50, i.e. SAFE_OFFSET(50).
SELECT * FROM
(
  SELECT
    APPROX_QUANTILES(value, 100)[SAFE_OFFSET(50)] AS median,
    dim1, ..., dim10
  FROM table
  GROUP BY dim1, ..., dim10
)
UNION ALL
(
  SELECT
    APPROX_QUANTILES(value, 100)[SAFE_OFFSET(50)] AS median,
    dim1, ..., dim9,
    NULL AS dim10
  FROM table
  GROUP BY dim1, ..., dim9
)
UNION ALL
... --2^10 subtables
但这很长,特别是如果 table
是即时计算的。有没有更好的方法?
我使用的是 BigQuery,但答案不一定要局限于 BigQuery
我相信你想要的是 GROUP BY ROLLUP:
-- Median of value per ROLLUP grouping set.
-- NOTE: ROLLUP(dim1, ..., dim10) only produces the prefix grouping sets
-- (dim1..dim10), (dim1..dim9), ..., (dim1), (); it does not cover every
-- subset of the dimensions. Use CUBE where all subsets are required.
-- APPROX_QUANTILES(value, 100) returns 101 boundaries, so the median is the
-- zero-based element 50, i.e. SAFE_OFFSET(50).
SELECT
  APPROX_QUANTILES(value, 100)[SAFE_OFFSET(50)] AS median,
  dim1, ..., dim10
FROM table
GROUP BY ROLLUP(dim1, ..., dim10);
这在标准 SQL 和旧版 SQL 中都可用(这可能解释了为什么没有采用更常见的 GROUPING SETS 语法)。
这是我的临时解决方案,似乎可以做 CUBE 会做的事情:
-- Builds one label per subset of the input positions (2^n labels for n inputs).
-- In each label a kept position shows its value (falsy values such as NULL or
-- the empty string are rendered as 'null') and a dropped position shows 'any';
-- positions are joined with ':::'.
CREATE TEMPORARY FUNCTION makeGroups(group_names ARRAY<STRING>)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
  var total = 1 << group_names.length;
  var labels = [];
  for (var mask = 0; mask < total; mask++) {
    var parts = [];
    for (var idx = 0; idx < group_names.length; idx++) {
      // Bit idx of the mask decides whether position idx is kept.
      var keep = ((mask >> idx) & 1) === 1;
      parts.push(keep ? (group_names[idx] || 'null') : 'any');
    }
    labels.push(parts.join(':::'));
  }
  return labels;
""";
-- Attach to every row the list of group labels it belongs to (one label per
-- subset of dimensions, produced by makeGroups), then fan the rows out per
-- label and aggregate.
WITH data AS (
  SELECT
    value,
    makeGroups([dim1, ..., dim10]) AS group_names
  FROM table
)
SELECT
  COUNT(*) AS counts,
  -- APPROX_QUANTILES(value, 100) returns 101 boundaries; zero-based element 50
  -- is the median.
  APPROX_QUANTILES(value, 100)[SAFE_OFFSET(50)] AS median,
  group_name
FROM data
CROSS JOIN UNNEST(data.group_names) AS group_name
GROUP BY group_name
我可以将形如 dim1_value:::...:::dim10_value 的组名称映射回各列,但我比较喜欢这种写法能够区分原始值中的 NULL(此处为 'null')和被聚合掉的维度(此处为 'any')。
在 BigQuery 支持 GROUP BY CUBE 之前,我发现这是为每一种分组组合获取所需聚合的最紧凑、最易读的方法:
-- Emulates GROUP BY CUBE: each dimension is cross-joined with a two-element
-- array holding its own value and the sentinel 'ALL', so every row is emitted
-- once per subset of dimensions and the plain GROUP BY then covers them all.
-- NOTE(review): the sentinel is indistinguishable from a real dimension value
-- 'ALL', and the array literal presumably requires STRING dimensions - confirm.
-- APPROX_QUANTILES(value, 100) returns 101 boundaries, so the median is the
-- zero-based element 50, i.e. SAFE_OFFSET(50).
SELECT
  APPROX_QUANTILES(value, 100)[SAFE_OFFSET(50)] AS median,
  dim1, ..., dim10
FROM table
CROSS JOIN UNNEST([dim1, 'ALL']) AS dim1
...
CROSS JOIN UNNEST([dim10, 'ALL']) AS dim10
GROUP BY dim1, ..., dim10