在 bigquery 中使用 RANGE_BUCKET 时如何显示存储桶名称
How to show the bucket name when using RANGE_BUCKET in bigquery
这是我在 BigQuery 中对 public 数据集的查询:
-- Count users with reputation above 200000 per RANGE_BUCKET index.
-- RANGE_BUCKET returns 0 for a value below the first boundary and i when
-- boundaries[i-1] <= value < boundaries[i].
-- The dataset is `stackoverflow`; the previous `Whosebug` name does not resolve.
SELECT
    RANGE_BUCKET(
        reputation,
        [400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000]
    ) AS reputation_group,
    COUNT(*) AS count
FROM `bigquery-public-data.stackoverflow.users`
WHERE reputation > 200000
GROUP BY reputation_group
ORDER BY reputation_group
结果如下:
如何显示存储桶的范围,而不是将 reputation_group 显示为整数:
0: [0-400000]
1: [400001-500000]
2: [500001-600000]
....
非常感谢。
更新:
非常感谢 Mikhail 的回答,并做了以下小改动:
-- Label each RANGE_BUCKET index with a "low - high" string and count users per bucket.
-- The boundary array lives in a one-row STRUCT so it is written once and shared
-- by RANGE_BUCKET and the label expression.
-- The leading 200000 boundary mirrors the WHERE filter, so the first populated
-- bucket is labelled "200001 - 400000" rather than "0 - 400000".
-- NOTE(review): RANGE_BUCKET assigns a value equal to a boundary to the upper
-- bucket (ranges[i-1] <= value < ranges[i]), so the "+ 1" label is off by one
-- for exact boundary values; values >= the last boundary get a NULL label
-- because ranges[SAFE_OFFSET(bucket)] is NULL there.
SELECT
    bucket,
    FORMAT(
        '%i - %i',
        IFNULL(ranges[SAFE_OFFSET(bucket - 1)] + 1, 0),
        ranges[SAFE_OFFSET(bucket)]
    ) AS reputation_group,
    COUNT(*) AS COUNT
FROM `bigquery-public-data.stackoverflow.users`
CROSS JOIN UNNEST([STRUCT([200000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000] AS ranges)])
CROSS JOIN UNNEST([RANGE_BUCKET(reputation, ranges)]) AS bucket
WHERE reputation > 200000
GROUP BY bucket, reputation_group
ORDER BY bucket
注意 STRUCT 的数组开头额外添加了一项 200000,这使得结果显示
200001 - 400000
而不是 0 - 400000
也可以使用 JOIN 并做一些重构:
-- Bucket boundaries, declared once so RANGE_BUCKET and the lookup below
-- always use the same list.
WITH range_array AS (
    SELECT [400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000] AS boundaries
),

-- One row per boundary: bucket_end is the boundary, bucket_start is the
-- previous boundary (NULL for the first), and reputation_group is the array
-- offset, which lines up with the index RANGE_BUCKET returns.
buckets AS (
    SELECT
        LAG(bucket_end) OVER (ORDER BY reputation_group) AS bucket_start,
        bucket_end,
        reputation_group
    FROM UNNEST((SELECT boundaries FROM range_array)) AS bucket_end WITH OFFSET AS reputation_group
),

-- User count per RANGE_BUCKET index (no ORDER BY here; the final query sorts).
grouped AS (
    SELECT
        RANGE_BUCKET(reputation, (SELECT boundaries FROM range_array)) AS reputation_group,
        COUNT(*) AS count
    FROM `bigquery-public-data.stackoverflow.users`
    WHERE reputation > 200000
    GROUP BY reputation_group
)

-- Attach the matching boundary row as a STRUCT; a group with no matching
-- boundary row (values >= the last boundary) gets a NULL bucket.
SELECT
    a.reputation_group,
    a.count,
    (
        SELECT AS STRUCT b.bucket_start, b.bucket_end, b.reputation_group
        FROM buckets AS b
        WHERE b.reputation_group = a.reputation_group
    ) AS bucket
FROM grouped AS a
ORDER BY a.reputation_group
但如果要这样做,干脆不用 RANGE_BUCKET 会更简单:
-- Bucket users without RANGE_BUCKET: derive (min, max) pairs from the boundary
-- list, pair every filtered user with every bucket, and count the matches.
-- Because of the cross join every bucket appears in the output, including
-- buckets whose count is 0; users at or above the last boundary match no bucket.
WITH buckets AS (
    SELECT
        -- The first bucket has no previous boundary; -10000000 acts as its lower bound.
        IFNULL(LAG(max) OVER (ORDER BY grp), -10000000) AS min,
        max,
        grp
    FROM UNNEST([400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000]) AS max WITH OFFSET AS grp
)

SELECT
    buckets.min,
    buckets.max,
    COUNTIF(reputation >= buckets.min AND reputation < buckets.max) AS c
FROM `bigquery-public-data.stackoverflow.users`
CROSS JOIN buckets
WHERE reputation > 200000
GROUP BY buckets.min, buckets.max
ORDER BY buckets.min
或者:
-- For each user find the nearest boundary at or below the reputation (min) and
-- the nearest boundary above it (max), then count users per (min, max) pair.
-- The lower bound uses <= so that a reputation exactly equal to a boundary
-- lands in [boundary, next boundary); with a strict < on both sides such a
-- value would be reported in a phantom bucket spanning two ranges.
SELECT
    IFNULL(min, 0) AS min,  -- no boundary at or below the value: report 0
    max,
    COUNT(*) AS c
FROM (
    SELECT
        (SELECT MAX(x) FROM UNNEST(ranges) AS x WHERE x <= reputation) AS min,
        (SELECT MIN(x) FROM UNNEST(ranges) AS x WHERE x > reputation) AS max
    FROM `bigquery-public-data.stackoverflow.users`
    CROSS JOIN (
        SELECT [400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000] AS ranges
    ) AS r
    WHERE reputation > 200000
)
-- Positional references are kept on purpose: the output alias `min` shadows
-- the inner column of the same name.
GROUP BY 1, 2
ORDER BY 1
以下适用于 BigQuery 标准 SQL
#standardSQL
-- Label each RANGE_BUCKET index with a "low - high" string and count users per bucket.
-- The boundary array lives in a one-row STRUCT so it is written once and shared
-- by RANGE_BUCKET and the label expression.
-- Bucket 0 has no previous boundary, so its lower label falls back to 0.
-- NOTE(review): RANGE_BUCKET assigns a value equal to a boundary to the upper
-- bucket (ranges[i-1] <= value < ranges[i]), so the "+ 1" label is off by one
-- for exact boundary values; values >= the last boundary get a NULL label
-- because ranges[SAFE_OFFSET(bucket)] is NULL there.
SELECT
    bucket,
    FORMAT(
        '%i - %i',
        IFNULL(ranges[SAFE_OFFSET(bucket - 1)] + 1, 0),
        ranges[SAFE_OFFSET(bucket)]
    ) AS reputation_group,
    COUNT(*) AS COUNT
FROM `bigquery-public-data.stackoverflow.users`
CROSS JOIN UNNEST([STRUCT([400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000] AS ranges)])
CROSS JOIN UNNEST([RANGE_BUCKET(reputation, ranges)]) AS bucket
WHERE reputation > 200000
GROUP BY bucket, reputation_group
ORDER BY bucket
结果
Row bucket reputation_group COUNT
1 0 0 - 400000 198
2 1 400001 - 500000 23
3 2 500001 - 600000 13
4 3 600001 - 700000 12
5 4 700001 - 800000 4
6 5 800001 - 900000 5
7 6 900001 - 1000000 2
8 8 1100001 - 1200000 1
这是我在 BigQuery 中对 public 数据集的查询:
-- Count users with reputation above 200000 per RANGE_BUCKET index.
-- RANGE_BUCKET returns 0 for a value below the first boundary and i when
-- boundaries[i-1] <= value < boundaries[i].
-- The dataset is `stackoverflow`; the previous `Whosebug` name does not resolve.
SELECT
    RANGE_BUCKET(
        reputation,
        [400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000]
    ) AS reputation_group,
    COUNT(*) AS count
FROM `bigquery-public-data.stackoverflow.users`
WHERE reputation > 200000
GROUP BY reputation_group
ORDER BY reputation_group
结果如下:
如何显示存储桶的范围,而不是将 reputation_group 显示为整数:
0: [0-400000]
1: [400001-500000]
2: [500001-600000]
....
非常感谢。
更新: 非常感谢 Mikhail 的回答,并做了以下小改动:
-- Label each RANGE_BUCKET index with a "low - high" string and count users per bucket.
-- The boundary array lives in a one-row STRUCT so it is written once and shared
-- by RANGE_BUCKET and the label expression.
-- The leading 200000 boundary mirrors the WHERE filter, so the first populated
-- bucket is labelled "200001 - 400000" rather than "0 - 400000".
-- NOTE(review): RANGE_BUCKET assigns a value equal to a boundary to the upper
-- bucket (ranges[i-1] <= value < ranges[i]), so the "+ 1" label is off by one
-- for exact boundary values; values >= the last boundary get a NULL label
-- because ranges[SAFE_OFFSET(bucket)] is NULL there.
SELECT
    bucket,
    FORMAT(
        '%i - %i',
        IFNULL(ranges[SAFE_OFFSET(bucket - 1)] + 1, 0),
        ranges[SAFE_OFFSET(bucket)]
    ) AS reputation_group,
    COUNT(*) AS COUNT
FROM `bigquery-public-data.stackoverflow.users`
CROSS JOIN UNNEST([STRUCT([200000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000] AS ranges)])
CROSS JOIN UNNEST([RANGE_BUCKET(reputation, ranges)]) AS bucket
WHERE reputation > 200000
GROUP BY bucket, reputation_group
ORDER BY bucket
注意 STRUCT 的数组开头额外添加了一项 200000,这使得结果显示
200001 - 400000
而不是 0 - 400000
也可以使用 JOIN 并做一些重构:
-- Bucket boundaries, declared once so RANGE_BUCKET and the lookup below
-- always use the same list.
WITH range_array AS (
    SELECT [400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000] AS boundaries
),

-- One row per boundary: bucket_end is the boundary, bucket_start is the
-- previous boundary (NULL for the first), and reputation_group is the array
-- offset, which lines up with the index RANGE_BUCKET returns.
buckets AS (
    SELECT
        LAG(bucket_end) OVER (ORDER BY reputation_group) AS bucket_start,
        bucket_end,
        reputation_group
    FROM UNNEST((SELECT boundaries FROM range_array)) AS bucket_end WITH OFFSET AS reputation_group
),

-- User count per RANGE_BUCKET index (no ORDER BY here; the final query sorts).
grouped AS (
    SELECT
        RANGE_BUCKET(reputation, (SELECT boundaries FROM range_array)) AS reputation_group,
        COUNT(*) AS count
    FROM `bigquery-public-data.stackoverflow.users`
    WHERE reputation > 200000
    GROUP BY reputation_group
)

-- Attach the matching boundary row as a STRUCT; a group with no matching
-- boundary row (values >= the last boundary) gets a NULL bucket.
SELECT
    a.reputation_group,
    a.count,
    (
        SELECT AS STRUCT b.bucket_start, b.bucket_end, b.reputation_group
        FROM buckets AS b
        WHERE b.reputation_group = a.reputation_group
    ) AS bucket
FROM grouped AS a
ORDER BY a.reputation_group
但如果要这样做,干脆不用 RANGE_BUCKET 会更简单:
-- Bucket users without RANGE_BUCKET: derive (min, max) pairs from the boundary
-- list, pair every filtered user with every bucket, and count the matches.
-- Because of the cross join every bucket appears in the output, including
-- buckets whose count is 0; users at or above the last boundary match no bucket.
WITH buckets AS (
    SELECT
        -- The first bucket has no previous boundary; -10000000 acts as its lower bound.
        IFNULL(LAG(max) OVER (ORDER BY grp), -10000000) AS min,
        max,
        grp
    FROM UNNEST([400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000]) AS max WITH OFFSET AS grp
)

SELECT
    buckets.min,
    buckets.max,
    COUNTIF(reputation >= buckets.min AND reputation < buckets.max) AS c
FROM `bigquery-public-data.stackoverflow.users`
CROSS JOIN buckets
WHERE reputation > 200000
GROUP BY buckets.min, buckets.max
ORDER BY buckets.min
或者:
-- For each user find the nearest boundary at or below the reputation (min) and
-- the nearest boundary above it (max), then count users per (min, max) pair.
-- The lower bound uses <= so that a reputation exactly equal to a boundary
-- lands in [boundary, next boundary); with a strict < on both sides such a
-- value would be reported in a phantom bucket spanning two ranges.
SELECT
    IFNULL(min, 0) AS min,  -- no boundary at or below the value: report 0
    max,
    COUNT(*) AS c
FROM (
    SELECT
        (SELECT MAX(x) FROM UNNEST(ranges) AS x WHERE x <= reputation) AS min,
        (SELECT MIN(x) FROM UNNEST(ranges) AS x WHERE x > reputation) AS max
    FROM `bigquery-public-data.stackoverflow.users`
    CROSS JOIN (
        SELECT [400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000] AS ranges
    ) AS r
    WHERE reputation > 200000
)
-- Positional references are kept on purpose: the output alias `min` shadows
-- the inner column of the same name.
GROUP BY 1, 2
ORDER BY 1
以下适用于 BigQuery 标准 SQL
#standardSQL
-- Label each RANGE_BUCKET index with a "low - high" string and count users per bucket.
-- The boundary array lives in a one-row STRUCT so it is written once and shared
-- by RANGE_BUCKET and the label expression.
-- Bucket 0 has no previous boundary, so its lower label falls back to 0.
-- NOTE(review): RANGE_BUCKET assigns a value equal to a boundary to the upper
-- bucket (ranges[i-1] <= value < ranges[i]), so the "+ 1" label is off by one
-- for exact boundary values; values >= the last boundary get a NULL label
-- because ranges[SAFE_OFFSET(bucket)] is NULL there.
SELECT
    bucket,
    FORMAT(
        '%i - %i',
        IFNULL(ranges[SAFE_OFFSET(bucket - 1)] + 1, 0),
        ranges[SAFE_OFFSET(bucket)]
    ) AS reputation_group,
    COUNT(*) AS COUNT
FROM `bigquery-public-data.stackoverflow.users`
CROSS JOIN UNNEST([STRUCT([400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000] AS ranges)])
CROSS JOIN UNNEST([RANGE_BUCKET(reputation, ranges)]) AS bucket
WHERE reputation > 200000
GROUP BY bucket, reputation_group
ORDER BY bucket
结果
Row bucket reputation_group COUNT
1 0 0 - 400000 198
2 1 400001 - 500000 23
3 2 500001 - 600000 13
4 3 600001 - 700000 12
5 4 700001 - 800000 4
6 5 800001 - 900000 5
7 6 900001 - 1000000 2
8 8 1100001 - 1200000 1