BigQuery:如何通过 window 函数合并 HLL 草图? (在滚动 window 中计算不同的值)
BigQuery: How to merge HLL Sketches over a window function? (Count distinct values over a rolling window)
示例相关 table 模式:
+---------------------------+-------------------+
| activity_date - TIMESTAMP | user_id - STRING |
+---------------------------+-------------------+
| 2017-02-22 17:36:08 UTC | fake_id_i24385787 |
+---------------------------+-------------------+
| 2017-02-22 04:27:08 UTC | fake_id_234885747 |
+---------------------------+-------------------+
| 2017-02-22 08:36:08 UTC | fake_id_i24385787 |
+---------------------------+-------------------+
我需要在一个滚动时间段(90 天)内对大型数据集的不同活动用户进行计数,并且 运行 由于数据集的大小而遇到问题。
起初,我尝试使用 window 函数,类似于此处的答案。
WITH
daily AS (
SELECT
DATE(activity_date) day,
user_id
FROM
`fake-table`)
SELECT
day,
SUM(APPROX_COUNT_DISTINCT(user_id)) OVER (ORDER BY day ROWS BETWEEN 89 PRECEDING AND CURRENT ROW) ninty_day_window_apprx
FROM
daily
GROUP BY
1
ORDER BY
1 DESC
然而,这导致每天获得不同数量的用户,然后将这些用户相加 - 但不同的用户可能会在 window 中重复,如果它们出现多次。因此,这并不是对 90 天内不同用户的真正准确衡量。
接下来我尝试的是使用以下解决方案
- 将每个 window 的所有不同 user_ids 连接到一个数组,然后计算其中的不同
WITH daily AS (
SELECT date(activity_date) day, STRING_AGG(DISTINCT user_id) users
FROM `fake-table`
GROUP BY day
), temp2 AS (
SELECT
day,
STRING_AGG(users) OVER(ORDER BY UNIX_DATE(day) RANGE BETWEEN 89 PRECEDING AND CURRENT ROW) users
FROM daily
)
SELECT day,
(SELECT APPROX_COUNT_DISTINCT(id) FROM UNNEST(SPLIT(users)) AS id) Unique90Days
FROM temp2
order by 1 desc
然而,这很快 运行 内存不足。
接下来是使用 HLL 草图以更小的值表示不同的 ID,这样内存就不是问题了。我以为我的问题已经解决了,但是当 运行 以下内容时我收到了一个错误: 错误只是 "Function MERGE_PARTIAL is not supported." 我也尝试使用 MERGE 并得到了同样的错误。它仅在使用 window 函数时发生。为每天的价值创建草图效果很好。
我通读了 BigQuery Standard SQL 文档,但没有看到任何关于 HLL_COUNT.MERGE_PARTIAL 和 HLL_COUNT.MERGE 以及 window 函数的内容。据推测,这应该将 90 个草图组合成一个 HLL 草图,代表 90 个原始草图之间的不同值?
WITH
daily AS (
SELECT
DATE(activity_date) day,
HLL_COUNT.INIT(user_id) sketch
FROM
`fake-table`
GROUP BY
1
ORDER BY
1 DESC),
rolling AS (
SELECT
day,
HLL_COUNT.MERGE_PARTIAL(sketch) OVER (ORDER BY UNIX_DATE(day) RANGE BETWEEN 89 PRECEDING AND CURRENT ROW) rolling_sketch
FROM daily)
SELECT
day,
HLL_COUNT.EXTRACT(rolling_sketch)
FROM
rolling
ORDER BY
1
"Image of the error - Function MERGE_PARTIAL is not supported"
知道为什么会发生此错误或如何调整吗?
合并 HLL_COUNT.INIT
和 HLL_COUNT.MERGE
。此解决方案使用 GENERATE_ARRAY(1, 90)
而不是 OVER
.
的 90 天交叉连接
#standardSQL
SELECT DATE_SUB(date, INTERVAL i DAY) date_grp
, HLL_COUNT.MERGE(sketch) unique_90_day_users
, HLL_COUNT.MERGE(DISTINCT IF(i<31,sketch,null)) unique_30_day_users
, HLL_COUNT.MERGE(DISTINCT IF(i<8,sketch,null)) unique_7_day_users
FROM (
SELECT DATE(creation_date) date, HLL_COUNT.INIT(owner_user_id) sketch
FROM `bigquery-public-data.Whosebug.posts_questions`
WHERE EXTRACT(YEAR FROM creation_date)=2017
GROUP BY 1
), UNNEST(GENERATE_ARRAY(1, 90)) i
GROUP BY 1
ORDER BY date_grp
以下是 BigQuery Standard SQL 并且使用 window 函数
完全符合您的要求
#standardSQL
SELECT day,
(SELECT HLL_COUNT.MERGE(sketch) FROM UNNEST(rolling_sketch_arr) sketch) rolling_sketch
FROM (
SELECT day,
ARRAY_AGG(ids_sketch) OVER(ORDER BY UNIX_DATE(day) RANGE BETWEEN 89 PRECEDING AND CURRENT ROW) rolling_sketch_arr
FROM (
SELECT day, HLL_COUNT.INIT(id) ids_sketch
FROM `project.dataset.table`
GROUP BY day
)
)
您可以使用 [完全] 虚拟数据来测试和玩上面的示例
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 id, DATE '2019-01-01' day UNION ALL
SELECT 2, '2019-01-01' UNION ALL
SELECT 3, '2019-01-01' UNION ALL
SELECT 1, '2019-01-02' UNION ALL
SELECT 4, '2019-01-02' UNION ALL
SELECT 2, '2019-01-03' UNION ALL
SELECT 3, '2019-01-03' UNION ALL
SELECT 4, '2019-01-03' UNION ALL
SELECT 5, '2019-01-03' UNION ALL
SELECT 1, '2019-01-04' UNION ALL
SELECT 4, '2019-01-04' UNION ALL
SELECT 2, '2019-01-05' UNION ALL
SELECT 3, '2019-01-05' UNION ALL
SELECT 5, '2019-01-05' UNION ALL
SELECT 6, '2019-01-05'
)
SELECT day,
(SELECT HLL_COUNT.MERGE(sketch) FROM UNNEST(rolling_sketch_arr) sketch) rolling_sketch
FROM (
SELECT day,
ARRAY_AGG(ids_sketch) OVER(ORDER BY UNIX_DATE(day) RANGE BETWEEN 2 PRECEDING AND CURRENT ROW) rolling_sketch_arr
FROM (
SELECT day, HLL_COUNT.INIT(id) ids_sketch
FROM `project.dataset.table`
GROUP BY day
)
)
-- ORDER BY day
结果
Row day rolling_sketch
1 2019-01-01 3
2 2019-01-02 4
3 2019-01-03 5
4 2019-01-04 5
5 2019-01-05 6
示例相关 table 模式:
+---------------------------+-------------------+
| activity_date - TIMESTAMP | user_id - STRING |
+---------------------------+-------------------+
| 2017-02-22 17:36:08 UTC | fake_id_i24385787 |
+---------------------------+-------------------+
| 2017-02-22 04:27:08 UTC | fake_id_234885747 |
+---------------------------+-------------------+
| 2017-02-22 08:36:08 UTC | fake_id_i24385787 |
+---------------------------+-------------------+
我需要在一个滚动时间段(90 天)内对大型数据集的不同活动用户进行计数,并且 运行 由于数据集的大小而遇到问题。
起初,我尝试使用 window 函数,类似于此处的答案。
WITH
daily AS (
SELECT
DATE(activity_date) day,
user_id
FROM
`fake-table`)
SELECT
day,
SUM(APPROX_COUNT_DISTINCT(user_id)) OVER (ORDER BY day ROWS BETWEEN 89 PRECEDING AND CURRENT ROW) ninty_day_window_apprx
FROM
daily
GROUP BY
1
ORDER BY
1 DESC
然而,这导致每天获得不同数量的用户,然后将这些用户相加 - 但不同的用户可能会在 window 中重复,如果它们出现多次。因此,这并不是对 90 天内不同用户的真正准确衡量。
接下来我尝试的是使用以下解决方案
WITH daily AS (
SELECT date(activity_date) day, STRING_AGG(DISTINCT user_id) users
FROM `fake-table`
GROUP BY day
), temp2 AS (
SELECT
day,
STRING_AGG(users) OVER(ORDER BY UNIX_DATE(day) RANGE BETWEEN 89 PRECEDING AND CURRENT ROW) users
FROM daily
)
SELECT day,
(SELECT APPROX_COUNT_DISTINCT(id) FROM UNNEST(SPLIT(users)) AS id) Unique90Days
FROM temp2
order by 1 desc
然而,这很快 运行 内存不足。
接下来是使用 HLL 草图以更小的值表示不同的 ID,这样内存就不是问题了。我以为我的问题已经解决了,但是当 运行 以下内容时我收到了一个错误: 错误只是 "Function MERGE_PARTIAL is not supported." 我也尝试使用 MERGE 并得到了同样的错误。它仅在使用 window 函数时发生。为每天的价值创建草图效果很好。
我通读了 BigQuery Standard SQL 文档,但没有看到任何关于 HLL_COUNT.MERGE_PARTIAL 和 HLL_COUNT.MERGE 以及 window 函数的内容。据推测,这应该将 90 个草图组合成一个 HLL 草图,代表 90 个原始草图之间的不同值?
WITH
daily AS (
SELECT
DATE(activity_date) day,
HLL_COUNT.INIT(user_id) sketch
FROM
`fake-table`
GROUP BY
1
ORDER BY
1 DESC),
rolling AS (
SELECT
day,
HLL_COUNT.MERGE_PARTIAL(sketch) OVER (ORDER BY UNIX_DATE(day) RANGE BETWEEN 89 PRECEDING AND CURRENT ROW) rolling_sketch
FROM daily)
SELECT
day,
HLL_COUNT.EXTRACT(rolling_sketch)
FROM
rolling
ORDER BY
1
"Image of the error - Function MERGE_PARTIAL is not supported"
知道为什么会发生此错误或如何调整吗?
合并 HLL_COUNT.INIT
和 HLL_COUNT.MERGE
。此解决方案使用 GENERATE_ARRAY(1, 90)
而不是 OVER
.
#standardSQL
SELECT DATE_SUB(date, INTERVAL i DAY) date_grp
, HLL_COUNT.MERGE(sketch) unique_90_day_users
, HLL_COUNT.MERGE(DISTINCT IF(i<31,sketch,null)) unique_30_day_users
, HLL_COUNT.MERGE(DISTINCT IF(i<8,sketch,null)) unique_7_day_users
FROM (
SELECT DATE(creation_date) date, HLL_COUNT.INIT(owner_user_id) sketch
FROM `bigquery-public-data.Whosebug.posts_questions`
WHERE EXTRACT(YEAR FROM creation_date)=2017
GROUP BY 1
), UNNEST(GENERATE_ARRAY(1, 90)) i
GROUP BY 1
ORDER BY date_grp
以下是 BigQuery Standard SQL 并且使用 window 函数
完全符合您的要求#standardSQL
SELECT day,
(SELECT HLL_COUNT.MERGE(sketch) FROM UNNEST(rolling_sketch_arr) sketch) rolling_sketch
FROM (
SELECT day,
ARRAY_AGG(ids_sketch) OVER(ORDER BY UNIX_DATE(day) RANGE BETWEEN 89 PRECEDING AND CURRENT ROW) rolling_sketch_arr
FROM (
SELECT day, HLL_COUNT.INIT(id) ids_sketch
FROM `project.dataset.table`
GROUP BY day
)
)
您可以使用 [完全] 虚拟数据来测试和玩上面的示例
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 id, DATE '2019-01-01' day UNION ALL
SELECT 2, '2019-01-01' UNION ALL
SELECT 3, '2019-01-01' UNION ALL
SELECT 1, '2019-01-02' UNION ALL
SELECT 4, '2019-01-02' UNION ALL
SELECT 2, '2019-01-03' UNION ALL
SELECT 3, '2019-01-03' UNION ALL
SELECT 4, '2019-01-03' UNION ALL
SELECT 5, '2019-01-03' UNION ALL
SELECT 1, '2019-01-04' UNION ALL
SELECT 4, '2019-01-04' UNION ALL
SELECT 2, '2019-01-05' UNION ALL
SELECT 3, '2019-01-05' UNION ALL
SELECT 5, '2019-01-05' UNION ALL
SELECT 6, '2019-01-05'
)
SELECT day,
(SELECT HLL_COUNT.MERGE(sketch) FROM UNNEST(rolling_sketch_arr) sketch) rolling_sketch
FROM (
SELECT day,
ARRAY_AGG(ids_sketch) OVER(ORDER BY UNIX_DATE(day) RANGE BETWEEN 2 PRECEDING AND CURRENT ROW) rolling_sketch_arr
FROM (
SELECT day, HLL_COUNT.INIT(id) ids_sketch
FROM `project.dataset.table`
GROUP BY day
)
)
-- ORDER BY day
结果
Row day rolling_sketch
1 2019-01-01 3
2 2019-01-02 4
3 2019-01-03 5
4 2019-01-04 5
5 2019-01-05 6