按月(逐月)对与该月重叠的行进行分组 — BigQuery
Group rows which overlap with month (month on month) Big Query
我有以下格式的数据
id current_period_start current_period_end
sub_EoJifk08hBL5Tz 2019-07-02 07:30:28 UTC 2019-08-02 07:30:28 UTC
sub_EoJifk08hBL5Tz 2019-07-02 07:30:28 UTC 2019-08-02 07:30:28 UTC
sub_EoJifk08hBL5Tz 2019-07-02 07:30:28 UTC 2019-08-02 07:30:28 UTC
sub_EoJifk08hBL5Tz 2019-07-02 07:30:28 UTC 2019-08-02 07:30:28 UTC
sub_EoJifk08hBL5Tz 2019-07-02 07:30:28 UTC 2019-08-02 07:30:28 UTC
...
- 请注意,每个唯一的 id 都有重复的行
我想统计每个月中满足以下条件的唯一(UNIQUE)id 的数量:
WHERE current_period_end > start_month and current_period_start < end_month
其中 start_month 和 end_month 是每个月的起止时间,从当前一直回溯到最早的 current_period_start。
所需的输出将是这样
month count
2018-04-27 13:04:19.157149 UTC 20
2018-05-27 13:04:19.157149 UTC 33
2018-06-26 13:04:19.157149 UTC 61
2018-07-26 13:04:19.157149 UTC 93
2018-08-25 13:04:19.157149 UTC 128
当前尝试
- 生成最近 50 个"月"的时间戳(每个"月"按 30 天计)。
- 匹配符合条件的 ID。
- 统计 ID 的数量。
虽然这似乎确实生成了所需的输出,但以这种方式生成时间戳感觉不太好。
-- Count the ids whose billing period overlaps each trailing 30-day window.
-- Step 1: build 50 window-end timestamps, 30 days apart, counting back from now.
WITH window_ends AS (
    SELECT TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 30 * step DAY) AS t
    FROM UNNEST(GENERATE_ARRAY(1, 50)) AS step
),
-- Step 2: one row per (window end, id) pair; duplicate source rows collapse here.
ids_per_window AS (
    SELECT DISTINCT
        w.t,
        d.id
    FROM window_ends AS w
    CROSS JOIN my_data AS d
    -- The window is the 30 days (720 hours) that end at w.t.
    WHERE d.current_period_end > TIMESTAMP_SUB(w.t, INTERVAL 30 DAY)
        AND d.current_period_start < w.t
)
-- Step 3: the number of remaining pairs per window is the id count.
SELECT
    t AS month,
    COUNT(*) AS count
FROM ids_per_window
GROUP BY t
ORDER BY t DESC;
您可以使用 generate_timestamp_array(),并对查询做整体简化:
-- Count distinct ids whose billing period overlaps each 30-day window.
-- Each ts is a window end; the window covers the 30 days before ts.
-- The LEFT JOIN keeps windows with no matching rows, which are reported as 0.
select
    ts as month,
    count(distinct d.id) as paying_customers
from unnest(
    generate_timestamp_array(
        -- Earliest window end; raise the multiplier to look further back.
        timestamp_sub(current_timestamp(), interval 30 * 2 day),
        current_timestamp(),
        interval 30 day
    )
) as ts
left join my_data as d
    -- Overlap test: the period ends after the window starts and starts before it ends.
    on d.current_period_end > timestamp_sub(ts, interval 30 day)
    and d.current_period_start < ts
group by ts
order by ts;
使用 BigQuery 中的 date/timestamp 函数,您可以使用更精确的度量,例如日历月。
这看起来像是事件与时间窗口重叠的问题。试试这个方法(使用 BigQuery 的 GENERATE_DATE_ARRAY 函数):
-- Count distinct ids whose billing period overlaps each calendar month.
-- Months run from the month of the earliest current_period_start to the
-- month of the latest current_period_end in the table.
with bounds as (
    select
        date_trunc(min(date(current_period_start)), month) as first_month,
        date_trunc(max(date(current_period_end)), month) as last_month
    from `dataset.table`
),

-- One row per calendar month as a half-open interval [start_month, end_month).
intervals as (
    select
        month as start_month,
        date_add(month, interval 1 month) as end_month
    from bounds
    cross join unnest(
        generate_date_array(first_month, last_month, interval 1 month)
    ) as month
)

select
    i.start_month,
    i.end_month,
    count(distinct d.id) as count
from intervals as i
left join `dataset.table` as d
    -- The overlap test lives in ON so months without matches survive with count 0.
    -- Month bounds are DATEs; they are cast because the sample data shows
    -- TIMESTAMP values, and BigQuery does not compare TIMESTAMP with DATE implicitly.
    on d.current_period_end > timestamp(i.start_month)
    and d.current_period_start < timestamp(i.end_month)
group by i.start_month, i.end_month
order by i.start_month, i.end_month
我有以下格式的数据
id current_period_start current_period_end
sub_EoJifk08hBL5Tz 2019-07-02 07:30:28 UTC 2019-08-02 07:30:28 UTC
sub_EoJifk08hBL5Tz 2019-07-02 07:30:28 UTC 2019-08-02 07:30:28 UTC
sub_EoJifk08hBL5Tz 2019-07-02 07:30:28 UTC 2019-08-02 07:30:28 UTC
sub_EoJifk08hBL5Tz 2019-07-02 07:30:28 UTC 2019-08-02 07:30:28 UTC
sub_EoJifk08hBL5Tz 2019-07-02 07:30:28 UTC 2019-08-02 07:30:28 UTC
...
- 请注意,每个唯一的 id 都有重复的行
我想统计每个月中满足以下条件的唯一(UNIQUE)id 的数量:
WHERE current_period_end > start_month and current_period_start < end_month
其中 start_month 和 end_month 是每个月的起止时间,从当前一直回溯到最早的 current_period_start。
所需的输出将是这样
month count
2018-04-27 13:04:19.157149 UTC 20
2018-05-27 13:04:19.157149 UTC 33
2018-06-26 13:04:19.157149 UTC 61
2018-07-26 13:04:19.157149 UTC 93
2018-08-25 13:04:19.157149 UTC 128
当前尝试
- 生成最近 50 个"月"的时间戳(每个"月"按 30 天计)。
- 匹配符合条件的 ID。
- 统计 ID 的数量。
虽然这似乎确实生成了所需的输出,但以这种方式生成时间戳感觉不太好。
-- Count the ids whose billing period overlaps each trailing 30-day window.
-- Step 1: build 50 window-end timestamps, 30 days apart, counting back from now.
WITH window_ends AS (
    SELECT TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 30 * step DAY) AS t
    FROM UNNEST(GENERATE_ARRAY(1, 50)) AS step
),
-- Step 2: one row per (window end, id) pair; duplicate source rows collapse here.
ids_per_window AS (
    SELECT DISTINCT
        w.t,
        d.id
    FROM window_ends AS w
    CROSS JOIN my_data AS d
    -- The window is the 30 days (720 hours) that end at w.t.
    WHERE d.current_period_end > TIMESTAMP_SUB(w.t, INTERVAL 30 DAY)
        AND d.current_period_start < w.t
)
-- Step 3: the number of remaining pairs per window is the id count.
SELECT
    t AS month,
    COUNT(*) AS count
FROM ids_per_window
GROUP BY t
ORDER BY t DESC;
您可以使用 generate_timestamp_array(),并对查询做整体简化:
-- Count distinct ids whose billing period overlaps each 30-day window.
-- Each ts is a window end; the window covers the 30 days before ts.
-- The LEFT JOIN keeps windows with no matching rows, which are reported as 0.
select
    ts as month,
    count(distinct d.id) as paying_customers
from unnest(
    generate_timestamp_array(
        -- Earliest window end; raise the multiplier to look further back.
        timestamp_sub(current_timestamp(), interval 30 * 2 day),
        current_timestamp(),
        interval 30 day
    )
) as ts
left join my_data as d
    -- Overlap test: the period ends after the window starts and starts before it ends.
    on d.current_period_end > timestamp_sub(ts, interval 30 day)
    and d.current_period_start < ts
group by ts
order by ts;
使用 BigQuery 中的 date/timestamp 函数,您可以使用更精确的度量,例如日历月。
这看起来像是事件与时间窗口重叠的问题。试试这个方法(使用 BigQuery 的 GENERATE_DATE_ARRAY 函数):
-- Count distinct ids whose billing period overlaps each calendar month.
-- Months run from the month of the earliest current_period_start to the
-- month of the latest current_period_end in the table.
with bounds as (
    select
        date_trunc(min(date(current_period_start)), month) as first_month,
        date_trunc(max(date(current_period_end)), month) as last_month
    from `dataset.table`
),

-- One row per calendar month as a half-open interval [start_month, end_month).
intervals as (
    select
        month as start_month,
        date_add(month, interval 1 month) as end_month
    from bounds
    cross join unnest(
        generate_date_array(first_month, last_month, interval 1 month)
    ) as month
)

select
    i.start_month,
    i.end_month,
    count(distinct d.id) as count
from intervals as i
left join `dataset.table` as d
    -- The overlap test lives in ON so months without matches survive with count 0.
    -- Month bounds are DATEs; they are cast because the sample data shows
    -- TIMESTAMP values, and BigQuery does not compare TIMESTAMP with DATE implicitly.
    on d.current_period_end > timestamp(i.start_month)
    and d.current_period_start < timestamp(i.end_month)
group by i.start_month, i.end_month
order by i.start_month, i.end_month