Windowed Average, accounting for gaps
I need to calculate the average over the preceding 4 weeks...
SELECT
  *,
  AVG(val) OVER (PARTITION BY some_identifier, day_of_week_column
                 ORDER BY date_column
                 ROWS BETWEEN 4 PRECEDING AND 1 PRECEDING
                )
    AS preceding_4_week_average
FROM
  myTable
However, the data is "sparse", in which case my window function should look back "4 weeks" rather than "4 rows".
- Missing dates are not 0s; they are implicitly NULL.
thing | date | dow | val | avg
1 | 2018-01-01 | 1 | 1 | NULL <= AVG({})
1 | 2018-01-08 | 1 | 2 | 1 <= AVG({1})
1 | 2018-01-15 | 1 | 3 | 1.5 <= AVG({1,2})
1 | 2018-01-22 | 1 | 4 | 2 <= AVG({1,2,3})
1 | 2018-01-29 | 1 | 5 | 2.5 <= AVG({1,2,3,4})
1 | 2018-02-12 | 1 | 7 | 4 <= AVG({3,4,5})
1 | 2018-02-19 | 1 | 8 | 5.33 <= AVG({4,5,7})
1 | 2018-02-26 | 1 | 9 | 6.66 <= AVG({5,7,8})
1 | 2018-03-05 | 1 | 10 | 8 <= AVG({7,8,9})
1 | 2018-03-12 | 1 | 11 | 8.5 <= AVG({7,8,9,10})
1 | 2018-03-19 | 1 | 12 | 9.5 <= AVG({8,9,10,11})
Note: there is no value for 2018-02-05.
I would normally deal with this in one of two ways...
- LEFT JOIN to a template to "force" all the dates to exist, effectively relying on AVG() "ignoring" the NULLs.
  This is less than ideal because the number of "things" is large, and building this template is expensive (a sketch of generating it on the fly follows the query below).
SELECT
  *,
  AVG(myTable.val) OVER (PARTITION BY things.id, dates.dow
                         ORDER BY dates.date
                         ROWS BETWEEN 4 PRECEDING AND 1 PRECEDING
                        )
    AS preceding_4_week_average
FROM
  things
CROSS JOIN
  dates
LEFT JOIN
  myTable
    ON  myTable.date = dates.date
    AND myTable.id   = things.id
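For what it's worth, in BigQuery the dates side of that template does not have to be a physical table; here is a minimal sketch, assuming hypothetical date bounds (the expensive part, the CROSS JOIN against all the "things", remains):

-- Hypothetical sketch: generate the calendar on the fly instead of storing it.
WITH dates AS (
  SELECT
    date,
    EXTRACT(DAYOFWEEK FROM date) AS dow
  FROM UNNEST(GENERATE_DATE_ARRAY(DATE '2018-01-01', DATE '2018-03-19')) AS date
)
SELECT * FROM dates  -- plug in above in place of the physical dates table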
- Don't use a window function; use a self-join instead.
  This is less than ideal because there are hundreds of columns in myTable, and BigQuery doesn't cope well with that (a variant that avoids the wide GROUP BY follows the query below).
SELECT
  myTable.*,
  AVG(hist.val) AS preceding_4_week_average
FROM
  myTable
LEFT JOIN
  myTable AS hist
    ON  hist.id   = myTable.id
    AND hist.date >= DATE_SUB(myTable.date, INTERVAL 28 DAY)
    AND hist.date <  myTable.date
GROUP BY
  myTable.column1,
  myTable.column2,
  etc, etc
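A hypothetical variant of the same idea that avoids grouping by hundreds of columns: aggregate on a key only (this sketch assumes (id, date) uniquely identifies a row), then join the single aggregate back:

SELECT
  myTable.*,
  agg.preceding_4_week_average
FROM
  myTable
LEFT JOIN (
  -- GROUP BY only the key, not the whole wide row
  SELECT
    base.id,
    base.date,
    AVG(hist.val) AS preceding_4_week_average
  FROM myTable AS base
  LEFT JOIN myTable AS hist
    ON  hist.id   = base.id
    AND hist.date >= DATE_SUB(base.date, INTERVAL 28 DAY)
    AND hist.date <  base.date
  GROUP BY base.id, base.date
) AS agg
  ON  agg.id   = myTable.id
  AND agg.date = myTable.date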
The actual question
Does anyone have an alternative, ideally using windowed/analytic functions, to "look back 4 weeks" rather than "look back 4 rows"?
This is brute force, but it should be faster:
select t.*,
((case when date_1 >= date_add(date, interval -4 week)
then val_1 else 0
end) +
(case when date_2 >= date_add(date, interval -4 week)
then val_2 else 0
end) +
(case when date_3 >= date_add(date, interval -4 week)
then val_3 else 0
end) +
(case when date_4 >= date_add(date, interval -4 week)
then val_4 else 0
end)
) /
nullif((case when date_1 >= date_add(date, interval -4 week)
then 1 else 0
end) +
(case when date_2 >= date_add(date, interval -4 week)
then 1 else 0
end) +
(case when date_3 >= date_add(date, interval -4 week)
then 1 else 0
end) +
(case when date_4 >= date_add(date, interval -4 week)
then 1 else 0
end)
, 0) as preceding_4_week_average  -- nullif() avoids division by zero when no preceding rows qualify
from (select t.*,
lag(val, 1) over (partition by id, dow order by date) as val_1,
lag(val, 2) over (partition by id, dow order by date) as val_2,
lag(val, 3) over (partition by id, dow order by date) as val_3,
lag(val, 4) over (partition by id, dow order by date) as val_4,
lag(date, 1) over (partition by id, dow order by date) as date_1,
lag(date, 2) over (partition by id, dow order by date) as date_2,
lag(date, 3) over (partition by id, dow order by date) as date_3,
lag(date, 4) over (partition by id, dow order by date) as date_4
from mytable t
) t;
There might be a slick way of expressing this with arrays, but it's a bit early here.
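For completeness, here is a hypothetical sketch of that array idea (untested): collect the 4 preceding (date, val) pairs per row, then average only those within 4 weeks.

-- Hypothetical: keep the window frame in rows, filter to 4 weeks afterwards.
SELECT
  t.* EXCEPT(hist),
  (SELECT AVG(h.val)
   FROM UNNEST(t.hist) AS h
   WHERE h.date >= DATE_SUB(t.date, INTERVAL 4 WEEK)
  ) AS preceding_4_week_average
FROM (
  SELECT
    *,
    ARRAY_AGG(STRUCT(date, val)) OVER (
      PARTITION BY id, dow ORDER BY date
      ROWS BETWEEN 4 PRECEDING AND 1 PRECEDING
    ) AS hist
  FROM mytable
) AS t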
The following works in BigQuery Standard SQL. As you can see, the trick is in using RANGE instead of ROWS:
#standardSQL
SELECT *,
AVG(val) OVER(
PARTITION BY id, dow
ORDER BY DATE_DIFF(DATE_TRUNC(date, WEEK), DATE_TRUNC(CURRENT_DATE(), WEEK), WEEK)
RANGE BETWEEN 4 PRECEDING AND 1 PRECEDING
) avg
FROM `project.dataset.table`
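The ORDER BY expression maps every date to an integer week index (weeks relative to the current week), so RANGE BETWEEN 4 PRECEDING AND 1 PRECEDING spans calendar weeks rather than physical rows, and the missing 2018-02-05 week simply contributes nothing. An equivalent, hypothetical day-based formulation, relying on the dow partition keeping rows whole weeks apart:

-- Hypothetical alternative: order by days since epoch, widen the RANGE to days.
SELECT *,
  AVG(val) OVER(
    PARTITION BY id, dow
    ORDER BY UNIX_DATE(date)
    RANGE BETWEEN 28 PRECEDING AND 7 PRECEDING
  ) avg
FROM `project.dataset.table`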
You can test and play with the above using the dummy data from your question, as below:
#standardSQL
WITH `project.dataset.table` AS (
SELECT 1 id, DATE '2018-01-01' date, 1 dow, 1 val UNION ALL
SELECT 1, '2018-01-08', 1, 2 UNION ALL
SELECT 1, '2018-01-15', 1, 3 UNION ALL
SELECT 1, '2018-01-22', 1, 4 UNION ALL
SELECT 1, '2018-01-29', 1, 5 UNION ALL
SELECT 1, '2018-02-12', 1, 7 UNION ALL
SELECT 1, '2018-02-19', 1, 8 UNION ALL
SELECT 1, '2018-02-26', 1, 9 UNION ALL
SELECT 1, '2018-03-05', 1, 10 UNION ALL
SELECT 1, '2018-03-12', 1, 11 UNION ALL
SELECT 1, '2018-03-19', 1, 12
)
SELECT *,
AVG(val) OVER(
PARTITION BY id, dow
ORDER BY DATE_DIFF(DATE_TRUNC(date, WEEK), DATE_TRUNC(CURRENT_DATE(), WEEK), WEEK)
RANGE BETWEEN 4 PRECEDING AND 1 PRECEDING
) avg
FROM `project.dataset.table`
-- ORDER BY date
with the result:
Row id date dow val avg
1 1 2018-01-01 1 1 null
2 1 2018-01-08 1 2 1.0
3 1 2018-01-15 1 3 1.5
4 1 2018-01-22 1 4 2.0
5 1 2018-01-29 1 5 2.5
6 1 2018-02-12 1 7 4.0
7 1 2018-02-19 1 8 5.333333333333333
8 1 2018-02-26 1 9 6.666666666666667
9 1 2018-03-05 1 10 8.0
10 1 2018-03-12 1 11 8.5
11 1 2018-03-19 1 12 9.5