具有缺失值的 BIGQUERY 移动平均值

BIGQUERY moving average with missing values

我有以下数据

-- Sample data: one row per (id, month) holding a numeric value.
-- ref_month is a 'YYYY-MM-DD' string. Column names come from the first SELECT;
-- the remaining UNION ALL branches supply values by position.
-- Lines that are commented out mark months that have no row for that id.
WITH dummy_data AS (
  -- id 1
  SELECT '2017-01-01' AS ref_month, 18 AS value, 1 AS id
  UNION ALL SELECT '2017-02-01', 20, 1
  UNION ALL SELECT '2017-03-01', 22, 1
  -- UNION ALL SELECT '2017-04-01', 28, 1
  UNION ALL SELECT '2017-05-01', 30, 1
  UNION ALL SELECT '2017-06-01', 37, 1
  UNION ALL SELECT '2017-07-01', 42, 1
  -- UNION ALL SELECT '2017-08-01', 55, 1
  -- UNION ALL SELECT '2017-09-01', 49, 1
  UNION ALL SELECT '2017-10-01', 51, 1
  UNION ALL SELECT '2017-11-01', 57, 1
  UNION ALL SELECT '2017-12-01', 56, 1
  -- id 2
  UNION ALL SELECT '2017-01-01', 18, 2
  UNION ALL SELECT '2017-02-01', 20, 2
  UNION ALL SELECT '2017-03-01', 22, 2
  UNION ALL SELECT '2017-04-01', 28, 2
  -- UNION ALL SELECT '2017-05-01', 30, 2
  -- UNION ALL SELECT '2017-06-01', 37, 2
  UNION ALL SELECT '2017-07-01', 42, 2
  UNION ALL SELECT '2017-08-01', 55, 2
  UNION ALL SELECT '2017-09-01', 49, 2
  -- UNION ALL SELECT '2017-10-01', 51, 2
  UNION ALL SELECT '2017-11-01', 57, 2
  UNION ALL SELECT '2017-12-01', 56, 2
)

我想计算每个 id 的移动平均值。我知道您可以执行以下操作

-- Row-based moving average per id: averages the current row together with up
-- to five preceding rows in ref_month order. The frame counts rows, not
-- calendar months, so gaps between months are not taken into account.
SELECT
  id,
  ref_month,
  AVG(value) OVER prior_rows AS moving_avg
FROM dummy_data
WINDOW prior_rows AS (
  PARTITION BY id
  ORDER BY ref_month
  ROWS BETWEEN 5 PRECEDING AND CURRENT ROW
)

但是正如您从我的虚拟数据中看到的那样,其中有一些缺失值。当存在缺失值时,有什么简便的方法可以计算移动平均值吗?我想到的做法是先生成一个完整的日期范围:

-- One date per month, stepping from the earliest ref_month in dummy_data up to
-- the latest one. ref_month is parsed from its 'YYYY-MM-DD' string form.
date_range AS (
  SELECT reference_month
  FROM (
    -- Single row holding both ends of the range.
    SELECT
      PARSE_DATE('%Y-%m-%d', MIN(ref_month)) AS first_month,
      PARSE_DATE('%Y-%m-%d', MAX(ref_month)) AS last_month
    FROM dummy_data
  ) AS bounds
  CROSS JOIN UNNEST(
    GENERATE_DATE_ARRAY(bounds.first_month, bounds.last_month, INTERVAL 1 MONTH)
  ) AS reference_month
)

然后将它与 id 做笛卡尔积,再与我的虚拟数据连接(join)回来,但这似乎是一种反模式。有没有更好的实现方式?谢谢

编辑:

预期结果: 对于 id 1:

2017-01-01  18
2017-02-01  19
2017-03-01  20
2017-05-01  18
2017-06-01  21.8
2017-07-01  26.2
2017-10-01  26
2017-11-01  30
2017-12-01  32.8

对于 id 2:

2017-01-01  18
2017-02-01  19
2017-03-01  20
2017-04-01  22
2017-07-01  18.4
2017-08-01  25
2017-09-01  29.2
2017-11-01  40.6
2017-12-01  43.4

这应该有效:

-- Five-month moving average of value per id, where a month without a row
-- contributes 0 to the sum.
--
-- Output columns: id, ref_month, moving_avg.
--
-- How it works:
--   * ref_month ('YYYY-MM-DD' string) is turned into an integer month index so
--     that a RANGE frame can measure calendar distance rather than row count.
--   * The frame covers the current month and the four months before it; rows
--     that do not exist simply add nothing to SUM, i.e. they count as 0.
--   * The divisor is 5, except near the start of each id's history where it is
--     the number of months elapsed since that id's first row (1, 2, 3, 4).
WITH dummy_data AS (
  SELECT '2017-01-01' AS ref_month, 18 AS value, 1 AS id
  UNION ALL SELECT '2017-02-01' AS ref_month, 20 AS value, 1 AS id
  UNION ALL SELECT '2017-03-01' AS ref_month, 22 AS value, 1 AS id
  -- UNION ALL SELECT '2017-04-01' AS ref_month, 28 AS value, 1 AS id
  UNION ALL SELECT '2017-05-01' AS ref_month, 30 AS value, 1 AS id
  UNION ALL SELECT '2017-06-01' AS ref_month, 37 AS value, 1 AS id
  UNION ALL SELECT '2017-07-01' AS ref_month, 42 AS value, 1 AS id
  -- UNION ALL SELECT '2017-08-01' AS ref_month, 55 AS value, 1 AS id
  -- UNION ALL SELECT '2017-09-01' AS ref_month, 49 AS value, 1 AS id
  UNION ALL SELECT '2017-10-01' AS ref_month, 51 AS value, 1 AS id
  UNION ALL SELECT '2017-11-01' AS ref_month, 57 AS value, 1 AS id
  UNION ALL SELECT '2017-12-01' AS ref_month, 56 AS value, 1 AS id
  UNION ALL SELECT '2017-01-01' AS ref_month, 18 AS value, 2 AS id
  UNION ALL SELECT '2017-02-01' AS ref_month, 20 AS value, 2 AS id
  UNION ALL SELECT '2017-03-01' AS ref_month, 22 AS value, 2 AS id
  UNION ALL SELECT '2017-04-01' AS ref_month, 28 AS value, 2 AS id
  -- UNION ALL SELECT '2017-05-01' AS ref_month, 30 AS value, 2 AS id
  -- UNION ALL SELECT '2017-06-01' AS ref_month, 37 AS value, 2 AS id
  UNION ALL SELECT '2017-07-01' AS ref_month, 42 AS value, 2 AS id
  UNION ALL SELECT '2017-08-01' AS ref_month, 55 AS value, 2 AS id
  UNION ALL SELECT '2017-09-01' AS ref_month, 49 AS value, 2 AS id
  -- UNION ALL SELECT '2017-10-01' AS ref_month, 51 AS value, 2 AS id
  UNION ALL SELECT '2017-11-01' AS ref_month, 57 AS value, 2 AS id
  UNION ALL SELECT '2017-12-01' AS ref_month, 56 AS value, 2 AS id
),

-- One row per (id, ref_month); if a month has several rows they are averaged.
monthly AS (
  SELECT
    id,
    ref_month,
    AVG(value) AS value
  FROM dummy_data
  GROUP BY id, ref_month
),

-- month_pos: whole months between a fixed anchor date and ref_month. Only the
-- differences between positions matter, so the anchor itself is arbitrary.
positioned AS (
  SELECT
    id,
    ref_month,
    value,
    DATE_DIFF(CAST(ref_month AS DATE), DATE '1970-01-01', MONTH) AS month_pos
  FROM monthly
)

SELECT
  id,
  ref_month,
  SUM(value) OVER last_five_months
    / LEAST(5, month_pos - MIN(month_pos) OVER (PARTITION BY id) + 1) AS moving_avg
FROM positioned
WINDOW last_five_months AS (
  PARTITION BY id
  ORDER BY month_pos
  -- RANGE (not ROWS): the frame is the last five calendar months, however many
  -- rows actually fall inside it.
  RANGE BETWEEN 4 PRECEDING AND CURRENT ROW
)
ORDER BY id, ref_month

如果您想把缺失的值视为 0,并且希望窗口大小为“5”个月,那么使用一系列 lag() 可能是最简单的方法:

-- Five-month moving average of value per id built from LAG(); months with no
-- row count as 0.
--
-- Output columns: id, ref_month, moving_avg.
--
-- For each row, the four preceding rows of the same id are inspected. A
-- preceding row is added to the sum only when its month falls inside the
-- five-month window ending at the current month (i.e. it is not older than
-- four months); otherwise 0 is added in its place.
-- The divisor is 5, except at the start of each id's history where it is the
-- number of months elapsed since the first row of that id (1, 2, 3, 4).
-- CAST(ref_month AS DATE) lets the query run whether ref_month is stored as a
-- DATE or as a 'YYYY-MM-DD' STRING.
SELECT
  id,
  ref_month,
  (
    value
    + CASE
        WHEN LAG(month_start, 1) OVER by_month >= window_start
          THEN LAG(value, 1) OVER by_month
        ELSE 0
      END
    + CASE
        WHEN LAG(month_start, 2) OVER by_month >= window_start
          THEN LAG(value, 2) OVER by_month
        ELSE 0
      END
    + CASE
        WHEN LAG(month_start, 3) OVER by_month >= window_start
          THEN LAG(value, 3) OVER by_month
        ELSE 0
      END
    + CASE
        WHEN LAG(month_start, 4) OVER by_month >= window_start
          THEN LAG(value, 4) OVER by_month
        ELSE 0
      END
  )
  / LEAST(
      5,
      -- +1 so that the first month of an id divides by 1, not 0.
      DATE_DIFF(month_start, MIN(month_start) OVER (PARTITION BY id), MONTH) + 1
    ) AS moving_avg
FROM (
  SELECT
    id,
    ref_month,
    value,
    CAST(ref_month AS DATE) AS month_start,
    -- Oldest month that still belongs to the window (inclusive bound).
    DATE_SUB(CAST(ref_month AS DATE), INTERVAL 4 MONTH) AS window_start
  FROM dummy_data
)
WINDOW by_month AS (PARTITION BY id ORDER BY month_start);

这个查询写起来比它背后的逻辑要复杂。它基本上是把最近五个月的值相加再除以 5,但同时还要处理边界条件(以及缺失值)。

以下适用于 BigQuery 标准 SQL,并且确实有效! :o)
它假定您的 ref_month 是 DATE 数据类型(如果您的情况是 STRING - 仍然可以 - 请参阅我答案底部的注释)

#standardSQL
SELECT 
  id, 
  ref_month,
  ROUND(SUM(value) OVER (rolling_six_days) / 
    (LAST_VALUE(month_pos) OVER (rolling_six_days) 
      - FIRST_VALUE(month_pos) OVER (rolling_six_days)
      + 1)
  ) AS correct_moving_avg
FROM (
  SELECT id, ref_month, value,
    DATE_DIFF(ref_month, '2016-01-01', MONTH) month_pos
  FROM dummy_data
)
WINDOW rolling_six_days AS 
  (PARTITION BY id ORDER BY month_pos RANGE BETWEEN 5 PRECEDING AND CURRENT ROW )

您可以使用下面的示例数据测试/使用它

#standardSQL
WITH dummy_data AS (
  SELECT DATE '2017-01-01' AS ref_month, 18 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-02-01' AS ref_month, 20 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-03-01' AS ref_month, 22 AS value, 1 AS id
  -- UNION ALL SELECT DATE '2017-04-01' as ref_month, 28 as value, 1 as id
  UNION ALL SELECT DATE '2017-05-01' AS ref_month, 30 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-06-01' AS ref_month, 37 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-07-01' AS ref_month, 42 AS value, 1 AS id
  -- UNION ALL SELECT DATE '2017-08-01' as ref_month, 55 as value, 1 as id
  -- UNION ALL SELECT DATE '2017-09-01' as ref_month, 49 as value, 1 as id
  UNION ALL SELECT DATE '2017-10-01' AS ref_month, 51 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-11-01' AS ref_month, 57 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-12-01' AS ref_month, 56 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-01-01' AS ref_month, 18 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-02-01' AS ref_month, 20 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-03-01' AS ref_month, 22 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-04-01' AS ref_month, 28 AS value, 2 AS id
  -- UNION ALL SELECT DATE '2017-05-01' as ref_month, 30 as value, 2 as id
  -- UNION ALL SELECT DATE '2017-06-01' as ref_month, 37 as value, 2 as id
  UNION ALL SELECT DATE '2017-07-01' AS ref_month, 42 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-08-01' AS ref_month, 55 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-09-01' AS ref_month, 49 AS value, 2 AS id
  -- UNION ALL SELECT DATE '2017-10-01' as ref_month, 51 as value, 2 as id
  UNION ALL SELECT DATE '2017-11-01' AS ref_month, 57 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-12-01' AS ref_month, 56 AS value, 2 AS id
)
SELECT 
  id, 
  ref_month,
  ROUND(SUM(value) OVER (rolling_six_days) / 
    (LAST_VALUE(month_pos) OVER (rolling_six_days) 
      - FIRST_VALUE(month_pos) OVER (rolling_six_days)
      + 1)
  ) AS correct_moving_avg
FROM (
  SELECT id, ref_month, value,
    DATE_DIFF(ref_month, '2016-01-01', MONTH) month_pos
  FROM dummy_data
)
WINDOW rolling_six_days AS (PARTITION BY id ORDER BY month_pos RANGE BETWEEN 5 PRECEDING AND CURRENT ROW )
ORDER BY 1,2  

为了帮助您理解其中的逻辑,请参阅下面这个上述查询的“展开(expanded)”版本——它把所有中间值都一直传递到最外层的 SELECT,因此您可以看到所有内容……

#standardSQL
WITH dummy_data AS 
(
  SELECT DATE '2017-01-01' AS ref_month, 18 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-02-01' AS ref_month, 20 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-03-01' AS ref_month, 22 AS value, 1 AS id
  -- UNION ALL SELECT DATE '2017-04-01' as ref_month, 28 as value, 1 as id
  UNION ALL SELECT DATE '2017-05-01' AS ref_month, 30 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-06-01' AS ref_month, 37 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-07-01' AS ref_month, 42 AS value, 1 AS id
  -- UNION ALL SELECT DATE '2017-08-01' as ref_month, 55 as value, 1 as id
  -- UNION ALL SELECT DATE '2017-09-01' as ref_month, 49 as value, 1 as id
  UNION ALL SELECT DATE '2017-10-01' AS ref_month, 51 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-11-01' AS ref_month, 57 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-12-01' AS ref_month, 56 AS value, 1 AS id
  UNION ALL SELECT DATE '2017-01-01' AS ref_month, 18 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-02-01' AS ref_month, 20 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-03-01' AS ref_month, 22 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-04-01' AS ref_month, 28 AS value, 2 AS id
  -- UNION ALL SELECT DATE '2017-05-01' as ref_month, 30 as value, 2 as id
  -- UNION ALL SELECT DATE '2017-06-01' as ref_month, 37 as value, 2 as id
  UNION ALL SELECT DATE '2017-07-01' AS ref_month, 42 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-08-01' AS ref_month, 55 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-09-01' AS ref_month, 49 AS value, 2 AS id
  -- UNION ALL SELECT DATE '2017-10-01' as ref_month, 51 as value, 2 as id
  UNION ALL SELECT DATE '2017-11-01' AS ref_month, 57 AS value, 2 AS id
  UNION ALL SELECT DATE '2017-12-01' AS ref_month, 56 AS value, 2 AS id
)
SELECT 
  id,
  ref_month,
  value,
  moving_sum,
  first_month,
  last_month,
  ROUND(moving_sum / (last_month - first_month + 1)) AS correct_moving_avg,
  moving_avg
FROM (
  SELECT
    id,
    ref_month,
    value,
    SUM(value) OVER (rolling_six_days) AS moving_sum,
    FIRST_VALUE(month_pos) OVER (rolling_six_days) AS first_month,
    LAST_VALUE(month_pos) OVER (rolling_six_days) AS last_month,
    AVG(value) OVER (rolling_six_days) AS moving_avg
  FROM (
    SELECT 
      id, ref_month, value,
      DATE_DIFF(ref_month, '2016-01-01', MONTH) month_pos
    FROM dummy_data
  )
  WINDOW rolling_six_days AS 
    (PARTITION BY id ORDER BY month_pos RANGE BETWEEN 5 PRECEDING AND CURRENT ROW )
)
ORDER BY 1,2   

结果为

id  ref_month   value moving_sum    first_month last_month  correct_moving_avg  moving_avg   
1    2017-01-01 18    18            12          12          18.0                  18.0   
1    2017-02-01 20    38            12          13          19.0                  19.0   
1    2017-03-01 22    60            12          14          20.0                  20.0   
1    2017-05-01 30    90            12          16          18.0                  22.5   
1    2017-06-01 37    127           12          17          21.0                  25.4   
1    2017-07-01 42    151           13          18          25.0                  30.2   
1    2017-10-01 51    160           16          21          27.0                  40.0   
1    2017-11-01 57    187           17          22          31.0                  46.75  
1    2017-12-01 56    206           18          23          34.0                  51.5   
2    2017-01-01 18    18            12          12          18.0                  18.0   
2    2017-02-01 20    38            12          13          19.0                  19.0   
2    2017-03-01 22    60            12          14          20.0                  20.0   
2    2017-04-01 28    88            12          15          22.0                  22.0   
2    2017-07-01 42    112           13          18          19.0                  28.0   
2    2017-08-01 55    147           14          19          25.0                  36.75  
2    2017-09-01 49    174           15          20          29.0                  43.5   
2    2017-11-01 57    203           18          22          41.0                  50.75  
2    2017-12-01 56    259           18          23          43.0                  51.8     

希望这能展示并解释这种做法

注意:如果您的 ref_month 字段是 STRING 数据类型,您应该稍微调整 DATE_DIFF 那一行——它应该是

DATE_DIFF(CAST(ref_month AS DATE), '2016-01-01', MONTH) AS month_pos

注意 2:我选择“2016-01-01”作为计算月份数的起点——但您可以选择任意日期,只要确保它早于您数据中的最小日期即可——例如“2000-01-01”也完全可行