Clickhouse:滑动/移动 window

Clickhouse: Sliding / moving window

我正在寻找一种高效的方法:在 ClickHouse 中,对按某一列(即 Time)排序的每一行,查询它之前最近的 n 个值(包含当前行),并将这些值以数组形式返回。

窗口函数(window functions)在 ClickHouse 中仍然不受支持(参见 #1469),所以我希望能用 groupArray() 这样的聚合函数来解决这个问题。

Table:

Time  | Value
12:11 | 1
12:12 | 2
12:13 | 3
12:14 | 4
12:15 | 5
12:16 | 6

window 大小 n=3 的预期结果:

Time  | Value
12:13 | [1,2,3]
12:14 | [2,3,4]
12:15 | [3,4,5]
12:16 | [4,5,6]

ClickHouse 目前有哪些方法/函数可以高效地查询滑动/移动窗口(sliding/moving window)?我该如何得到想要的结果?

编辑:

我的解决方案基于@vladimir 的回复:

/*
 * Sliding window of the last 3 values per row, emulated without window functions.
 * Each source row is copied into the 3 consecutive windows it belongs to
 * (window ids row_number .. row_number + 2); grouping by window_id then
 * re-assembles every window with groupArray().
 * The literal 3 is the window size and is used in two places.
 */
SELECT
    max(Time) AS Time,
    groupArray(Value) AS Values
FROM
(
    SELECT
        *,
        rowNumberInAllBlocks() AS row_number,
        arrayJoin(range(row_number, row_number + 3)) AS window_id
    FROM
    (
        /* BEGIN emulate origin dataset */
        SELECT
            toDateTime(ts_text) AS Time,
            rowNumberInAllBlocks() + 1 AS Value
        FROM
        (
            SELECT arrayJoin([
                '2020-01-01 12:11:00',
                '2020-01-01 12:12:00',
                '2020-01-01 12:13:00',
                '2020-01-01 12:14:00',
                '2020-01-01 12:15:00',
                '2020-01-01 12:16:00'
            ]) AS ts_text
        )
        ORDER BY Time
        /* END emulate origin dataset */
    )
    ORDER BY Time
) AS expanded
GROUP BY window_id
/* keep only complete windows; the partial ones at both ends hold fewer than 3 rows */
HAVING length(Values) = 3
ORDER BY Time

请注意,3 在查询中出现了两次,它代表窗口大小 n。

输出:

┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘

从 21.4 版本开始,ClickHouse 添加了对窗口函数(window functions)的完整支持。目前它仍被标记为实验性功能,需要通过设置 allow_experimental_window_functions = 1 启用:

/*
 * Sliding window built with a native window function: for every Time the
 * frame covers the current row plus the 2 preceding rows (ordered by Time),
 * so each array holds up to 3 values. The first rows yield shorter arrays
 * because fewer than 2 preceding rows exist.
 * Requires allow_experimental_window_functions = 1 (see SETTINGS below).
 */
SELECT
    Time,
    /* any(Value) reduces each GROUP BY Time bucket to a single value; the window frame is applied to those values */
    groupArray(any(Value)) OVER (ORDER BY Time ASC ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS Values
FROM
(
    /* Emulate the test dataset. */
    SELECT
        toDateTime(a) AS Time,
        rowNumberInAllBlocks() + 1 AS Value
    FROM
    (
        SELECT arrayJoin([
            '2020-01-01 12:11:00',
            '2020-01-01 12:12:00',
            '2020-01-01 12:13:00',
            '2020-01-01 12:14:00',
            '2020-01-01 12:15:00',
            '2020-01-01 12:16:00'
        ]) AS a
    )
    ORDER BY Time
)
GROUP BY Time
/* Explicit final sort: without ORDER BY the order of the result rows is not guaranteed. */
ORDER BY Time ASC
SETTINGS allow_experimental_window_functions = 1

/*
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:11:00 │ [1]     │
│ 2020-01-01 12:12:00 │ [1,2]   │
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘
*/

参见 https://altinity.com/blog/clickhouse-window-functions-current-state-of-the-art


ClickHouse 有几个作用范围仅限于单个数据块(data block)的类窗口函数,这里以 neighbor 为例:

/*
 * Sliding window of 3 values built with neighbor(): the values at offsets
 * -2, -1 and 0 relative to the current row are packed into one array.
 * neighbor() works on the physical row order inside a data block, so the
 * input is sorted by Time inside the subquery before the function is applied.
 * Offsets that point before the first row produce the column type's default
 * value (0 for this dataset), so the first two arrays are padded with zeros.
 */
SELECT
    Time,
    [neighbor(Value, -2), neighbor(Value, -1), neighbor(Value, 0)] AS Values
FROM
(
    /* emulate origin data */
    SELECT
        toDateTime(data.1) AS Time,
        data.2 AS Value
    FROM
    (
        SELECT arrayJoin([
            ('2020-01-01 12:11:00', 1),
            ('2020-01-01 12:12:00', 2),
            ('2020-01-01 12:13:00', 3),
            ('2020-01-01 12:14:00', 4),
            ('2020-01-01 12:15:00', 5),
            ('2020-01-01 12:16:00', 6)
        ]) AS data
    )
    /* the result of neighbor() depends on row order: sort here, inside the subquery */
    ORDER BY Time
)

/*
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:11:00 │ [0,0,1] │
│ 2020-01-01 12:12:00 │ [0,1,2] │
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘

*/

基于源行重复 window_size 次的替代方法:

/*
 * Alternative emulation: every source row is repeated window_size times, once
 * for each window it takes part in (window ids row_number .. row_number + window_size - 1).
 * Grouping by window_id re-assembles the windows as arrays of (Time, Value) tuples.
 * The HAVING clause keeps a window only when its id equals the largest row
 * number it contains, i.e. when the window ends on an existing row: the
 * trailing windows past the last row are dropped, the shorter leading windows
 * are kept.
 */
SELECT
  arrayReduce('max', arrayMap(x -> x.1, raw_result)) AS Time,
  arrayMap(x -> x.2, raw_result) AS Values
FROM (
  SELECT
    groupArray((Time, Value)) AS raw_result,
    max(row_number) AS max_row_number
  FROM (
    SELECT
      3 AS window_size,
      *,
      rowNumberInAllBlocks() AS row_number,
      arrayJoin(arrayMap(x -> x + row_number, range(window_size))) AS window_id
    FROM (
      /* emulate origin dataset */
      SELECT
        toDateTime(data.1) AS Time,
        data.2 AS Value
      FROM (
        SELECT arrayJoin([
          ('2020-01-01 12:11:00', 1),
          ('2020-01-01 12:12:00', 2),
          ('2020-01-01 12:13:00', 3),
          ('2020-01-01 12:14:00', 4),
          ('2020-01-01 12:15:00', 5),
          ('2020-01-01 12:16:00', 6)
        ]) AS data
      )
      /* row numbers define the windows, so the rows are sorted by the window ordering column (Time), not by Value */
      ORDER BY Time
    )
  )
  GROUP BY window_id
  HAVING max_row_number = window_id
  ORDER BY window_id
)
/*
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:11:00 │ [1]     │
│ 2020-01-01 12:12:00 │ [1,2]   │
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘
*/

附加示例:

/*
 * The row-repetition technique applied to a larger dataset: 4096 rows taken
 * from numbers() with a window size of 48.
 * Each row is copied into the 48 windows it belongs to, the windows are
 * re-assembled per window_id, and only windows whose id equals the largest
 * row number they contain are kept.
 */
SELECT
    arrayReduce('max', arrayMap(pair -> pair.1, window_rows)) AS id,
    arrayMap(pair -> pair.2, window_rows) AS values
FROM
(
    SELECT
        groupArray((id, value)) AS window_rows,
        max(row_number) AS last_row_number
    FROM
    (
        SELECT
            48 AS window_size,
            *,
            rowNumberInAllBlocks() AS row_number,
            arrayJoin(arrayMap(step -> step + row_number, range(window_size))) AS window_id
        FROM
        (
            /* the origin dataset */
            SELECT
                number AS id,
                number AS value
            FROM numbers(4096)
        )
    )
    GROUP BY window_id
    HAVING last_row_number = window_id
    ORDER BY window_id
)
/*
┌─id─┬─values────────────────┐
│  0 │ [0]                   │
│  1 │ [0,1]                 │
│  2 │ [0,1,2]               │
│  3 │ [0,1,2,3]             │
│  4 │ [0,1,2,3,4]           │
│  5 │ [0,1,2,3,4,5]         │
│  6 │ [0,1,2,3,4,5,6]       │
│  7 │ [0,1,2,3,4,5,6,7]     │
..
│ 56 │ [9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56]  │
│ 57 │ [10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57] │
│ 58 │ [11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58] │
│ 59 │ [12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59] │
│ 60 │ [13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60] │
..
│ 4093 │ [4046,4047,4048,4049,4050,4051,4052,4053,4054,4055,4056,4057,4058,4059,4060,4061,4062,4063,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089,4090,4091,4092,4093] │
│ 4094 │ [4047,4048,4049,4050,4051,4052,4053,4054,4055,4056,4057,4058,4059,4060,4061,4062,4063,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089,4090,4091,4092,4093,4094] │
│ 4095 │ [4048,4049,4050,4051,4052,4053,4054,4055,4056,4057,4058,4059,4060,4061,4062,4063,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095] │
└──────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
*/

对于 ClickHouse 19,range 函数只接受单个参数,您可以使用以下查询:

/*
 * Sliding window of the last 3 values per row, written with the
 * single-argument form of range(): the window ids of a row are obtained by
 * adding row_number to every element of range(3).
 * Grouping by window_id re-assembles each window; the literal 3 is the
 * window size and is used in two places.
 */
SELECT
    max(Time) AS Time,
    groupArray(Value) AS Values
FROM
(
    SELECT
        *,
        rowNumberInAllBlocks() AS row_number,
        arrayJoin(arrayMap(step -> step + row_number, range(3))) AS window_id
    FROM
    (
        /* BEGIN emulate origin dataset */
        SELECT
            toDateTime(ts_text) AS Time,
            rowNumberInAllBlocks() + 1 AS Value
        FROM
        (
            SELECT arrayJoin([
                '2020-01-01 12:11:00',
                '2020-01-01 12:12:00',
                '2020-01-01 12:13:00',
                '2020-01-01 12:14:00',
                '2020-01-01 12:15:00',
                '2020-01-01 12:16:00'
            ]) AS ts_text
        )
        ORDER BY Time
        /* END emulate origin dataset */
    )
    ORDER BY Time
) AS expanded
GROUP BY window_id
/* keep only complete windows; the partial ones at both ends hold fewer than 3 rows */
HAVING length(Values) = 3
ORDER BY Time