Clickhouse:滑动/移动 window
Clickhouse: Sliding / moving window
我正在寻找一种有效的方法来查询 n 过去的值作为 ClickHouse 中按一列排序的每一行的数组(即 Time
) ,其中的值应作为数组检索。
Window 函数在 ClickHouse 中仍然不受支持(参见 #1469),所以我希望使用像 groupArray()
?
这样的聚合函数来解决问题
Table:
Time | Value
12:11 | 1
12:12 | 2
12:13 | 3
12:14 | 4
12:15 | 5
12:16 | 6
window 大小 n=3
的预期结果:
Time | Value
12:13 | [1,2,3]
12:14 | [2,3,4]
12:15 | [3,4,5]
12:16 | [4,5,6]
ClickHouse 目前使用哪些 ways/functions 来高效地查询 sliding/moving window 以及如何获得我想要的结果?
编辑:
我的解决方案基于@vladimir 的回复:
select max(Time) as Time, groupArray(Value) as Values
from (
select
*,
rowNumberInAllBlocks() as row_number,
arrayJoin(range(row_number, row_number + 3)) as window_id
from (
/* BEGIN emulate origin dataset */
select toDateTime(a) as Time, rowNumberInAllBlocks()+1 as Value
from (
select arrayJoin([
'2020-01-01 12:11:00',
'2020-01-01 12:12:00',
'2020-01-01 12:13:00',
'2020-01-01 12:14:00',
'2020-01-01 12:15:00',
'2020-01-01 12:16:00']) a
)
order by Time
/* END emulate origin dataset */
)
order by Time
) s
group by window_id
having length(Values) = 3
order by Time
请注意,3
在查询中出现两次,代表 window 大小 n。
输出:
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘
从 version 21.4 开始添加了对 window-functions 的 完整 支持。此时它被标记为实验性功能。
SELECT
Time,
groupArray(any(Value)) OVER (ORDER BY Time ASC ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS Values
FROM
(
/* Emulate the test dataset, */
select toDateTime(a) as Time, rowNumberInAllBlocks()+1 as Value
from (
select arrayJoin([
'2020-01-01 12:11:00',
'2020-01-01 12:12:00',
'2020-01-01 12:13:00',
'2020-01-01 12:14:00',
'2020-01-01 12:15:00',
'2020-01-01 12:16:00']) a
)
order by Time
)
GROUP BY Time
SETTINGS allow_experimental_window_functions = 1
/*
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:11:00 │ [1] │
│ 2020-01-01 12:12:00 │ [1,2] │
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘
*/
参见 https://altinity.com/blog/clickhouse-window-functions-current-state-of-the-art。
ClickHouse 有几个 datablock-scoped window 函数,我们就拿 neighbor:
SELECT Time, [neighbor(Value, -2), neighbor(Value, -1), neighbor(Value, 0)] Values
FROM (
/* emulate origin data */
SELECT toDateTime(data.1) as Time, data.2 as Value
FROM (
SELECT arrayJoin([('2020-01-01 12:11:00', 1),
('2020-01-01 12:12:00', 2),
('2020-01-01 12:13:00', 3),
('2020-01-01 12:14:00', 4),
('2020-01-01 12:15:00', 5),
('2020-01-01 12:16:00', 6)]) as data)
)
/*
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:11:00 │ [0,0,1] │
│ 2020-01-01 12:12:00 │ [0,1,2] │
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘
*/
基于源行重复 window_size 次的替代方法:
SELECT
arrayReduce('max', arrayMap(x -> x.1, raw_result)) Time,
arrayMap(x -> x.2, raw_result) Values
FROM (
SELECT groupArray((Time, Value)) raw_result, max(row_number) max_row_number
FROM (
SELECT
3 AS window_size,
*,
rowNumberInAllBlocks() row_number,
arrayJoin(arrayMap(x -> x + row_number, range(window_size))) window_id
FROM (
/* emulate origin dataset */
SELECT toDateTime(data.1) as Time, data.2 as Value
FROM (
SELECT arrayJoin([('2020-01-01 12:11:00', 1),
('2020-01-01 12:12:00', 2),
('2020-01-01 12:13:00', 3),
('2020-01-01 12:14:00', 4),
('2020-01-01 12:15:00', 5),
('2020-01-01 12:16:00', 6)]) as data)
ORDER BY Value
)
)
GROUP BY window_id
HAVING max_row_number = window_id
ORDER BY window_id
)
/*
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:11:00 │ [1] │
│ 2020-01-01 12:12:00 │ [1,2] │
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘
*/
附加示例:
SELECT
arrayReduce('max', arrayMap(x -> x.1, raw_result)) id,
arrayMap(x -> x.2, raw_result) values
FROM (
SELECT groupArray((id, value)) raw_result, max(row_number) max_row_number
FROM (
SELECT
48 AS window_size,
*,
rowNumberInAllBlocks() row_number,
arrayJoin(arrayMap(x -> x + row_number, range(window_size))) window_id
FROM (
/* the origin dataset */
SELECT number AS id, number AS value
FROM numbers(4096)
)
)
GROUP BY window_id
HAVING max_row_number = window_id
ORDER BY window_id
)
/*
┌─id─┬─values────────────────┐
│ 0 │ [0] │
│ 1 │ [0,1] │
│ 2 │ [0,1,2] │
│ 3 │ [0,1,2,3] │
│ 4 │ [0,1,2,3,4] │
│ 5 │ [0,1,2,3,4,5] │
│ 6 │ [0,1,2,3,4,5,6] │
│ 7 │ [0,1,2,3,4,5,6,7] │
..
│ 56 │ [9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56] │
│ 57 │ [10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57] │
│ 58 │ [11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58] │
│ 59 │ [12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59] │
│ 60 │ [13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60] │
..
│ 4093 │ [4046,4047,4048,4049,4050,4051,4052,4053,4054,4055,4056,4057,4058,4059,4060,4061,4062,4063,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089,4090,4091,4092,4093] │
│ 4094 │ [4047,4048,4049,4050,4051,4052,4053,4054,4055,4056,4057,4058,4059,4060,4061,4062,4063,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089,4090,4091,4092,4093,4094] │
│ 4095 │ [4048,4049,4050,4051,4052,4053,4054,4055,4056,4057,4058,4059,4060,4061,4062,4063,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095] │
└──────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
*/
对于 Clickhouse 19,范围函数只接受单个输入,您可以使用以下查询
select max(Time) as Time, groupArray(Value) as Values
from (
select
*,
rowNumberInAllBlocks() as row_number,
arrayJoin( arrayMap(x -> x + row_number, range(3)) ) as window_id
from (
/* BEGIN emulate origin dataset */
select toDateTime(a) as Time, rowNumberInAllBlocks()+1 as Value
from (
select arrayJoin([
'2020-01-01 12:11:00',
'2020-01-01 12:12:00',
'2020-01-01 12:13:00',
'2020-01-01 12:14:00',
'2020-01-01 12:15:00',
'2020-01-01 12:16:00']) a
)
order by Time
/* END emulate origin dataset */
)
order by Time
) s
group by window_id
having length(Values) = 3
order by Time
我正在寻找一种有效的方法来查询 n 过去的值作为 ClickHouse 中按一列排序的每一行的数组(即 Time
) ,其中的值应作为数组检索。
Window 函数在 ClickHouse 中仍然不受支持(参见 #1469),所以我希望使用像 groupArray()
?
Table:
Time | Value
12:11 | 1
12:12 | 2
12:13 | 3
12:14 | 4
12:15 | 5
12:16 | 6
window 大小 n=3
的预期结果:
Time | Value
12:13 | [1,2,3]
12:14 | [2,3,4]
12:15 | [3,4,5]
12:16 | [4,5,6]
ClickHouse 目前使用哪些 ways/functions 来高效地查询 sliding/moving window 以及如何获得我想要的结果?
编辑:
我的解决方案基于@vladimir 的回复:
select max(Time) as Time, groupArray(Value) as Values
from (
select
*,
rowNumberInAllBlocks() as row_number,
arrayJoin(range(row_number, row_number + 3)) as window_id
from (
/* BEGIN emulate origin dataset */
select toDateTime(a) as Time, rowNumberInAllBlocks()+1 as Value
from (
select arrayJoin([
'2020-01-01 12:11:00',
'2020-01-01 12:12:00',
'2020-01-01 12:13:00',
'2020-01-01 12:14:00',
'2020-01-01 12:15:00',
'2020-01-01 12:16:00']) a
)
order by Time
/* END emulate origin dataset */
)
order by Time
) s
group by window_id
having length(Values) = 3
order by Time
请注意,3
在查询中出现两次,代表 window 大小 n。
输出:
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘
从 version 21.4 开始添加了对 window-functions 的 完整 支持。此时它被标记为实验性功能。
SELECT
Time,
groupArray(any(Value)) OVER (ORDER BY Time ASC ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS Values
FROM
(
/* Emulate the test dataset, */
select toDateTime(a) as Time, rowNumberInAllBlocks()+1 as Value
from (
select arrayJoin([
'2020-01-01 12:11:00',
'2020-01-01 12:12:00',
'2020-01-01 12:13:00',
'2020-01-01 12:14:00',
'2020-01-01 12:15:00',
'2020-01-01 12:16:00']) a
)
order by Time
)
GROUP BY Time
SETTINGS allow_experimental_window_functions = 1
/*
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:11:00 │ [1] │
│ 2020-01-01 12:12:00 │ [1,2] │
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘
*/
参见 https://altinity.com/blog/clickhouse-window-functions-current-state-of-the-art。
ClickHouse 有几个 datablock-scoped window 函数,我们就拿 neighbor:
SELECT Time, [neighbor(Value, -2), neighbor(Value, -1), neighbor(Value, 0)] Values
FROM (
/* emulate origin data */
SELECT toDateTime(data.1) as Time, data.2 as Value
FROM (
SELECT arrayJoin([('2020-01-01 12:11:00', 1),
('2020-01-01 12:12:00', 2),
('2020-01-01 12:13:00', 3),
('2020-01-01 12:14:00', 4),
('2020-01-01 12:15:00', 5),
('2020-01-01 12:16:00', 6)]) as data)
)
/*
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:11:00 │ [0,0,1] │
│ 2020-01-01 12:12:00 │ [0,1,2] │
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘
*/
基于源行重复 window_size 次的替代方法:
SELECT
arrayReduce('max', arrayMap(x -> x.1, raw_result)) Time,
arrayMap(x -> x.2, raw_result) Values
FROM (
SELECT groupArray((Time, Value)) raw_result, max(row_number) max_row_number
FROM (
SELECT
3 AS window_size,
*,
rowNumberInAllBlocks() row_number,
arrayJoin(arrayMap(x -> x + row_number, range(window_size))) window_id
FROM (
/* emulate origin dataset */
SELECT toDateTime(data.1) as Time, data.2 as Value
FROM (
SELECT arrayJoin([('2020-01-01 12:11:00', 1),
('2020-01-01 12:12:00', 2),
('2020-01-01 12:13:00', 3),
('2020-01-01 12:14:00', 4),
('2020-01-01 12:15:00', 5),
('2020-01-01 12:16:00', 6)]) as data)
ORDER BY Value
)
)
GROUP BY window_id
HAVING max_row_number = window_id
ORDER BY window_id
)
/*
┌────────────────Time─┬─Values──┐
│ 2020-01-01 12:11:00 │ [1] │
│ 2020-01-01 12:12:00 │ [1,2] │
│ 2020-01-01 12:13:00 │ [1,2,3] │
│ 2020-01-01 12:14:00 │ [2,3,4] │
│ 2020-01-01 12:15:00 │ [3,4,5] │
│ 2020-01-01 12:16:00 │ [4,5,6] │
└─────────────────────┴─────────┘
*/
附加示例:
SELECT
arrayReduce('max', arrayMap(x -> x.1, raw_result)) id,
arrayMap(x -> x.2, raw_result) values
FROM (
SELECT groupArray((id, value)) raw_result, max(row_number) max_row_number
FROM (
SELECT
48 AS window_size,
*,
rowNumberInAllBlocks() row_number,
arrayJoin(arrayMap(x -> x + row_number, range(window_size))) window_id
FROM (
/* the origin dataset */
SELECT number AS id, number AS value
FROM numbers(4096)
)
)
GROUP BY window_id
HAVING max_row_number = window_id
ORDER BY window_id
)
/*
┌─id─┬─values────────────────┐
│ 0 │ [0] │
│ 1 │ [0,1] │
│ 2 │ [0,1,2] │
│ 3 │ [0,1,2,3] │
│ 4 │ [0,1,2,3,4] │
│ 5 │ [0,1,2,3,4,5] │
│ 6 │ [0,1,2,3,4,5,6] │
│ 7 │ [0,1,2,3,4,5,6,7] │
..
│ 56 │ [9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56] │
│ 57 │ [10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57] │
│ 58 │ [11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58] │
│ 59 │ [12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59] │
│ 60 │ [13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60] │
..
│ 4093 │ [4046,4047,4048,4049,4050,4051,4052,4053,4054,4055,4056,4057,4058,4059,4060,4061,4062,4063,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089,4090,4091,4092,4093] │
│ 4094 │ [4047,4048,4049,4050,4051,4052,4053,4054,4055,4056,4057,4058,4059,4060,4061,4062,4063,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089,4090,4091,4092,4093,4094] │
│ 4095 │ [4048,4049,4050,4051,4052,4053,4054,4055,4056,4057,4058,4059,4060,4061,4062,4063,4064,4065,4066,4067,4068,4069,4070,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080,4081,4082,4083,4084,4085,4086,4087,4088,4089,4090,4091,4092,4093,4094,4095] │
└──────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
*/
对于 Clickhouse 19,范围函数只接受单个输入,您可以使用以下查询
select max(Time) as Time, groupArray(Value) as Values
from (
select
*,
rowNumberInAllBlocks() as row_number,
arrayJoin( arrayMap(x -> x + row_number, range(3)) ) as window_id
from (
/* BEGIN emulate origin dataset */
select toDateTime(a) as Time, rowNumberInAllBlocks()+1 as Value
from (
select arrayJoin([
'2020-01-01 12:11:00',
'2020-01-01 12:12:00',
'2020-01-01 12:13:00',
'2020-01-01 12:14:00',
'2020-01-01 12:15:00',
'2020-01-01 12:16:00']) a
)
order by Time
/* END emulate origin dataset */
)
order by Time
) s
group by window_id
having length(Values) = 3
order by Time