如何对多行的列值求和?
How to sum the values of a column for several rows?
我有这张表,想把若干行的 'change' 列的值加起来(更准确地说:从 'ne' 值为 0 的那一行开始,一直累加到下一个 'ne' 为 0 的行之前,不包含后面那一行本身)。
任何答案将不胜感激。
┌─rn─┬───────date─┬─ne─┬───────change─┐
│ 0 │ 2008-12-07 │ 0 │ -10330848398 │
│ 1 │ 2009-04-14 │ 1 │ -61290 │
│ 2 │ 2009-04-26 │ 1 │ 9605743360 │
│ 3 │ 2013-07-06 │ 0 │ -32028871920 │
│ 4 │ 2014-01-12 │ 1 │ -42296164902 │
│ 5 │ 2015-06-08 │ 1 │ 59100383646 │
└────┴────────────┴────┴──────────────┘
我们期望的结果是这样的。
row start end sum(change)
--------------------------------------------------
0 | 2008-12-07 | 2009-04-26 | -725,166,328
--------------------------------------------------
1 | 2013-07-06 | 2015-06-08 | -15,224,653,176
--------------------------------------------------
这是一个"间隙与孤岛"(gaps-and-islands)问题。标准解法是使用窗口函数,但就目前而言,ClickHouse 还不支持窗口函数。
下面是一种用子查询来模拟带条件的窗口求和的方法:
-- Gaps-and-islands without window functions: each row is tagged with the
-- number of ne = 0 rows seen up to and including itself, and that running
-- count identifies the group the row belongs to.
select
    min(date) as start_date,
    max(date) as end_date,
    sum(change) as sum_change
from (
    select
        t.date,
        t.change,
        -- Count group-start rows by rn (the row sequence number) rather than
        -- by date: when two rows share a date, a date-based comparison also
        -- counts the later ne = 0 row and pushes the earlier row into the
        -- following group.
        (
            select count(*)
            from mytable t1
            where t1.rn <= t.rn
              and t1.ne = 0
        ) as grp
    from mytable t
) t
group by grp
-- Make the output order deterministic: groups listed chronologically.
order by start_date
子查询统计的是从表的第一行到当前行之间有多少行满足 ne = 0,这个计数就定义了每条记录所属的分组。剩下要做的就只是聚合了。
如果你可以使用 window 函数,你会这样表述:
-- Gaps-and-islands with a window function: a running count of ne = 0 rows
-- gives every row the number of its group, which is then aggregated.
select
    min(date) as start_date,
    max(date) as end_date,
    sum(change) as sum_change
from (
    select
        t.date,
        t.change,
        -- Order by rn and use an explicit ROWS frame: with "order by date" and
        -- the default RANGE frame, rows sharing a date are peers and are all
        -- included in the running sum, so a row preceding an ne = 0 row on the
        -- same date would be assigned to the wrong group.
        sum(case when ne = 0 then 1 else 0 end) over (
            order by rn
            rows between unbounded preceding and current row
        ) as grp
    from mytable t
) t
group by grp
-- Make the output order deterministic: groups listed chronologically.
order by start_date
SELECT ne, MIN(date) AS start, MAX(date) AS end, SUM(change) AS change
FROM mytable
GROUP BY ne
假设 Clickhouse 支持变量:
-- Running group counter kept in a user variable; it starts at -1 so that the
-- first ne = 0 row bumps it to 0.
set @block := -1;
-- One output row per counter value: first date, last date and total change.
select
block as row,
min(date) as start,
max(date) as end,
sum(change)
from
(select
-- Increment the counter on every ne = 0 row; the dummy column exists only to
-- carry this side effect.
case when ne = 0 then @block:=@block+1 end as dummy,
-- Counter value attached to the current row as its group number.
@block as block,
t.*
-- NOTE(review): this derived table has no ORDER BY, so nothing in the query
-- pins the order in which rows reach the counter - confirm rows arrive in rn order.
from t) tt
group by block;
在大数据中无法解决(> 1 亿行)
-- Collect the table into three parallel arrays ordered by rn, cut them into
-- runs at every ne = 0 element, then unfold one output row per run.
SELECT
    run_dates[1] AS s,                  -- first date of the run
    run_dates[-1] AS e,                 -- last date of the run
    arraySum(run_changes) AS sm         -- total change over the run
FROM
(
    SELECT
        -- The lambda is true exactly where ne is 0; both arrays are cut with
        -- the same flags so the date and change runs stay aligned.
        arraySplit((elem, flag) -> (NOT flag), all_dates, all_ne) AS date_runs,
        arraySplit((elem, flag) -> (NOT flag), all_changes, all_ne) AS change_runs
    FROM
    (
        SELECT
            groupArray(date) AS all_dates,
            groupArray(ne) AS all_ne,
            groupArray(change) AS all_changes
        FROM
        (
            SELECT *
            FROM mytable
            ORDER BY rn ASC
        )
    )
)
ARRAY JOIN
    date_runs AS run_dates,
    change_runs AS run_changes
┌─s──────────┬─e──────────┬───────────sm─┐
│ 2008-12-07 │ 2009-04-26 │ -725166328 │
│ 2013-07-06 │ 2015-06-08 │ -15224653176 │
└────────────┴────────────┴──────────────┘
解决此任务的另一种方法:
-- group_starts: ascending rn values of the rows where ne = 0; each of these
-- rows opens a new group.
WITH
    (
        SELECT arraySort(groupArray(rn))
        FROM test_table
        WHERE ne = 0
    ) AS group_starts
SELECT
    argMin(date, rn) AS start,
    argMax(date, rn) AS end,
    sum(change)
FROM
(
    SELECT rn, date, change
    FROM test_table
    ORDER BY rn
)
-- Grouping key: position of the first start id strictly greater than the
-- row's rn, so all rows between two consecutive start ids share one key.
GROUP BY arrayFirstIndex(start_id -> rn < start_id, group_starts)
ORDER BY start
样本数据测试:
-- Self-contained check of the grouping query: the table is replaced by an
-- inline array of (rn, date, ne, change) tuples unrolled with arrayJoin.
-- group_starts: ascending rn values of the sample rows where ne = 0.
WITH
    (
        SELECT arraySort(groupArray(rn))
        FROM
        (
            SELECT item.1 AS rn, item.2 AS date, item.3 AS ne, item.4 AS change
            FROM
            (
                SELECT arrayJoin([
                    (0, toDate('2008-12-07'), 0, toInt64(-10330848398)),
                    (1, toDate('2009-04-14'), 1, toInt64(-61290)),
                    (2, toDate('2009-04-26'), 1, toInt64(9605743360)),
                    (3, toDate('2013-07-06'), 0, toInt64(-32028871920)),
                    (4, toDate('2014-01-12'), 1, toInt64(-42296164902)),
                    (5, toDate('2015-06-08'), 1, toInt64(59100383646)),
                    (6, toDate('2015-06-08'), 0, toInt64(101)),
                    (7, toDate('2015-06-09'), 0, toInt64(102)),
                    (8, toDate('2015-06-10'), 0, toInt64(103)),
                    (9, toDate('2015-06-11'), 1, toInt64(104))
                ]) AS item
            )
        )
        WHERE ne = 0
    ) AS group_starts
SELECT
    argMin(date, rn) AS start,
    argMax(date, rn) AS end,
    sum(change)
FROM
(
    SELECT item.1 AS rn, item.2 AS date, item.4 AS change
    FROM
    (
        SELECT arrayJoin([
            (0, toDate('2008-12-07'), 0, toInt64(-10330848398)),
            (1, toDate('2009-04-14'), 1, toInt64(-61290)),
            (2, toDate('2009-04-26'), 1, toInt64(9605743360)),
            (3, toDate('2013-07-06'), 0, toInt64(-32028871920)),
            (4, toDate('2014-01-12'), 1, toInt64(-42296164902)),
            (5, toDate('2015-06-08'), 1, toInt64(59100383646)),
            (6, toDate('2015-06-08'), 0, toInt64(101)),
            (7, toDate('2015-06-09'), 0, toInt64(102)),
            (8, toDate('2015-06-10'), 0, toInt64(103)),
            (9, toDate('2015-06-11'), 1, toInt64(104))
        ]) AS item
    )
    ORDER BY rn
)
-- Grouping key: position of the first start id strictly greater than the
-- row's rn, so all rows between two consecutive start ids share one key.
GROUP BY arrayFirstIndex(start_id -> rn < start_id, group_starts)
ORDER BY start
/* result
┌──────start─┬────────end─┬──sum(change)─┐
│ 2008-12-07 │ 2009-04-26 │ -725166328 │
│ 2013-07-06 │ 2015-06-08 │ -15224653176 │
│ 2015-06-08 │ 2015-06-08 │ 101 │
│ 2015-06-09 │ 2015-06-09 │ 102 │
│ 2015-06-10 │ 2015-06-11 │ 207 │
└────────────┴────────────┴──────────────┘
*/
我有这张表,想把若干行的 'change' 列的值加起来(更准确地说:从 'ne' 值为 0 的那一行开始,一直累加到下一个 'ne' 为 0 的行之前,不包含后面那一行本身)。任何答案将不胜感激。
┌─rn─┬───────date─┬─ne─┬───────change─┐
│ 0 │ 2008-12-07 │ 0 │ -10330848398 │
│ 1 │ 2009-04-14 │ 1 │ -61290 │
│ 2 │ 2009-04-26 │ 1 │ 9605743360 │
│ 3 │ 2013-07-06 │ 0 │ -32028871920 │
│ 4 │ 2014-01-12 │ 1 │ -42296164902 │
│ 5 │ 2015-06-08 │ 1 │ 59100383646 │
└────┴────────────┴────┴──────────────┘
我们期望的结果是这样的。
row start end sum(change)
--------------------------------------------------
0 | 2008-12-07 | 2009-04-26 | -725,166,328
--------------------------------------------------
1 | 2013-07-06 | 2015-06-08 | -15,224,653,176
--------------------------------------------------
这是一个"间隙与孤岛"(gaps-and-islands)问题。标准解法是使用窗口函数,但就目前而言,ClickHouse 还不支持窗口函数。
下面是一种用子查询来模拟带条件的窗口求和的方法:
-- Gaps-and-islands without window functions: each row is tagged with the
-- number of ne = 0 rows seen up to and including itself, and that running
-- count identifies the group the row belongs to.
select
    min(date) as start_date,
    max(date) as end_date,
    sum(change) as sum_change
from (
    select
        t.date,
        t.change,
        -- Count group-start rows by rn (the row sequence number) rather than
        -- by date: when two rows share a date, a date-based comparison also
        -- counts the later ne = 0 row and pushes the earlier row into the
        -- following group.
        (
            select count(*)
            from mytable t1
            where t1.rn <= t.rn
              and t1.ne = 0
        ) as grp
    from mytable t
) t
group by grp
-- Make the output order deterministic: groups listed chronologically.
order by start_date
子查询统计的是从表的第一行到当前行之间有多少行满足 ne = 0,这个计数就定义了每条记录所属的分组。剩下要做的就只是聚合了。
如果你可以使用 window 函数,你会这样表述:
-- Gaps-and-islands with a window function: a running count of ne = 0 rows
-- gives every row the number of its group, which is then aggregated.
select
    min(date) as start_date,
    max(date) as end_date,
    sum(change) as sum_change
from (
    select
        t.date,
        t.change,
        -- Order by rn and use an explicit ROWS frame: with "order by date" and
        -- the default RANGE frame, rows sharing a date are peers and are all
        -- included in the running sum, so a row preceding an ne = 0 row on the
        -- same date would be assigned to the wrong group.
        sum(case when ne = 0 then 1 else 0 end) over (
            order by rn
            rows between unbounded preceding and current row
        ) as grp
    from mytable t
) t
group by grp
-- Make the output order deterministic: groups listed chronologically.
order by start_date
SELECT ne, MIN(date) AS start, MAX(date) AS end, SUM(change) AS change
FROM mytable
GROUP BY ne
假设 Clickhouse 支持变量:
-- Running group counter kept in a user variable; it starts at -1 so that the
-- first ne = 0 row bumps it to 0.
set @block := -1;
-- One output row per counter value: first date, last date and total change.
select
block as row,
min(date) as start,
max(date) as end,
sum(change)
from
(select
-- Increment the counter on every ne = 0 row; the dummy column exists only to
-- carry this side effect.
case when ne = 0 then @block:=@block+1 end as dummy,
-- Counter value attached to the current row as its group number.
@block as block,
t.*
-- NOTE(review): this derived table has no ORDER BY, so nothing in the query
-- pins the order in which rows reach the counter - confirm rows arrive in rn order.
from t) tt
group by block;
在大数据中无法解决(> 1 亿行)
-- Collect the table into three parallel arrays ordered by rn, cut them into
-- runs at every ne = 0 element, then unfold one output row per run.
SELECT
    run_dates[1] AS s,                  -- first date of the run
    run_dates[-1] AS e,                 -- last date of the run
    arraySum(run_changes) AS sm         -- total change over the run
FROM
(
    SELECT
        -- The lambda is true exactly where ne is 0; both arrays are cut with
        -- the same flags so the date and change runs stay aligned.
        arraySplit((elem, flag) -> (NOT flag), all_dates, all_ne) AS date_runs,
        arraySplit((elem, flag) -> (NOT flag), all_changes, all_ne) AS change_runs
    FROM
    (
        SELECT
            groupArray(date) AS all_dates,
            groupArray(ne) AS all_ne,
            groupArray(change) AS all_changes
        FROM
        (
            SELECT *
            FROM mytable
            ORDER BY rn ASC
        )
    )
)
ARRAY JOIN
    date_runs AS run_dates,
    change_runs AS run_changes
┌─s──────────┬─e──────────┬───────────sm─┐
│ 2008-12-07 │ 2009-04-26 │ -725166328 │
│ 2013-07-06 │ 2015-06-08 │ -15224653176 │
└────────────┴────────────┴──────────────┘
解决此任务的另一种方法:
-- group_starts: ascending rn values of the rows where ne = 0; each of these
-- rows opens a new group.
WITH
    (
        SELECT arraySort(groupArray(rn))
        FROM test_table
        WHERE ne = 0
    ) AS group_starts
SELECT
    argMin(date, rn) AS start,
    argMax(date, rn) AS end,
    sum(change)
FROM
(
    SELECT rn, date, change
    FROM test_table
    ORDER BY rn
)
-- Grouping key: position of the first start id strictly greater than the
-- row's rn, so all rows between two consecutive start ids share one key.
GROUP BY arrayFirstIndex(start_id -> rn < start_id, group_starts)
ORDER BY start
样本数据测试:
-- Self-contained check of the grouping query: the table is replaced by an
-- inline array of (rn, date, ne, change) tuples unrolled with arrayJoin.
-- group_starts: ascending rn values of the sample rows where ne = 0.
WITH
    (
        SELECT arraySort(groupArray(rn))
        FROM
        (
            SELECT item.1 AS rn, item.2 AS date, item.3 AS ne, item.4 AS change
            FROM
            (
                SELECT arrayJoin([
                    (0, toDate('2008-12-07'), 0, toInt64(-10330848398)),
                    (1, toDate('2009-04-14'), 1, toInt64(-61290)),
                    (2, toDate('2009-04-26'), 1, toInt64(9605743360)),
                    (3, toDate('2013-07-06'), 0, toInt64(-32028871920)),
                    (4, toDate('2014-01-12'), 1, toInt64(-42296164902)),
                    (5, toDate('2015-06-08'), 1, toInt64(59100383646)),
                    (6, toDate('2015-06-08'), 0, toInt64(101)),
                    (7, toDate('2015-06-09'), 0, toInt64(102)),
                    (8, toDate('2015-06-10'), 0, toInt64(103)),
                    (9, toDate('2015-06-11'), 1, toInt64(104))
                ]) AS item
            )
        )
        WHERE ne = 0
    ) AS group_starts
SELECT
    argMin(date, rn) AS start,
    argMax(date, rn) AS end,
    sum(change)
FROM
(
    SELECT item.1 AS rn, item.2 AS date, item.4 AS change
    FROM
    (
        SELECT arrayJoin([
            (0, toDate('2008-12-07'), 0, toInt64(-10330848398)),
            (1, toDate('2009-04-14'), 1, toInt64(-61290)),
            (2, toDate('2009-04-26'), 1, toInt64(9605743360)),
            (3, toDate('2013-07-06'), 0, toInt64(-32028871920)),
            (4, toDate('2014-01-12'), 1, toInt64(-42296164902)),
            (5, toDate('2015-06-08'), 1, toInt64(59100383646)),
            (6, toDate('2015-06-08'), 0, toInt64(101)),
            (7, toDate('2015-06-09'), 0, toInt64(102)),
            (8, toDate('2015-06-10'), 0, toInt64(103)),
            (9, toDate('2015-06-11'), 1, toInt64(104))
        ]) AS item
    )
    ORDER BY rn
)
-- Grouping key: position of the first start id strictly greater than the
-- row's rn, so all rows between two consecutive start ids share one key.
GROUP BY arrayFirstIndex(start_id -> rn < start_id, group_starts)
ORDER BY start
/* result
┌──────start─┬────────end─┬──sum(change)─┐
│ 2008-12-07 │ 2009-04-26 │ -725166328 │
│ 2013-07-06 │ 2015-06-08 │ -15224653176 │
│ 2015-06-08 │ 2015-06-08 │ 101 │
│ 2015-06-09 │ 2015-06-09 │ 102 │
│ 2015-06-10 │ 2015-06-11 │ 207 │
└────────────┴────────────┴──────────────┘
*/