如何对多行的列值求和?

How to sum the values of a column for several rows?

我有这样一张表,想对其中若干行的 'change' 列求和(更准确地说,是从 'ne' 值为 0 的那一行开始,一直累加到下一个 'ne' 为 0 的行之前,不包含后面这一行本身)。任何解答都将不胜感激。

┌─rn─┬───────date─┬─ne─┬───────change─┐
│  0 │ 2008-12-07 │  0 │ -10330848398 │
│  1 │ 2009-04-14 │  1 │       -61290 │
│  2 │ 2009-04-26 │  1 │   9605743360 │
│  3 │ 2013-07-06 │  0 │ -32028871920 │
│  4 │ 2014-01-12 │  1 │ -42296164902 │
│  5 │ 2015-06-08 │  1 │  59100383646 │
└────┴────────────┴────┴──────────────┘

我们期望的结果是这样的。

row    start        end         sum(change) 
--------------------------------------------------
0 | 2008-12-07 | 2009-04-26 | -725,166,328
--------------------------------------------------
1 | 2013-07-06 | 2015-06-08 | -15,224,653,176
--------------------------------------------------

这是一个典型的"间隙与孤岛"(gaps and islands)问题。标准的解法是使用窗口函数,但就目前而言,ClickHouse 还不支持窗口函数。

这是一种使用子查询模拟条件 window 总和的方法:

-- Gaps-and-islands without window functions: every row with ne = 0 opens a
-- new group that runs up to (but not including) the next ne = 0 row. Each
-- group is reduced to its first date, last date and the sum of change.
-- NOTE(review): relies on a correlated scalar subquery — confirm that the
-- target engine supports it.
select
    min(grouped.date) as start_date,
    max(grouped.date) as end_date,
    sum(grouped.change) as sum_change
from (
    select
        cur.date,
        cur.change,
        -- Group number = how many group openers (ne = 0) occur up to and
        -- including the current row. Rows are ordered by date with rn as a
        -- tie-breaker: comparing on date alone would also count an opener
        -- that shares the current row's date but comes after it, pushing the
        -- current row into the wrong group.
        -- Assumes rn is unique per row, as in the sample data — TODO confirm.
        (
            select count(*)
            from mytable as prev
            where prev.ne = 0
              and (
                  prev.date < cur.date
                  or (prev.date = cur.date and prev.rn <= cur.rn)
              )
        ) as grp
    from mytable as cur
) as grouped
group by grouped.grp
-- Make the output order deterministic: groups in chronological order.
order by start_date

子查询统计的是:从表的第一行到当前行为止,共有多少行满足 ne = 0。这个计数值就确定了每条记录所属的分组,剩下要做的只是按分组进行聚合。

如果你可以使用 window 函数,你会这样表述:

-- Gaps-and-islands with a window function: a running count of ne = 0 rows
-- assigns a group number to every row; each group is then reduced to its
-- first date, last date and the sum of change.
select
    min(grouped.date) as start_date,
    max(grouped.date) as end_date,
    sum(grouped.change) as sum_change
from (
    select
        src.date,
        src.change,
        -- An explicit ROWS frame plus the rn tie-breaker keeps the running
        -- count strictly row-by-row. With only "order by date" the default
        -- RANGE frame includes every peer row sharing the same date, so an
        -- opener later on the same day would be counted too early.
        -- Assumes rn is unique per row, as in the sample data — TODO confirm.
        sum(case when src.ne = 0 then 1 else 0 end) over (
            order by src.date, src.rn
            rows between unbounded preceding and current row
        ) as grp
    from mytable as src
) as grouped
group by grouped.grp
-- Make the output order deterministic: groups in chronological order.
order by start_date

SELECT ne, MIN(date) AS start, MAX(date) AS end, SUM(change) AS change FROM mytable GROUP BY ne

假设 Clickhouse 支持变量:

-- Numbers the groups with a session variable: @block is incremented on every
-- row where ne = 0, each row is tagged with the counter's current value, and
-- the outer query aggregates per tag.
-- The counter starts at -1 so that the first ne = 0 row yields group 0.
set @block := -1;
select 
    block as row,
    min(date) as start,
    max(date) as end,
    sum(change)
from
    -- NOTE(review): the derived table has no ORDER BY, so this presumably
    -- relies on rows being read in the intended order and on the select-list
    -- expressions being evaluated left to right (dummy before block) —
    -- confirm both for the target engine.
    (select  
        -- dummy exists only for its side effect of advancing @block; the
        -- outer query never reads it.
        case when ne = 0 then @block:=@block+1 end as dummy,
        -- Tag the row with the group counter as it stands for this row.
        @block as block,
        t.*
    from t) tt
group by block;

下面这种方法不适用于大数据量(超过 1 亿行)的场景:

-- Collects the table into three parallel arrays (rows read in rn order), cuts
-- the date and change arrays into runs that each start at an ne = 0 row, and
-- emits one result row per run: first date, last date and the sum of change.
SELECT
    run_dates[1] AS s,
    run_dates[-1] AS e,
    arraySum(run_changes) AS sm
FROM
(
    SELECT
        -- The lambda is non-zero exactly where ne is 0; both arrays are split
        -- with the same flags so their runs stay aligned.
        arraySplit((val, flag) -> (flag = 0), all_dates, all_flags) AS date_runs,
        arraySplit((val, flag) -> (flag = 0), all_changes, all_flags) AS change_runs
    FROM
    (
        SELECT
            groupArray(date) AS all_dates,
            groupArray(ne) AS all_flags,
            groupArray(change) AS all_changes
        FROM
        (
            SELECT
                rn,
                date,
                ne,
                change
            FROM mytable
            ORDER BY rn ASC
        )
    )
)
-- Unfold the two lists of runs in lockstep: one output row per run.
ARRAY JOIN
    date_runs AS run_dates,
    change_runs AS run_changes

┌─s──────────┬─e──────────┬───────────sm─┐
│ 2008-12-07 │ 2009-04-26 │   -725166328 │
│ 2013-07-06 │ 2015-06-08 │ -15224653176 │
└────────────┴────────────┴──────────────┘

解决此任务的另一种方法:

-- group_start_id: ascending rn values of the rows that open a group (ne = 0).
WITH (
    SELECT arraySort(groupArray(rn))
    FROM test_table
    WHERE ne = 0
) AS group_start_id
SELECT
    argMin(date, rn) AS start,
    argMax(date, rn) AS end,
    sum(change)
FROM
(
    SELECT
        rn,
        date,
        change
    FROM test_table
    ORDER BY rn ASC
)
-- Grouping key: position in group_start_id of the first group start that lies
-- after the row, so all rows of one run share a key.
GROUP BY arrayFirstIndex(next_start -> next_start > rn, group_start_id)
ORDER BY start ASC

样本数据测试:

-- Self-contained check of the arrayFirstIndex grouping: the ten fixture rows
-- are inlined twice, once to collect the group starts and once as the data
-- being aggregated. Tuple layout: (rn, date, ne, change).
-- group_start_id: ascending rn values of the fixture rows with ne = 0.
WITH (
    SELECT arraySort(groupArray(rn))
    FROM
    (
        SELECT
            fixture.1 AS rn,
            fixture.3 AS ne
        FROM
        (
            SELECT arrayJoin([
                (0, toDate('2008-12-07'), 0, toInt64(-10330848398)),
                (1, toDate('2009-04-14'), 1, toInt64(-61290)),
                (2, toDate('2009-04-26'), 1, toInt64(9605743360)),
                (3, toDate('2013-07-06'), 0, toInt64(-32028871920)),
                (4, toDate('2014-01-12'), 1, toInt64(-42296164902)),
                (5, toDate('2015-06-08'), 1, toInt64(59100383646)),
                (6, toDate('2015-06-08'), 0, toInt64(101)),
                (7, toDate('2015-06-09'), 0, toInt64(102)),
                (8, toDate('2015-06-10'), 0, toInt64(103)),
                (9, toDate('2015-06-11'), 1, toInt64(104))
            ]) AS fixture
        )
    )
    WHERE ne = 0
) AS group_start_id
SELECT
    argMin(date, rn) AS start,
    argMax(date, rn) AS end,
    sum(change)
FROM
(
    SELECT
        fixture.1 AS rn,
        fixture.2 AS date,
        fixture.4 AS change
    FROM
    (
        SELECT arrayJoin([
            (0, toDate('2008-12-07'), 0, toInt64(-10330848398)),
            (1, toDate('2009-04-14'), 1, toInt64(-61290)),
            (2, toDate('2009-04-26'), 1, toInt64(9605743360)),
            (3, toDate('2013-07-06'), 0, toInt64(-32028871920)),
            (4, toDate('2014-01-12'), 1, toInt64(-42296164902)),
            (5, toDate('2015-06-08'), 1, toInt64(59100383646)),
            (6, toDate('2015-06-08'), 0, toInt64(101)),
            (7, toDate('2015-06-09'), 0, toInt64(102)),
            (8, toDate('2015-06-10'), 0, toInt64(103)),
            (9, toDate('2015-06-11'), 1, toInt64(104))
        ]) AS fixture
    )
    ORDER BY rn ASC
)
-- Grouping key: position in group_start_id of the first group start that lies
-- after the row, so all rows of one run share a key.
GROUP BY arrayFirstIndex(next_start -> next_start > rn, group_start_id)
ORDER BY start ASC
/* result
┌──────start─┬────────end─┬──sum(change)─┐
│ 2008-12-07 │ 2009-04-26 │   -725166328 │
│ 2013-07-06 │ 2015-06-08 │ -15224653176 │
│ 2015-06-08 │ 2015-06-08 │          101 │
│ 2015-06-09 │ 2015-06-09 │          102 │
│ 2015-06-10 │ 2015-06-11 │          207 │
└────────────┴────────────┴──────────────┘
*/