如何在 Postgres 中按组查询随时间变化的累计计数
How to select cumulative counts by group over time in Postgres
我有两组随时间的累积计数,格式如下:
Date
Group
Cumulative Count
1/1/2020
A
1
1/2/2020
A
3
1/2/2020
B
1
1/3/2020
B
2
我想将这些数据重塑为这种格式:
Date
Group
Cumulative Count
1/1/2020
A
1
1/1/2020
B
0
1/2/2020
A
3
1/2/2020
B
1
1/3/2020
A
3
1/3/2020
B
2
这样我就可以让它在 Metabase 的堆叠面积图中准确显示 - 有什么建议吗?
在使用左连接获取组合数据集之前,您可以使用交叉连接生成所有可能的日期和组对,例如
由于您的数据集已经是累积计数,以 NULL 标识的缺失值可以借助 MAX 和 COALESCE 替换为最近的累积计数。
-- Build every (date, group) combination that can be formed from my_data,
-- attach the recorded count where a row exists, and fill the remaining
-- rows from the running maximum of the group (0 when nothing precedes).
WITH all_dates AS (
    SELECT DISTINCT "Date"
    FROM my_data
),
all_groups AS (
    SELECT DISTINCT "Group"
    FROM my_data
),
grid AS (
    SELECT
        all_dates."Date",
        all_groups."Group"
    FROM all_dates
    CROSS JOIN all_groups
)
SELECT
    grid."Date"::text,
    grid."Group",
    -- Recorded value first; otherwise the largest value seen so far in the
    -- group (the latest one, assuming the counts never decrease); else 0.
    COALESCE(
        recorded."CumulativeCount",
        MAX(recorded."CumulativeCount") OVER (
            PARTITION BY grid."Group"
            ORDER BY grid."Date"
        ),
        0
    ) AS cumulativecount  -- unquoted alias, so the column name is lowercase
FROM grid
LEFT JOIN my_data AS recorded
    ON recorded."Date" = grid."Date"
    AND recorded."Group" = grid."Group"
ORDER BY
    grid."Date"::text,
    grid."Group";
Date
Group
cumulativecount
2020-01-01
A
1
2020-01-01
B
0
2020-01-02
A
3
2020-01-02
B
1
2020-01-03
A
3
2020-01-03
B
2
View working demo on DB Fiddle
更新 1
如果您想为已有日期之间缺失的日期生成值,例如下一个日期是 1/7/2020,而您想填补 1/3/2020 之后的空缺,可以使用 generate_series 生成所有可能的日期,并用 MAX 获取最近的值。我在下面附上了一个 fiddle 和额外的示例数据:
模式(PostgreSQL v13)
-- Sample table: a calendar date, a one-character group label and the
-- cumulative count recorded for that group on that date.
create table my_data (
    "Date"            date,
    "Group"           character varying(1),
    "CumulativeCount" int
);
-- Sample rows for my_data.
-- Dates are written as ISO 8601 DATE literals so they load identically under
-- any DateStyle setting (a literal such as '1/2/2020' is read as January 2nd
-- or February 1st depending on MDY/DMY), and the counts are plain integer
-- literals instead of quoted strings.
INSERT INTO my_data
    ("Date", "Group", "CumulativeCount")
VALUES
    (DATE '2020-01-01', 'A', 1),
    (DATE '2020-01-02', 'A', 3),
    (DATE '2020-01-02', 'B', 1),
    (DATE '2020-01-03', 'B', 2),
    (DATE '2020-01-01', 'C', 2),
    (DATE '2020-01-07', 'C', 3);
查询#1
-- Same gap-fill as the distinct-date version, but the date axis is a
-- continuous daily series from the earliest to the latest "Date" in my_data,
-- so days on which no group has a row are filled as well.
WITH calendar AS (
    -- NOTE(review): the printed result shows values like
    -- '2020-01-01 00:00:00+00', i.e. the series is not a plain date here;
    -- casting it to date would presumably give plain dates - confirm before changing.
    SELECT
        GENERATE_SERIES(
            MIN("Date"),
            MAX("Date"),
            INTERVAL '1' DAY
        ) AS "Date"
    FROM my_data
),
all_groups AS (
    SELECT DISTINCT "Group"
    FROM my_data
),
grid AS (
    SELECT
        calendar."Date",
        all_groups."Group"
    FROM calendar
    CROSS JOIN all_groups
)
SELECT
    grid."Date"::text,
    grid."Group",
    -- Recorded value first; otherwise the largest value seen so far in the
    -- group (the latest one, assuming the counts never decrease); else 0.
    COALESCE(
        recorded."CumulativeCount",
        MAX(recorded."CumulativeCount") OVER (
            PARTITION BY grid."Group"
            ORDER BY grid."Date"
        ),
        0
    ) AS cumulativecount  -- unquoted alias, so the column name is lowercase
FROM grid
LEFT JOIN my_data AS recorded
    ON recorded."Date" = grid."Date"
    AND recorded."Group" = grid."Group"
ORDER BY
    grid."Date"::text,
    grid."Group";
Date
Group
cumulativecount
2020-01-01 00:00:00+00
A
1
2020-01-01 00:00:00+00
B
0
2020-01-01 00:00:00+00
C
2
2020-01-02 00:00:00+00
A
3
2020-01-02 00:00:00+00
B
1
2020-01-02 00:00:00+00
C
2
2020-01-03 00:00:00+00
A
3
2020-01-03 00:00:00+00
B
2
2020-01-03 00:00:00+00
C
2
2020-01-04 00:00:00+00
A
3
2020-01-04 00:00:00+00
B
2
2020-01-04 00:00:00+00
C
2
2020-01-05 00:00:00+00
A
3
2020-01-05 00:00:00+00
B
2
2020-01-05 00:00:00+00
C
2
2020-01-06 00:00:00+00
A
3
2020-01-06 00:00:00+00
B
2
2020-01-06 00:00:00+00
C
2
2020-01-07 00:00:00+00
A
3
2020-01-07 00:00:00+00
B
2
2020-01-07 00:00:00+00
C
3
您可以通过以下步骤实现:
注意:我创建了几个table。根据您的喜好随意使用子查询或 CTE。
首先,创建可能的 date-group
对:
-- Materialise every (date, group) pair that can be formed from tbl.
-- Pairs without a matching row in tbl get a count of 0.
CREATE TABLE ads AS
WITH dt AS (
    SELECT DISTINCT "date"
    FROM tbl
),
grp AS (
    SELECT DISTINCT "group"
    FROM tbl
),
pairs AS (
    SELECT
        dt."date",
        grp."group"
    FROM dt
    CROSS JOIN grp
)
SELECT
    pairs."date",
    pairs."group",
    COALESCE(src."cummulativecount", 0) AS cummulativecount
FROM pairs
LEFT JOIN tbl AS src
    ON src."date" = pairs."date"
    AND src."group" = pairs."group";
(以上步骤引用自 @ggordon 的回答)。现在,由于我们把补充记录的 cummulativeCount 全部设为 0,因此必须从实际 table 中取得最近的 cummulativeCount(例如,对于 A,会取 "1/2/2020" 的 3)。如果没有更早的值,则不取。
-- One row per group: the count stored in tbl on the latest tbl date that
-- lies strictly before some zero-count row of the same group in ads.
-- Groups with no such earlier date produce no row.
CREATE TABLE prev_cnt AS
WITH last_seen AS (
    SELECT
        src."group",
        MAX(src."date") AS m_date
    FROM ads AS gap
    INNER JOIN tbl AS src
        ON gap."group" = src."group"
    WHERE gap.cummulativecount = 0
      AND src."date" < gap."date"
    GROUP BY src."group"
)
SELECT
    t."group",
    t.cummulativecount
FROM tbl AS t
INNER JOIN last_seen
    ON t."group" = last_seen."group"
    AND t."date" = last_seen.m_date;
最后,将最近的值与派生的 ads 表结合起来:
-- Final result: every (date, group) pair from ads carries the count that tbl
-- recorded for the group on or before that date, or 0 when the group has no
-- row that early.
--
-- The lookup is done per row. Joining a single per-group value onto every
-- zero-count row (matching on "group" only) would also overwrite rows dated
-- before the group's first record, and rows in an earlier gap would receive
-- a later, larger count.
CREATE TABLE fin_ads AS
SELECT
    ads."date",
    ads."group",
    COALESCE(latest.cummulativecount, 0) AS cummulativecount
FROM ads
LEFT JOIN LATERAL (
    -- Most recent tbl row of this group that is not later than the ads date.
    SELECT t.cummulativecount
    FROM tbl AS t
    WHERE t."group" = ads."group"
      AND t."date" <= ads."date"
    ORDER BY t."date" DESC
    LIMIT 1
) AS latest
    ON TRUE;
table fin_ads
将是您想要的输出。
示例代码和数据在这里:DB<>fiddle
我有两组随时间的累积计数,格式如下:
Date | Group | Cumulative Count |
---|---|---|
1/1/2020 | A | 1 |
1/2/2020 | A | 3 |
1/2/2020 | B | 1 |
1/3/2020 | B | 2 |
我想将这些数据重塑为这种格式:
Date | Group | Cumulative Count |
---|---|---|
1/1/2020 | A | 1 |
1/1/2020 | B | 0 |
1/2/2020 | A | 3 |
1/2/2020 | B | 1 |
1/3/2020 | A | 3 |
1/3/2020 | B | 2 |
这样我就可以让它在 Metabase 的堆叠面积图中准确显示 - 有什么建议吗?
在使用左连接获取组合数据集之前,您可以使用交叉连接生成所有可能的日期和组对,例如
由于您的数据集已经是累积计数,以 NULL 标识的缺失值可以借助 MAX 和 COALESCE 替换为最近的累积计数。
-- Build every (date, group) combination that can be formed from my_data,
-- attach the recorded count where a row exists, and fill the remaining
-- rows from the running maximum of the group (0 when nothing precedes).
WITH all_dates AS (
    SELECT DISTINCT "Date"
    FROM my_data
),
all_groups AS (
    SELECT DISTINCT "Group"
    FROM my_data
),
grid AS (
    SELECT
        all_dates."Date",
        all_groups."Group"
    FROM all_dates
    CROSS JOIN all_groups
)
SELECT
    grid."Date"::text,
    grid."Group",
    -- Recorded value first; otherwise the largest value seen so far in the
    -- group (the latest one, assuming the counts never decrease); else 0.
    COALESCE(
        recorded."CumulativeCount",
        MAX(recorded."CumulativeCount") OVER (
            PARTITION BY grid."Group"
            ORDER BY grid."Date"
        ),
        0
    ) AS cumulativecount  -- unquoted alias, so the column name is lowercase
FROM grid
LEFT JOIN my_data AS recorded
    ON recorded."Date" = grid."Date"
    AND recorded."Group" = grid."Group"
ORDER BY
    grid."Date"::text,
    grid."Group";
Date | Group | cumulativecount |
---|---|---|
2020-01-01 | A | 1 |
2020-01-01 | B | 0 |
2020-01-02 | A | 3 |
2020-01-02 | B | 1 |
2020-01-03 | A | 3 |
2020-01-03 | B | 2 |
View working demo on DB Fiddle
更新 1
如果您想为已有日期之间缺失的日期生成值,例如下一个日期是 1/7/2020,而您想填补 1/3/2020 之后的空缺,可以使用 generate_series 生成所有可能的日期,并用 MAX 获取最近的值。我在下面附上了一个 fiddle 和额外的示例数据:
模式(PostgreSQL v13)
-- Sample table: a calendar date, a one-character group label and the
-- cumulative count recorded for that group on that date.
create table my_data (
    "Date"            date,
    "Group"           character varying(1),
    "CumulativeCount" int
);
-- Sample rows for my_data.
-- Dates are written as ISO 8601 DATE literals so they load identically under
-- any DateStyle setting (a literal such as '1/2/2020' is read as January 2nd
-- or February 1st depending on MDY/DMY), and the counts are plain integer
-- literals instead of quoted strings.
INSERT INTO my_data
    ("Date", "Group", "CumulativeCount")
VALUES
    (DATE '2020-01-01', 'A', 1),
    (DATE '2020-01-02', 'A', 3),
    (DATE '2020-01-02', 'B', 1),
    (DATE '2020-01-03', 'B', 2),
    (DATE '2020-01-01', 'C', 2),
    (DATE '2020-01-07', 'C', 3);
查询#1
-- Same gap-fill as the distinct-date version, but the date axis is a
-- continuous daily series from the earliest to the latest "Date" in my_data,
-- so days on which no group has a row are filled as well.
WITH calendar AS (
    -- NOTE(review): the printed result shows values like
    -- '2020-01-01 00:00:00+00', i.e. the series is not a plain date here;
    -- casting it to date would presumably give plain dates - confirm before changing.
    SELECT
        GENERATE_SERIES(
            MIN("Date"),
            MAX("Date"),
            INTERVAL '1' DAY
        ) AS "Date"
    FROM my_data
),
all_groups AS (
    SELECT DISTINCT "Group"
    FROM my_data
),
grid AS (
    SELECT
        calendar."Date",
        all_groups."Group"
    FROM calendar
    CROSS JOIN all_groups
)
SELECT
    grid."Date"::text,
    grid."Group",
    -- Recorded value first; otherwise the largest value seen so far in the
    -- group (the latest one, assuming the counts never decrease); else 0.
    COALESCE(
        recorded."CumulativeCount",
        MAX(recorded."CumulativeCount") OVER (
            PARTITION BY grid."Group"
            ORDER BY grid."Date"
        ),
        0
    ) AS cumulativecount  -- unquoted alias, so the column name is lowercase
FROM grid
LEFT JOIN my_data AS recorded
    ON recorded."Date" = grid."Date"
    AND recorded."Group" = grid."Group"
ORDER BY
    grid."Date"::text,
    grid."Group";
Date | Group | cumulativecount |
---|---|---|
2020-01-01 00:00:00+00 | A | 1 |
2020-01-01 00:00:00+00 | B | 0 |
2020-01-01 00:00:00+00 | C | 2 |
2020-01-02 00:00:00+00 | A | 3 |
2020-01-02 00:00:00+00 | B | 1 |
2020-01-02 00:00:00+00 | C | 2 |
2020-01-03 00:00:00+00 | A | 3 |
2020-01-03 00:00:00+00 | B | 2 |
2020-01-03 00:00:00+00 | C | 2 |
2020-01-04 00:00:00+00 | A | 3 |
2020-01-04 00:00:00+00 | B | 2 |
2020-01-04 00:00:00+00 | C | 2 |
2020-01-05 00:00:00+00 | A | 3 |
2020-01-05 00:00:00+00 | B | 2 |
2020-01-05 00:00:00+00 | C | 2 |
2020-01-06 00:00:00+00 | A | 3 |
2020-01-06 00:00:00+00 | B | 2 |
2020-01-06 00:00:00+00 | C | 2 |
2020-01-07 00:00:00+00 | A | 3 |
2020-01-07 00:00:00+00 | B | 2 |
2020-01-07 00:00:00+00 | C | 3 |
您可以通过以下步骤实现:
注意:我创建了几个table。根据您的喜好随意使用子查询或 CTE。
首先,创建可能的 date-group
对:
-- Materialise every (date, group) pair that can be formed from tbl.
-- Pairs without a matching row in tbl get a count of 0.
CREATE TABLE ads AS
WITH dt AS (
    SELECT DISTINCT "date"
    FROM tbl
),
grp AS (
    SELECT DISTINCT "group"
    FROM tbl
),
pairs AS (
    SELECT
        dt."date",
        grp."group"
    FROM dt
    CROSS JOIN grp
)
SELECT
    pairs."date",
    pairs."group",
    COALESCE(src."cummulativecount", 0) AS cummulativecount
FROM pairs
LEFT JOIN tbl AS src
    ON src."date" = pairs."date"
    AND src."group" = pairs."group";
(以上步骤引用自 @ggordon 的回答)。现在,由于我们把补充记录的 cummulativeCount 全部设为 0,因此必须从实际 table 中取得最近的 cummulativeCount(例如,对于 A,会取 "1/2/2020" 的 3)。如果没有更早的值,则不取。
-- One row per group: the count stored in tbl on the latest tbl date that
-- lies strictly before some zero-count row of the same group in ads.
-- Groups with no such earlier date produce no row.
CREATE TABLE prev_cnt AS
WITH last_seen AS (
    SELECT
        src."group",
        MAX(src."date") AS m_date
    FROM ads AS gap
    INNER JOIN tbl AS src
        ON gap."group" = src."group"
    WHERE gap.cummulativecount = 0
      AND src."date" < gap."date"
    GROUP BY src."group"
)
SELECT
    t."group",
    t.cummulativecount
FROM tbl AS t
INNER JOIN last_seen
    ON t."group" = last_seen."group"
    AND t."date" = last_seen.m_date;
最后,将最近的值与派生的 ads 表结合起来:
-- Final result: every (date, group) pair from ads carries the count that tbl
-- recorded for the group on or before that date, or 0 when the group has no
-- row that early.
--
-- The lookup is done per row. Joining a single per-group value onto every
-- zero-count row (matching on "group" only) would also overwrite rows dated
-- before the group's first record, and rows in an earlier gap would receive
-- a later, larger count.
CREATE TABLE fin_ads AS
SELECT
    ads."date",
    ads."group",
    COALESCE(latest.cummulativecount, 0) AS cummulativecount
FROM ads
LEFT JOIN LATERAL (
    -- Most recent tbl row of this group that is not later than the ads date.
    SELECT t.cummulativecount
    FROM tbl AS t
    WHERE t."group" = ads."group"
      AND t."date" <= ads."date"
    ORDER BY t."date" DESC
    LIMIT 1
) AS latest
    ON TRUE;
table fin_ads
将是您想要的输出。
示例代码和数据在这里:DB<>fiddle