在 PostgreSQL 中有效地计算滚动总和
Computing rolling sums efficiently in PostgreSQL
假设我有一组交易(购买)和一组客户的日期,我想计算滚动 x 天的购买金额和购买次数总和 客户 在同一个 window 中。我已经使用 window 函数让它工作,但我必须填写客户未进行任何购买的日期。这样做时,我使用的是笛卡尔积。是否有更有效的方法,使其随着客户数量和时间 window 的增加而更具可扩展性?
编辑: 如评论中所述,我使用的是 PostgreSQL v9.3.
这是示例数据(请注意,某些客户在给定日期可能有 0 次、1 次或多次购买):
| id | cust_id | txn_date | amount |
|----|---------|------------|--------|
| 1 | 123 | 2017-08-17 | 10 |
| 2 | 123 | 2017-08-17 | 5 |
| 3 | 123 | 2017-08-18 | 5 |
| 4 | 123 | 2017-08-20 | 50 |
| 5 | 123 | 2017-08-21 | 100 |
| 6 | 456 | 2017-08-01 | 5 |
| 7 | 456 | 2017-08-01 | 5 |
| 8 | 456 | 2017-08-01 | 5 |
| 9 | 456 | 2017-08-30 | 5 |
| 10 | 456 | 2017-08-01 | 1000 |
| 11 | 789 | 2017-08-15 | 1000 |
| 12 | 789 | 2017-08-30 | 1000 |
这是所需的输出:
| cust_id | txn_date | sum_dly_txns | tot_txns_7d | cnt_txns_7d |
|---------|------------|--------------|-------------|-------------|
| 123 | 2017-08-17 | 15 | 15 | 2 |
| 123 | 2017-08-18 | 5 | 20 | 3 |
| 123 | 2017-08-20 | 50 | 70 | 4 |
| 123 | 2017-08-21 | 100 | 170 | 5 |
| 456 | 2017-08-01 | 1015 | 1015 | 4 |
| 456 | 2017-08-30 | 5 | 5 | 1 |
| 789 | 2017-08-15 | 1000 | 1000 | 1 |
| 789 | 2017-08-30 | 1000 | 1000 | 1 |
这里 SQL 生成所需的总数:
SELECT *
FROM (
-- One row per day per user
WITH daily_txns AS (
SELECT
t.cust_id
,t.txn_date AS txn_date
,SUM(t.amount) AS sum_dly_txns
,COUNT(t.id) AS cnt_dly_txns
FROM transactions t
GROUP BY t.cust_id, txn_date
),
-- Every possible transaction date for every user
dummydates AS (
SELECT txn_date, uids.cust_id
FROM (
SELECT generate_series(
timestamp '2017-08-01'
,timestamp '2017-08-30'
,interval '1 day')::date
) d(txn_date)
CROSS JOIN (SELECT DISTINCT cust_id FROM daily_txns) uids
),
txns_dummied AS (
SELECT
d.cust_id
,d.txn_date
,COALESCE(sum_dly_txns,0) AS sum_dly_txns
,COALESCE(cnt_dly_txns,0) AS cnt_dly_txns
FROM dummydates d
LEFT JOIN daily_txns dx
ON d.txn_date = dx.txn_date
AND d.cust_id = dx.cust_id
ORDER BY d.txn_date, d.cust_id
)
SELECT
cust_id
,txn_date
,sum_dly_txns
,SUM(COALESCE(sum_dly_txns,0)) OVER w AS tot_txns_7d
,SUM(cnt_dly_txns) OVER w AS cnt_txns_7d
FROM txns_dummied
WINDOW w AS (
PARTITION BY cust_id
ORDER BY txn_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW -- 7d moving window
)
ORDER BY cust_id, txn_date
) xfers
WHERE sum_dly_txns > 0 -- Omit dates with no transactions
;
你想写 RANGE '6 days' PRECEEDING
而不是 ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
吗?
这一定是您要找的:
SELECT DISTINCT
cust_id
,txn_date
,SUM(amount) OVER (PARTITION BY cust_id, txn_date) sum_dly_txns
,SUM(amount) OVER (PARTITION BY cust_id ORDER BY txn_date RANGE '6 days' PRECEDING)
,COUNT(*) OVER (PARTITION BY cust_id ORDER BY txn_date RANGE '6 days' PRECEDING)
from transactions
ORDER BY cust_id, txn_date
编辑:由于您使用的是旧版本(我在我的 postgresql 11 上测试了上面的版本),上面的观点没有多大意义,所以您需要旧版本-fashioned SQL(即没有 window 函数)。
它的效率有点低,但做得还不错。
WITH daily_txns AS (
SELECT
t.cust_id
,t.txn_date AS txn_date
,SUM(t.amount) AS sum_dly_txns
,COUNT(t.id) AS cnt_dly_txns
FROM transactions t
GROUP BY t.cust_id, txn_date
)
SELECT t1.cust_id, t1.txn_date, t1.sum_dly_txns, SUM(t2.sum_dly_txns), SUM(t2.cnt_dly_txns)
from daily_txns t1
join daily_txns t2 ON t1.cust_id = t2.cust_id and t2.txn_date BETWEEN t1.txn_date - 7 and t1.txn_date
group by t1.cust_id, t1.txn_date, t1.sum_dly_txns
order by t1.cust_id, t1.txn_date
假设我有一组交易(购买)和一组客户的日期,我想计算滚动 x 天的购买金额和购买次数总和 客户 在同一个 window 中。我已经使用 window 函数让它工作,但我必须填写客户未进行任何购买的日期。这样做时,我使用的是笛卡尔积。是否有更有效的方法,使其随着客户数量和时间 window 的增加而更具可扩展性?
编辑: 如评论中所述,我使用的是 PostgreSQL v9.3.
这是示例数据(请注意,某些客户在给定日期可能有 0 次、1 次或多次购买):
| id | cust_id | txn_date | amount |
|----|---------|------------|--------|
| 1 | 123 | 2017-08-17 | 10 |
| 2 | 123 | 2017-08-17 | 5 |
| 3 | 123 | 2017-08-18 | 5 |
| 4 | 123 | 2017-08-20 | 50 |
| 5 | 123 | 2017-08-21 | 100 |
| 6 | 456 | 2017-08-01 | 5 |
| 7 | 456 | 2017-08-01 | 5 |
| 8 | 456 | 2017-08-01 | 5 |
| 9 | 456 | 2017-08-30 | 5 |
| 10 | 456 | 2017-08-01 | 1000 |
| 11 | 789 | 2017-08-15 | 1000 |
| 12 | 789 | 2017-08-30 | 1000 |
这是所需的输出:
| cust_id | txn_date | sum_dly_txns | tot_txns_7d | cnt_txns_7d |
|---------|------------|--------------|-------------|-------------|
| 123 | 2017-08-17 | 15 | 15 | 2 |
| 123 | 2017-08-18 | 5 | 20 | 3 |
| 123 | 2017-08-20 | 50 | 70 | 4 |
| 123 | 2017-08-21 | 100 | 170 | 5 |
| 456 | 2017-08-01 | 1015 | 1015 | 4 |
| 456 | 2017-08-30 | 5 | 5 | 1 |
| 789 | 2017-08-15 | 1000 | 1000 | 1 |
| 789 | 2017-08-30 | 1000 | 1000 | 1 |
这里 SQL 生成所需的总数:
SELECT *
FROM (
-- One row per day per user
WITH daily_txns AS (
SELECT
t.cust_id
,t.txn_date AS txn_date
,SUM(t.amount) AS sum_dly_txns
,COUNT(t.id) AS cnt_dly_txns
FROM transactions t
GROUP BY t.cust_id, txn_date
),
-- Every possible transaction date for every user
dummydates AS (
SELECT txn_date, uids.cust_id
FROM (
SELECT generate_series(
timestamp '2017-08-01'
,timestamp '2017-08-30'
,interval '1 day')::date
) d(txn_date)
CROSS JOIN (SELECT DISTINCT cust_id FROM daily_txns) uids
),
txns_dummied AS (
SELECT
d.cust_id
,d.txn_date
,COALESCE(sum_dly_txns,0) AS sum_dly_txns
,COALESCE(cnt_dly_txns,0) AS cnt_dly_txns
FROM dummydates d
LEFT JOIN daily_txns dx
ON d.txn_date = dx.txn_date
AND d.cust_id = dx.cust_id
ORDER BY d.txn_date, d.cust_id
)
SELECT
cust_id
,txn_date
,sum_dly_txns
,SUM(COALESCE(sum_dly_txns,0)) OVER w AS tot_txns_7d
,SUM(cnt_dly_txns) OVER w AS cnt_txns_7d
FROM txns_dummied
WINDOW w AS (
PARTITION BY cust_id
ORDER BY txn_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW -- 7d moving window
)
ORDER BY cust_id, txn_date
) xfers
WHERE sum_dly_txns > 0 -- Omit dates with no transactions
;
你想写 RANGE '6 days' PRECEEDING
而不是 ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
吗?
这一定是您要找的:
SELECT DISTINCT
cust_id
,txn_date
,SUM(amount) OVER (PARTITION BY cust_id, txn_date) sum_dly_txns
,SUM(amount) OVER (PARTITION BY cust_id ORDER BY txn_date RANGE '6 days' PRECEDING)
,COUNT(*) OVER (PARTITION BY cust_id ORDER BY txn_date RANGE '6 days' PRECEDING)
from transactions
ORDER BY cust_id, txn_date
编辑:由于您使用的是旧版本(我在我的 postgresql 11 上测试了上面的版本),上面的观点没有多大意义,所以您需要旧版本-fashioned SQL(即没有 window 函数)。
它的效率有点低,但做得还不错。
WITH daily_txns AS (
SELECT
t.cust_id
,t.txn_date AS txn_date
,SUM(t.amount) AS sum_dly_txns
,COUNT(t.id) AS cnt_dly_txns
FROM transactions t
GROUP BY t.cust_id, txn_date
)
SELECT t1.cust_id, t1.txn_date, t1.sum_dly_txns, SUM(t2.sum_dly_txns), SUM(t2.cnt_dly_txns)
from daily_txns t1
join daily_txns t2 ON t1.cust_id = t2.cust_id and t2.txn_date BETWEEN t1.txn_date - 7 and t1.txn_date
group by t1.cust_id, t1.txn_date, t1.sum_dly_txns
order by t1.cust_id, t1.txn_date