如何在 Redshift 中实现窗口 运行 中位数?
How can I achieve a windowed running median in Redshift?
我正在尝试为按分区划分的值,沿时间顺序计算运行(累积)中位数。基本情况是我有这样一张表:
-- One measurement per client per day; the goal is a running median of
-- SomeData per ClientId, ordered chronologically by SomeDate.
create table "SomeData"
(
ClientId INT,
SomeData DECIMAL(10,2),
SomeDate TIMESTAMP
);
一些数据:
-- Fixture: client 1 has strictly increasing values (exercises even-length
-- medians: 1.5, 2.5); client 2 has ties (100s then 200s), so its running
-- median shifts from 100 to 150 to 200 as the 200s accumulate.
-- NOTE(review): '1 Jan 2000' relies on implicit string-to-timestamp casting;
-- ISO literals ('2000-01-01') would be more portable.
INSERT INTO "SomeData" (ClientId, SomeData, SomeDate) VALUES
(1, 1, '1 Jan 2000'),
(1, 2, '2 Jan 2000'),
(1, 3, '3 Jan 2000'),
(1, 4, '4 Jan 2000'),
(2, 100, '1 Jan 2000'),
(2, 100, '2 Jan 2000'),
(2, 100, '3 Jan 2000'),
(2, 200, '4 Jan 2000'),
(2, 200, '5 Jan 2000'),
(2, 200, '6 Jan 2000'),
(2, 200, '7 Jan 2000');
我需要一个 运行 中位数,按 ClientId
划分,按 SomeDate
排序。
基本上,我需要制作的是:
ClientId SomeDate Median of SomeData
1 "2000-01-01" 1.000
1 "2000-01-02" 1.500
1 "2000-01-03" 2.000
1 "2000-01-04" 2.500
2 "2000-01-01" 100.0
2 "2000-01-02" 100.0
2 "2000-01-03" 100.0
2 "2000-01-04" 100.0
2 "2000-01-05" 100.0
2 "2000-01-06" 150.0
2 "2000-01-07" 200.0
我在 PostgreSQL 9.x 中可以借助自定义的聚合中位数函数以多种方式实现这一点,但事实证明这在 Redshift 中很困难——Redshift 只提供一个聚合形式的 MEDIAN:
-- The naive attempt: MEDIAN as a window function with ORDER BY.
-- This is the query that FAILS on Redshift with:
--   ERROR: window specification should not contain frame clause and
--          order-by for window function median
-- (Redshift's windowed MEDIAN does not accept ORDER BY in the OVER clause,
-- so a *running* median cannot be expressed this way.)
SELECT ClientId, SomeDate, median(SomeData) OVER (PARTITION BY ClientId ORDER BY SomeDate)
FROM "SomeData" xout
ORDER BY ClientId, SomeDate;
在 Redshift 上运行上面的查询会报错:
ERROR: window specification should not contain frame clause and order-by for window function median
中位数可以用手动相关子查询替换回原始 table,但是 RedShift 似乎也不支持这些。
ERROR: This type of correlated subquery pattern is not supported due to internal error
这里有一些 fiddle 示例,它们在 PostgreSQL 中都能工作,但没有一个能在 Redshift 中运行。
在这一点上,我似乎需要将数据拉入内存并 do this in code,但如果这可以直接在 Redshift 中完成,我将不胜感激。
我想知道你是否可以用 nth_value()
:
-- Sketch of a running "median" via NTH_VALUE.
-- Fixes over the original sketch:
--   * NTH_VALUE takes TWO arguments (value expression, position); the
--     original passed only the position and would not parse.
--   * position (seqnum + 1) / 2 instead of seqnum / 2, which evaluates to
--     the invalid position 0 on the first row of each partition.
--   * Redshift requires an explicit frame clause for NTH_VALUE.
-- NOTE(review): even once it parses, the frame is ordered by SomeDate, so
-- this picks the middle row *by date*, not by value — it is only a true
-- median when SomeData happens to be sorted the same way as SomeDate
-- within each client. See the corrected join-based query further down.
SELECT ClientId, SomeDate,
       NTH_VALUE(SomeData, (seqnum + 1) / 2)
           OVER (PARTITION BY ClientId ORDER BY SomeDate
                 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS approx_median
FROM (SELECT s.*,
             -- running row count up to and including the current date
             COUNT(*) OVER (PARTITION BY ClientId ORDER BY SomeDate
                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS seqnum
      FROM SomeData s
     ) s
ORDER BY ClientId, SomeDate;
注意:使用 COUNT(*)
而不是 ROW_NUMBER()
需要一些时间来适应。
这是对您要查找的数量的精确计算。
本身并不性感,但它可以正确处理奇数与偶数长度的中位数。
-- Running median per partition, in chronological order — correct on Redshift.
--
-- Why the original row_numbers/medians version was wrong:
--   * every window was ordered by orderField (the date), so NTH_VALUE picked
--     the middle row *by date*, not the middle *value* — that is not a
--     median unless the data happens to arrive in sorted order;
--   * mod(mod1, 2) in the outer query re-applied mod to a value that is
--     already 0 or 1 — redundant.
--
-- Correct approach: pair every row with all rows at or before it in the same
-- partition, then let Redshift's aggregate MEDIAN() compute the median of
-- each cumulative set. This materializes O(n^2) rows per partition, but it
-- uses only constructs Redshift fully supports, and MEDIAN() itself handles
-- the odd/even interpolation.
SELECT cur.partitionField              -- field(s) the running median is partitioned by
     , cur.orderField                  -- chronological ordering field
     , MEDIAN(prev.medianField) AS median   -- median of all values up to cur.orderField
FROM data cur
INNER JOIN data prev
    ON prev.partitionField = cur.partitionField
   AND prev.orderField   <= cur.orderField   -- cumulative: current row and all earlier rows
GROUP BY cur.partitionField, cur.orderField
ORDER BY cur.partitionField, cur.orderField;
我认为 @GordonLinoff 提出的解决方案是不正确的,因为它没有按值对行排序——而中位数恰恰需要取按值排序后位于中间的那个。下面是一个在 Redshift 上可行的正确做法:
适用于红移:
-- Running median of SomeData per ClientId, ordered by SomeDate.
--
-- Fixes over the original:
--   * the correlated subquery never restricted B to A's client, so
--     "B.row_num BETWEEN 1 AND A.row_num" mixed rows from ALL clients
--     (row_num restarts at 1 for every client) — each median was computed
--     over the wrong set of rows;
--   * Redshift rejects this correlated-subquery pattern anyway (see the
--     error quoted in the question), so the lookup is expressed as a join;
--   * GROUP BY A.ClientId inside a scalar subquery was meaningless and
--     has been removed.
WITH CTE AS
(
    -- Number each client's rows chronologically: row_num n means
    -- "the n-th observation for this client".
    SELECT ClientId,
           ROW_NUMBER() OVER (PARTITION BY ClientId ORDER BY SomeDate ASC) AS row_num,
           SomeDate,
           SomeData
    FROM "SomeData"
)
SELECT A.SomeDate,
       A.SomeData,
       MEDIAN(B.SomeData) AS median   -- median of this client's rows 1..row_num
FROM CTE A
INNER JOIN CTE B
    ON B.ClientId = A.ClientId                 -- the correlation the original was missing
   AND B.row_num BETWEEN 1 AND A.row_num       -- all observations up to this one
GROUP BY A.ClientId, A.row_num, A.SomeDate, A.SomeData
ORDER BY A.ClientId, A.SomeDate;
我正在尝试为按分区划分的值,沿时间顺序计算运行(累积)中位数。基本情况是我有这样一张表:
-- One measurement per client per day; the goal is a running median of
-- SomeData per ClientId, ordered chronologically by SomeDate.
create table "SomeData"
(
ClientId INT,
SomeData DECIMAL(10,2),
SomeDate TIMESTAMP
);
一些数据:
-- Fixture: client 1 has strictly increasing values (exercises even-length
-- medians: 1.5, 2.5); client 2 has ties (100s then 200s), so its running
-- median shifts from 100 to 150 to 200 as the 200s accumulate.
-- NOTE(review): '1 Jan 2000' relies on implicit string-to-timestamp casting;
-- ISO literals ('2000-01-01') would be more portable.
INSERT INTO "SomeData" (ClientId, SomeData, SomeDate) VALUES
(1, 1, '1 Jan 2000'),
(1, 2, '2 Jan 2000'),
(1, 3, '3 Jan 2000'),
(1, 4, '4 Jan 2000'),
(2, 100, '1 Jan 2000'),
(2, 100, '2 Jan 2000'),
(2, 100, '3 Jan 2000'),
(2, 200, '4 Jan 2000'),
(2, 200, '5 Jan 2000'),
(2, 200, '6 Jan 2000'),
(2, 200, '7 Jan 2000');
我需要一个 运行 中位数,按 ClientId
划分,按 SomeDate
排序。
基本上,我需要制作的是:
ClientId SomeDate Median of SomeData
1 "2000-01-01" 1.000
1 "2000-01-02" 1.500
1 "2000-01-03" 2.000
1 "2000-01-04" 2.500
2 "2000-01-01" 100.0
2 "2000-01-02" 100.0
2 "2000-01-03" 100.0
2 "2000-01-04" 100.0
2 "2000-01-05" 100.0
2 "2000-01-06" 150.0
2 "2000-01-07" 200.0
我在 PostgreSQL 9.x 中可以借助自定义的聚合中位数函数以多种方式实现这一点,但事实证明这在 Redshift 中很困难——Redshift 只提供一个聚合形式的 MEDIAN:
-- The naive attempt: MEDIAN as a window function with ORDER BY.
-- This is the query that FAILS on Redshift with:
--   ERROR: window specification should not contain frame clause and
--          order-by for window function median
-- (Redshift's windowed MEDIAN does not accept ORDER BY in the OVER clause,
-- so a *running* median cannot be expressed this way.)
SELECT ClientId, SomeDate, median(SomeData) OVER (PARTITION BY ClientId ORDER BY SomeDate)
FROM "SomeData" xout
ORDER BY ClientId, SomeDate;
在 Redshift 上运行上面的查询会报错:
ERROR: window specification should not contain frame clause and order-by for window function median
中位数可以用手动相关子查询替换回原始 table,但是 RedShift 似乎也不支持这些。
ERROR: This type of correlated subquery pattern is not supported due to internal error
这里有一些 fiddle 示例,它们在 PostgreSQL 中都能工作,但没有一个能在 Redshift 中运行。
在这一点上,我似乎需要将数据拉入内存并 do this in code,但如果这可以直接在 Redshift 中完成,我将不胜感激。
我想知道你是否可以用 nth_value()
:
-- Sketch of a running "median" via NTH_VALUE.
-- Fixes over the original sketch:
--   * NTH_VALUE takes TWO arguments (value expression, position); the
--     original passed only the position and would not parse.
--   * position (seqnum + 1) / 2 instead of seqnum / 2, which evaluates to
--     the invalid position 0 on the first row of each partition.
--   * Redshift requires an explicit frame clause for NTH_VALUE.
-- NOTE(review): even once it parses, the frame is ordered by SomeDate, so
-- this picks the middle row *by date*, not by value — it is only a true
-- median when SomeData happens to be sorted the same way as SomeDate
-- within each client. See the corrected join-based query further down.
SELECT ClientId, SomeDate,
       NTH_VALUE(SomeData, (seqnum + 1) / 2)
           OVER (PARTITION BY ClientId ORDER BY SomeDate
                 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS approx_median
FROM (SELECT s.*,
             -- running row count up to and including the current date
             COUNT(*) OVER (PARTITION BY ClientId ORDER BY SomeDate
                            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS seqnum
      FROM SomeData s
     ) s
ORDER BY ClientId, SomeDate;
注意:使用 COUNT(*)
而不是 ROW_NUMBER()
需要一些时间来适应。
这是对您要查找的数量的精确计算。
本身并不性感,但它可以正确处理奇数与偶数长度的中位数。
-- Running median per partition, in chronological order — correct on Redshift.
--
-- Why the original row_numbers/medians version was wrong:
--   * every window was ordered by orderField (the date), so NTH_VALUE picked
--     the middle row *by date*, not the middle *value* — that is not a
--     median unless the data happens to arrive in sorted order;
--   * mod(mod1, 2) in the outer query re-applied mod to a value that is
--     already 0 or 1 — redundant.
--
-- Correct approach: pair every row with all rows at or before it in the same
-- partition, then let Redshift's aggregate MEDIAN() compute the median of
-- each cumulative set. This materializes O(n^2) rows per partition, but it
-- uses only constructs Redshift fully supports, and MEDIAN() itself handles
-- the odd/even interpolation.
SELECT cur.partitionField              -- field(s) the running median is partitioned by
     , cur.orderField                  -- chronological ordering field
     , MEDIAN(prev.medianField) AS median   -- median of all values up to cur.orderField
FROM data cur
INNER JOIN data prev
    ON prev.partitionField = cur.partitionField
   AND prev.orderField   <= cur.orderField   -- cumulative: current row and all earlier rows
GROUP BY cur.partitionField, cur.orderField
ORDER BY cur.partitionField, cur.orderField;
我认为 @GordonLinoff 提出的解决方案是不正确的,因为它没有按值对行排序——而中位数恰恰需要取按值排序后位于中间的那个。下面是一个在 Redshift 上可行的正确做法:
适用于红移:
-- Running median of SomeData per ClientId, ordered by SomeDate.
--
-- Fixes over the original:
--   * the correlated subquery never restricted B to A's client, so
--     "B.row_num BETWEEN 1 AND A.row_num" mixed rows from ALL clients
--     (row_num restarts at 1 for every client) — each median was computed
--     over the wrong set of rows;
--   * Redshift rejects this correlated-subquery pattern anyway (see the
--     error quoted in the question), so the lookup is expressed as a join;
--   * GROUP BY A.ClientId inside a scalar subquery was meaningless and
--     has been removed.
WITH CTE AS
(
    -- Number each client's rows chronologically: row_num n means
    -- "the n-th observation for this client".
    SELECT ClientId,
           ROW_NUMBER() OVER (PARTITION BY ClientId ORDER BY SomeDate ASC) AS row_num,
           SomeDate,
           SomeData
    FROM "SomeData"
)
SELECT A.SomeDate,
       A.SomeData,
       MEDIAN(B.SomeData) AS median   -- median of this client's rows 1..row_num
FROM CTE A
INNER JOIN CTE B
    ON B.ClientId = A.ClientId                 -- the correlation the original was missing
   AND B.row_num BETWEEN 1 AND A.row_num       -- all observations up to this one
GROUP BY A.ClientId, A.row_num, A.SomeDate, A.SomeData
ORDER BY A.ClientId, A.SomeDate;