Redshift PostgreSQL Distinct ON 运算符
Redshift PostgreSQL Distinct ON Operator
我有一个数据集,我想对其进行解析以查看多点触控归因。该数据集由响应营销活动的潜在客户及其营销来源组成。
每个潜在客户都可以响应多个营销活动,我想在同一个 table 中获得他们的第一个营销来源和最后一个营销来源。
我想我可以创建两个 tables 并使用来自两者的 select 语句。
第一条语句将创建一个 table,其中包含每个人最近一次的营销来源(使用电子邮件作为其唯一 ID)。
-- One row per email: the most recent campaign response on or after 1/1/2016.
-- DISTINCT ON is PostgreSQL-specific syntax. It keeps the first row of each
-- email group in ORDER BY order, so the sort has to lead with email;
-- date DESC then makes that first row the latest response.
CREATE TABLE temp.multitouch1 AS (
    SELECT DISTINCT ON (email)
        email,
        date,
        market_source AS last_source
    FROM sf.campaignmember
    WHERE date >= '1/1/2016'
    ORDER BY email, date DESC
);
然后我会再创建一个 table,同样按电子邮件去重,但这次保留的是第一个营销来源。
-- One row per email: the earliest campaign response on or after 1/1/2016.
-- DISTINCT ON is PostgreSQL-specific syntax. It keeps the first row of each
-- email group in ORDER BY order, so the sort has to lead with email;
-- date ASC then makes that first row the earliest response.
CREATE TABLE temp.multitouch2 AS (
    SELECT DISTINCT ON (email)
        email,
        date,
        market_source AS first_source
    FROM sf.campaignmember
    WHERE date >= '1/1/2016'
    ORDER BY email, date ASC
);
最后,我只需 select 电子邮件,并将第一个和最后一个营销来源分别放入各自的列中。
-- Combine the two helper tables into one row per email carrying both the
-- last and the first marketing source. a.date comes from temp.multitouch1.
-- The second table is temp.multitouch2 (the name used when it was created).
SELECT
    a.email,
    a.last_source,
    b.first_source,
    a.date
FROM temp.multitouch1 a
LEFT JOIN temp.multitouch2 b
    ON b.email = a.email
由于 distinct on 在 redshift 的 postgresql 版本上不起作用,我希望有人有想法以另一种方式解决这个问题。
编辑 2/22:补充一些背景信息:我处理的是潜在客户以及他们已经响应的营销活动。每条记录都是一次 "campaign response",每个人都可以有来自多个来源的多次活动响应。我想写一个 select 语句,按人去重,然后分别用单独的列给出他们响应的第一个 campaign/marketing 来源和最后一个 campaign/marketing 来源。
编辑 2/24:理想的输出是 table,有 4 列:电子邮件、last_source、first_source、日期。
对于只有 1 条活动成员记录的人,第一个和最后一个来源列是相同的;对于拥有多于 1 条活动成员记录的人,这两列可以不同。
您可以使用传统的左连接(LEFT JOIN)写法来取每组的最大值/最小值。
-- First and last response per email via LEFT JOIN anti-joins.
--   c2 = any response of the same email that sorts before c1
--   c3 = any response of the same email that sorts after c1
-- A c1 row with no c2 match is the first response; with no c3 match it is
-- the last one. Rows are ordered by (date, id), so id only breaks date ties.
SELECT DISTINCT
    c1.email,
    c1.date,
    c1.market_source
FROM sf.campaignmember c1
LEFT JOIN sf.campaignmember c2
    ON c2.email = c1.email
    -- The date window for c2 belongs in ON: placed in WHERE it would discard
    -- the unmatched (NULL) rows that the anti-join depends on.
    AND c2.date >= '1/1/2016'
    AND (c2.date < c1.date OR (c2.date = c1.date AND c2.id < c1.id))
LEFT JOIN sf.campaignmember c3
    ON c3.email = c1.email
    -- No explicit window needed here: c3.date >= c1.date >= '1/1/2016'.
    AND (c3.date > c1.date OR (c3.date = c1.date AND c3.id > c1.id))
WHERE c1.date >= '1/1/2016'
    -- DISTINCT above collapses the copies produced when c1 matches several
    -- c2 or c3 rows.
    AND (c2.email IS NULL OR c3.email IS NULL)
这里假设您有一个唯一的 ID 列;如果(日期,电子邮件)的组合本身就是唯一的,则不需要 ID 列。
我相信你可以像这样在 case 表达式中使用 row_number():
-- One output row per email: first/last market_source plus the matching dates.
WITH tagged AS (
    -- market_source is kept only on the earliest row (first_source) and on
    -- the latest row (last_source) of each email; it is NULL everywhere else.
    SELECT
        email,
        date,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date ASC) = 1
                THEN market_source
        END AS first_source,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date DESC) = 1
                THEN market_source
        END AS last_source
    FROM sf.campaignmember
    WHERE date >= '2016-01-01'
)
SELECT
    email,
    MIN(first_source) AS first_source,
    MIN(date) AS first_date,
    MAX(last_source) AS last_source,
    MAX(date) AS last_date
FROM tagged
-- Only the tagged rows survive, so MIN/MAX work on at most two rows per email.
WHERE first_source IS NOT NULL
    OR last_source IS NOT NULL
GROUP BY email
在此处测试:SQL Fiddle
PostgreSQL 9.3 架构设置:
-- Minimal fixture: three responses for a@a and a single response for b@b.
CREATE TABLE campaignmember (
    email         varchar(3),
    date          timestamp,
    market_source varchar(1)
);

INSERT INTO campaignmember (email, date, market_source)
VALUES
    ('a@a', '2016-01-02 00:00:00', 'x'),
    ('a@a', '2016-01-03 00:00:00', 'y'),
    ('a@a', '2016-01-04 00:00:00', 'z'),
    ('b@b', '2016-01-02 00:00:00', 'x');
查询 1:
-- One output row per email: first/last market_source plus the matching dates.
WITH tagged AS (
    -- market_source is kept only on the earliest row (first_source) and on
    -- the latest row (last_source) of each email; it is NULL everywhere else.
    SELECT
        email,
        date,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date ASC) = 1
                THEN market_source
        END AS first_source,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date DESC) = 1
                THEN market_source
        END AS last_source
    FROM campaignmember
    WHERE date >= '2016-01-01'
)
SELECT
    email,
    MIN(first_source) AS first_source,
    MIN(date) AS first_date,
    MAX(last_source) AS last_source,
    MAX(date) AS last_date
FROM tagged
-- Only the tagged rows survive, so MIN/MAX work on at most two rows per email.
WHERE first_source IS NOT NULL
    OR last_source IS NOT NULL
GROUP BY email
| email | first_source | first_date | last_source | last_date |
|-------|--------------|---------------------------|-------------|---------------------------|
| a@a | x | January, 02 2016 00:00:00 | z | January, 04 2016 00:00:00 |
| b@b | x | January, 02 2016 00:00:00 | x | January, 02 2016 00:00:00 |
应要求再做一个小扩展:统计每个人的接触(响应)次数
-- One output row per email: first/last market_source, their dates, and the
-- number of responses counted inside the date window.
WITH tagged AS (
    -- market_source is kept only on the earliest row (first_source) and on
    -- the latest row (last_source) of each email; it is NULL everywhere else.
    SELECT
        email,
        date,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date ASC) = 1
                THEN market_source
        END AS first_source,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date DESC) = 1
                THEN market_source
        END AS last_source,
        -- Computed before the outer filter, so it counts every row of the
        -- email, not just the first/last ones.
        COUNT(*) OVER (PARTITION BY email) AS numof
    FROM campaignmember
    WHERE date >= '2016-01-01'
)
SELECT
    email,
    MIN(first_source) AS first_source,
    MIN(date) AS first_date,
    MAX(last_source) AS last_source,
    MAX(date) AS last_date,
    MAX(numof) AS Numberof_Contacts
FROM tagged
WHERE first_source IS NOT NULL
    OR last_source IS NOT NULL
GROUP BY email
我有一个数据集,我想对其进行解析以查看多点触控归因。该数据集由响应营销活动的潜在客户及其营销来源组成。
每个潜在客户都可以响应多个营销活动,我想在同一个 table 中获得他们的第一个营销来源和最后一个营销来源。
我想我可以创建两个 tables,然后用一条 select 语句从两者中取数。第一条语句将创建一个 table,其中包含每个人最近一次的营销来源(使用电子邮件作为其唯一 ID)。
-- One row per email: the most recent campaign response on or after 1/1/2016.
-- DISTINCT ON is PostgreSQL-specific syntax. It keeps the first row of each
-- email group in ORDER BY order, so the sort has to lead with email;
-- date DESC then makes that first row the latest response.
CREATE TABLE temp.multitouch1 AS (
    SELECT DISTINCT ON (email)
        email,
        date,
        market_source AS last_source
    FROM sf.campaignmember
    WHERE date >= '1/1/2016'
    ORDER BY email, date DESC
);
然后我会再创建一个 table,同样按电子邮件去重,但这次保留的是第一个营销来源。
-- One row per email: the earliest campaign response on or after 1/1/2016.
-- DISTINCT ON is PostgreSQL-specific syntax. It keeps the first row of each
-- email group in ORDER BY order, so the sort has to lead with email;
-- date ASC then makes that first row the earliest response.
CREATE TABLE temp.multitouch2 AS (
    SELECT DISTINCT ON (email)
        email,
        date,
        market_source AS first_source
    FROM sf.campaignmember
    WHERE date >= '1/1/2016'
    ORDER BY email, date ASC
);
最后,我只需 select 电子邮件,并将第一个和最后一个营销来源分别放入各自的列中。
-- Combine the two helper tables into one row per email carrying both the
-- last and the first marketing source. a.date comes from temp.multitouch1.
-- The second table is temp.multitouch2 (the name used when it was created).
SELECT
    a.email,
    a.last_source,
    b.first_source,
    a.date
FROM temp.multitouch1 a
LEFT JOIN temp.multitouch2 b
    ON b.email = a.email
由于 distinct on 在 redshift 的 postgresql 版本上不起作用,我希望有人有想法以另一种方式解决这个问题。
编辑 2/22:补充一些背景信息:我处理的是潜在客户以及他们已经响应的营销活动。每条记录都是一次 "campaign response",每个人都可以有来自多个来源的多次活动响应。我想写一个 select 语句,按人去重,然后分别用单独的列给出他们响应的第一个 campaign/marketing 来源和最后一个 campaign/marketing 来源。
编辑 2/24:理想的输出是 table,有 4 列:电子邮件、last_source、first_source、日期。
对于只有 1 条活动成员记录的人,第一个和最后一个来源列是相同的;对于拥有多于 1 条活动成员记录的人,这两列可以不同。
您可以使用传统的左连接(LEFT JOIN)写法来取每组的最大值/最小值。
-- First and last response per email via LEFT JOIN anti-joins.
--   c2 = any response of the same email that sorts before c1
--   c3 = any response of the same email that sorts after c1
-- A c1 row with no c2 match is the first response; with no c3 match it is
-- the last one. Rows are ordered by (date, id), so id only breaks date ties.
SELECT DISTINCT
    c1.email,
    c1.date,
    c1.market_source
FROM sf.campaignmember c1
LEFT JOIN sf.campaignmember c2
    ON c2.email = c1.email
    -- The date window for c2 belongs in ON: placed in WHERE it would discard
    -- the unmatched (NULL) rows that the anti-join depends on.
    AND c2.date >= '1/1/2016'
    AND (c2.date < c1.date OR (c2.date = c1.date AND c2.id < c1.id))
LEFT JOIN sf.campaignmember c3
    ON c3.email = c1.email
    -- No explicit window needed here: c3.date >= c1.date >= '1/1/2016'.
    AND (c3.date > c1.date OR (c3.date = c1.date AND c3.id > c1.id))
WHERE c1.date >= '1/1/2016'
    -- DISTINCT above collapses the copies produced when c1 matches several
    -- c2 or c3 rows.
    AND (c2.email IS NULL OR c3.email IS NULL)
这里假设您有一个唯一的 ID 列;如果(日期,电子邮件)的组合本身就是唯一的,则不需要 ID 列。
我相信你可以像这样在 case 表达式中使用 row_number():
-- One output row per email: first/last market_source plus the matching dates.
WITH tagged AS (
    -- market_source is kept only on the earliest row (first_source) and on
    -- the latest row (last_source) of each email; it is NULL everywhere else.
    SELECT
        email,
        date,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date ASC) = 1
                THEN market_source
        END AS first_source,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date DESC) = 1
                THEN market_source
        END AS last_source
    FROM sf.campaignmember
    WHERE date >= '2016-01-01'
)
SELECT
    email,
    MIN(first_source) AS first_source,
    MIN(date) AS first_date,
    MAX(last_source) AS last_source,
    MAX(date) AS last_date
FROM tagged
-- Only the tagged rows survive, so MIN/MAX work on at most two rows per email.
WHERE first_source IS NOT NULL
    OR last_source IS NOT NULL
GROUP BY email
在此处测试:SQL Fiddle
PostgreSQL 9.3 架构设置:
-- Minimal fixture: three responses for a@a and a single response for b@b.
CREATE TABLE campaignmember (
    email         varchar(3),
    date          timestamp,
    market_source varchar(1)
);

INSERT INTO campaignmember (email, date, market_source)
VALUES
    ('a@a', '2016-01-02 00:00:00', 'x'),
    ('a@a', '2016-01-03 00:00:00', 'y'),
    ('a@a', '2016-01-04 00:00:00', 'z'),
    ('b@b', '2016-01-02 00:00:00', 'x');
查询 1:
-- One output row per email: first/last market_source plus the matching dates.
WITH tagged AS (
    -- market_source is kept only on the earliest row (first_source) and on
    -- the latest row (last_source) of each email; it is NULL everywhere else.
    SELECT
        email,
        date,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date ASC) = 1
                THEN market_source
        END AS first_source,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date DESC) = 1
                THEN market_source
        END AS last_source
    FROM campaignmember
    WHERE date >= '2016-01-01'
)
SELECT
    email,
    MIN(first_source) AS first_source,
    MIN(date) AS first_date,
    MAX(last_source) AS last_source,
    MAX(date) AS last_date
FROM tagged
-- Only the tagged rows survive, so MIN/MAX work on at most two rows per email.
WHERE first_source IS NOT NULL
    OR last_source IS NOT NULL
GROUP BY email
| email | first_source | first_date | last_source | last_date |
|-------|--------------|---------------------------|-------------|---------------------------|
| a@a | x | January, 02 2016 00:00:00 | z | January, 04 2016 00:00:00 |
| b@b | x | January, 02 2016 00:00:00 | x | January, 02 2016 00:00:00 |
应要求再做一个小扩展:统计每个人的接触(响应)次数
-- One output row per email: first/last market_source, their dates, and the
-- number of responses counted inside the date window.
WITH tagged AS (
    -- market_source is kept only on the earliest row (first_source) and on
    -- the latest row (last_source) of each email; it is NULL everywhere else.
    SELECT
        email,
        date,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date ASC) = 1
                THEN market_source
        END AS first_source,
        CASE
            WHEN ROW_NUMBER() OVER (PARTITION BY email ORDER BY date DESC) = 1
                THEN market_source
        END AS last_source,
        -- Computed before the outer filter, so it counts every row of the
        -- email, not just the first/last ones.
        COUNT(*) OVER (PARTITION BY email) AS numof
    FROM campaignmember
    WHERE date >= '2016-01-01'
)
SELECT
    email,
    MIN(first_source) AS first_source,
    MIN(date) AS first_date,
    MAX(last_source) AS last_source,
    MAX(date) AS last_date,
    MAX(numof) AS Numberof_Contacts
FROM tagged
WHERE first_source IS NOT NULL
    OR last_source IS NOT NULL
GROUP BY email