SQL 查询挑战 - 找出各列中出现频率最高的项,并将结果汇总为数据透视表
SQL query challenge - find top frequent items in columns and summarize result to a pivot table
我正在寻找一个查询来进行以下转换。
基本上,我想针对每个网站、每一天,找出出现频率最高的前 3 个 SELL_COUNTRY 和前 3 个 CATEGORY。(例如网站 1、日期 6-5-2017:SELL_COUNTRY 有 2 个 US、1 个 JP、1 个 UK,因此 TOP1_SELL_COUNTRY 是 US,JP 和 UK 分别填入 TOP2_SELL_COUNTRY 和 TOP3_SELL_COUNTRY。CATEGORY 列的处理方式相同。)
我目前的解决方案用到了很多子查询,虽然可行,但我觉得过于复杂。我很想知道 SQL 高手会如何优雅地实现这一点。
目前我知道如何使用
来自
到
-- Top three most frequent sell_country and pur_country values per website and
-- day, pivoted to one row per (website, day); empty slots are shown as 'NA'.
-- NOTE(review): SUBSTRING(visit_date,1,8) assumes visit_date is a string whose
-- first 8 characters identify the day -- confirm against the real column.
WITH a1 AS
(
    -- Tag every source row with how many rows share its country within the
    -- same website/day.
    SELECT website,
           visit_date,
           sell_country,
           pur_country,
           COUNT(*) OVER( PARTITION BY website,SUBSTRING(visit_date,1,8),sell_country ) AS sell_cntry,
           COUNT(*) OVER( PARTITION BY website,SUBSTRING(visit_date,1,8),pur_country ) AS pur_cntry
    FROM Yourtable
),
a2 AS
(
    -- DENSE_RANK, not RANK: a1 still holds one row per source row, so RANK
    -- would jump by the number of duplicate rows (counts 5/3/2 rank as 1/6/9)
    -- and the "<= 3" filter further down would drop genuine runners-up.
    SELECT website,
           visit_date,
           sell_country,
           DENSE_RANK() OVER ( PARTITION BY website,SUBSTRING(visit_date,1,8) ORDER BY sell_cntry DESC ) AS sell_cntry_rnk
    FROM a1
),
a3 AS
(
    -- Same ranking, applied to purchase countries.
    SELECT website,
           visit_date,
           pur_country,
           DENSE_RANK() OVER ( PARTITION BY website,SUBSTRING(visit_date,1,8) ORDER BY pur_cntry DESC ) AS pur_cntry_rnk
    FROM a1
),
a4 AS
(
    -- Each joined row fills at most one sell slot and one purchase slot; the
    -- remaining slots are NULL and are collapsed by the MAX() in a5.
    SELECT sell.website AS company,
           sell.v_date,
           CASE WHEN sell.sell_cntry_rn = 1 THEN sell.sell_country END AS TOP1_SELL_COUNTRY,
           CASE WHEN sell.sell_cntry_rn = 2 THEN sell.sell_country END AS TOP2_SELL_COUNTRY,
           CASE WHEN sell.sell_cntry_rn = 3 THEN sell.sell_country END AS TOP3_SELL_COUNTRY,
           CASE WHEN pur.pur_cntry_rn = 1 THEN pur.pur_country END AS TOP1_PUR_COUNTRY,
           CASE WHEN pur.pur_cntry_rn = 2 THEN pur.pur_country END AS TOP2_PUR_COUNTRY,
           CASE WHEN pur.pur_cntry_rn = 3 THEN pur.pur_country END AS TOP3_PUR_COUNTRY
    FROM (
        -- One row per country, numbered by rank; the country value breaks
        -- ties so each slot number is assigned to exactly one country.
        SELECT Z.*,
               ROW_NUMBER() OVER( PARTITION BY website,v_date ORDER BY sell_cntry_rnk,sell_country ) AS sell_cntry_rn
        FROM
        (
            SELECT DISTINCT website,
                   SUBSTRING(visit_date,1,8) AS v_date,
                   sell_cntry_rnk,
                   sell_country
            FROM a2
        ) Z
        WHERE Z.sell_cntry_rnk <= 3
    ) sell
    INNER JOIN
    (
        SELECT Z.*,
               ROW_NUMBER() OVER( PARTITION BY website,v_date ORDER BY pur_cntry_rnk,pur_country ) AS pur_cntry_rn
        FROM
        (
            SELECT DISTINCT website,
                   SUBSTRING(visit_date,1,8) AS v_date,
                   pur_cntry_rnk,
                   pur_country
            FROM a3
        ) Z
        WHERE Z.pur_cntry_rnk <= 3
    ) pur
    ON sell.website = pur.website
    AND sell.v_date = pur.v_date
),
a5 AS
(
    -- Collapse the join product to one row per website/day; MAX ignores the
    -- NULL placeholders and keeps the single non-NULL value of each slot.
    SELECT company,
           v_date,
           MAX(TOP1_SELL_COUNTRY) AS TOP1_SELL_COUNTRY,
           MAX(TOP2_SELL_COUNTRY) AS TOP2_SELL_COUNTRY,
           MAX(TOP3_SELL_COUNTRY) AS TOP3_SELL_COUNTRY,
           MAX(TOP1_PUR_COUNTRY) AS TOP1_PUR_COUNTRY,
           MAX(TOP2_PUR_COUNTRY) AS TOP2_PUR_COUNTRY,
           MAX(TOP3_PUR_COUNTRY) AS TOP3_PUR_COUNTRY
    FROM a4
    GROUP BY company,
             v_date
)
-- Slots with no country (fewer than three distinct values that day) read 'NA'.
SELECT company,
       v_date,
       COALESCE(TOP1_SELL_COUNTRY, 'NA') AS TOP1_SELL_COUNTRY,
       COALESCE(TOP2_SELL_COUNTRY, 'NA') AS TOP2_SELL_COUNTRY,
       COALESCE(TOP3_SELL_COUNTRY, 'NA') AS TOP3_SELL_COUNTRY,
       COALESCE(TOP1_PUR_COUNTRY, 'NA') AS TOP1_PUR_COUNTRY,
       COALESCE(TOP2_PUR_COUNTRY, 'NA') AS TOP2_PUR_COUNTRY,
       COALESCE(TOP3_PUR_COUNTRY, 'NA') AS TOP3_PUR_COUNTRY
FROM a5
ORDER BY company,v_date;
我会分 3 个步骤完成:
- 按国家/地区分组并按计数排名
- 按类别分组并按计数排名
- 使用条件聚合合并结果(CASE 只会在对应的单元格中给出值,其余结果均为 NULL;由于聚合函数会忽略 NULL,min() 返回的正是那个值)
像这样:
-- Top three sell countries and top three categories per website and day,
-- pivoted to one row per (website, date); empty slots are shown as 'NA'.
WITH
countries as (
    -- Number each country within its website/day by descending frequency.
    -- sell_country is the tie-breaker so equal counts give a stable order.
    SELECT
        country_counts.*
        ,row_number() over (partition by website,date order by count desc, sell_country) as row_number
    FROM (
        SELECT
            website
            ,date::date as date
            ,sell_country
            ,count(1) as count
        FROM your_table
        -- Positional on purpose: position 2 is the date::date output column,
        -- whereas a bare "date" here would resolve to the raw input column.
        GROUP BY 1,2,3
    ) country_counts  -- derived tables need an alias on PostgreSQL before 16
)
,categories as (
    -- Same numbering for categories, with category as the tie-breaker.
    SELECT
        category_counts.*
        ,row_number() over (partition by website,date order by count desc, category) as row_number
    FROM (
        SELECT
            website
            ,date::date as date
            ,category
            ,count(1) as count
        FROM your_table
        -- Positional for the same reason as in "countries".
        GROUP BY 1,2,3
    ) category_counts
)
-- The join pairs every country row with every category row of the same
-- website/day; each CASE yields the value only on rows carrying the wanted
-- row_number and NULL elsewhere, and min() ignores those NULLs.
SELECT
    website
    ,date
    ,coalesce(min(case when t1.row_number=1 then t1.sell_country end),'NA') as top1_sell_country
    ,coalesce(min(case when t1.row_number=2 then t1.sell_country end),'NA') as top2_sell_country
    ,coalesce(min(case when t1.row_number=3 then t1.sell_country end),'NA') as top3_sell_country
    ,coalesce(min(case when t2.row_number=1 then t2.category end),'NA') as top1_sell_category
    ,coalesce(min(case when t2.row_number=2 then t2.category end),'NA') as top2_sell_category
    ,coalesce(min(case when t2.row_number=3 then t2.category end),'NA') as top3_sell_category
FROM countries t1
FULL JOIN categories t2
USING (website,date)
GROUP BY website, date
ORDER BY website, date
我正在寻找一个查询来进行以下转换。
基本上,我想针对每个网站、每一天,找出出现频率最高的前 3 个 SELL_COUNTRY 和前 3 个 CATEGORY。(例如网站 1、日期 6-5-2017:SELL_COUNTRY 有 2 个 US、1 个 JP、1 个 UK,因此 TOP1_SELL_COUNTRY 是 US,JP 和 UK 分别填入 TOP2_SELL_COUNTRY 和 TOP3_SELL_COUNTRY。CATEGORY 列的处理方式相同。)
我目前的解决方案用到了很多子查询,虽然可行,但我觉得过于复杂。我很想知道 SQL 高手会如何优雅地实现这一点。
目前我知道如何使用
来自
到
-- Top three most frequent sell_country and pur_country values per website and
-- day, pivoted to one row per (website, day); empty slots are shown as 'NA'.
-- NOTE(review): SUBSTRING(visit_date,1,8) assumes visit_date is a string whose
-- first 8 characters identify the day -- confirm against the real column.
WITH a1 AS
(
    -- Tag every source row with how many rows share its country within the
    -- same website/day.
    SELECT website,
           visit_date,
           sell_country,
           pur_country,
           COUNT(*) OVER( PARTITION BY website,SUBSTRING(visit_date,1,8),sell_country ) AS sell_cntry,
           COUNT(*) OVER( PARTITION BY website,SUBSTRING(visit_date,1,8),pur_country ) AS pur_cntry
    FROM Yourtable
),
a2 AS
(
    -- DENSE_RANK, not RANK: a1 still holds one row per source row, so RANK
    -- would jump by the number of duplicate rows (counts 5/3/2 rank as 1/6/9)
    -- and the "<= 3" filter further down would drop genuine runners-up.
    SELECT website,
           visit_date,
           sell_country,
           DENSE_RANK() OVER ( PARTITION BY website,SUBSTRING(visit_date,1,8) ORDER BY sell_cntry DESC ) AS sell_cntry_rnk
    FROM a1
),
a3 AS
(
    -- Same ranking, applied to purchase countries.
    SELECT website,
           visit_date,
           pur_country,
           DENSE_RANK() OVER ( PARTITION BY website,SUBSTRING(visit_date,1,8) ORDER BY pur_cntry DESC ) AS pur_cntry_rnk
    FROM a1
),
a4 AS
(
    -- Each joined row fills at most one sell slot and one purchase slot; the
    -- remaining slots are NULL and are collapsed by the MAX() in a5.
    SELECT sell.website AS company,
           sell.v_date,
           CASE WHEN sell.sell_cntry_rn = 1 THEN sell.sell_country END AS TOP1_SELL_COUNTRY,
           CASE WHEN sell.sell_cntry_rn = 2 THEN sell.sell_country END AS TOP2_SELL_COUNTRY,
           CASE WHEN sell.sell_cntry_rn = 3 THEN sell.sell_country END AS TOP3_SELL_COUNTRY,
           CASE WHEN pur.pur_cntry_rn = 1 THEN pur.pur_country END AS TOP1_PUR_COUNTRY,
           CASE WHEN pur.pur_cntry_rn = 2 THEN pur.pur_country END AS TOP2_PUR_COUNTRY,
           CASE WHEN pur.pur_cntry_rn = 3 THEN pur.pur_country END AS TOP3_PUR_COUNTRY
    FROM (
        -- One row per country, numbered by rank; the country value breaks
        -- ties so each slot number is assigned to exactly one country.
        SELECT Z.*,
               ROW_NUMBER() OVER( PARTITION BY website,v_date ORDER BY sell_cntry_rnk,sell_country ) AS sell_cntry_rn
        FROM
        (
            SELECT DISTINCT website,
                   SUBSTRING(visit_date,1,8) AS v_date,
                   sell_cntry_rnk,
                   sell_country
            FROM a2
        ) Z
        WHERE Z.sell_cntry_rnk <= 3
    ) sell
    INNER JOIN
    (
        SELECT Z.*,
               ROW_NUMBER() OVER( PARTITION BY website,v_date ORDER BY pur_cntry_rnk,pur_country ) AS pur_cntry_rn
        FROM
        (
            SELECT DISTINCT website,
                   SUBSTRING(visit_date,1,8) AS v_date,
                   pur_cntry_rnk,
                   pur_country
            FROM a3
        ) Z
        WHERE Z.pur_cntry_rnk <= 3
    ) pur
    ON sell.website = pur.website
    AND sell.v_date = pur.v_date
),
a5 AS
(
    -- Collapse the join product to one row per website/day; MAX ignores the
    -- NULL placeholders and keeps the single non-NULL value of each slot.
    SELECT company,
           v_date,
           MAX(TOP1_SELL_COUNTRY) AS TOP1_SELL_COUNTRY,
           MAX(TOP2_SELL_COUNTRY) AS TOP2_SELL_COUNTRY,
           MAX(TOP3_SELL_COUNTRY) AS TOP3_SELL_COUNTRY,
           MAX(TOP1_PUR_COUNTRY) AS TOP1_PUR_COUNTRY,
           MAX(TOP2_PUR_COUNTRY) AS TOP2_PUR_COUNTRY,
           MAX(TOP3_PUR_COUNTRY) AS TOP3_PUR_COUNTRY
    FROM a4
    GROUP BY company,
             v_date
)
-- Slots with no country (fewer than three distinct values that day) read 'NA'.
SELECT company,
       v_date,
       COALESCE(TOP1_SELL_COUNTRY, 'NA') AS TOP1_SELL_COUNTRY,
       COALESCE(TOP2_SELL_COUNTRY, 'NA') AS TOP2_SELL_COUNTRY,
       COALESCE(TOP3_SELL_COUNTRY, 'NA') AS TOP3_SELL_COUNTRY,
       COALESCE(TOP1_PUR_COUNTRY, 'NA') AS TOP1_PUR_COUNTRY,
       COALESCE(TOP2_PUR_COUNTRY, 'NA') AS TOP2_PUR_COUNTRY,
       COALESCE(TOP3_PUR_COUNTRY, 'NA') AS TOP3_PUR_COUNTRY
FROM a5
ORDER BY company,v_date;
我会分 3 个步骤完成:
- 按国家/地区分组并按计数排名
- 按类别分组并按计数排名
- 使用条件聚合合并结果(CASE 只会在对应的单元格中给出值,其余结果均为 NULL;由于聚合函数会忽略 NULL,min() 返回的正是那个值)
像这样:
-- Top three sell countries and top three categories per website and day,
-- pivoted to one row per (website, date); empty slots are shown as 'NA'.
WITH
countries as (
    -- Number each country within its website/day by descending frequency.
    -- sell_country is the tie-breaker so equal counts give a stable order.
    SELECT
        country_counts.*
        ,row_number() over (partition by website,date order by count desc, sell_country) as row_number
    FROM (
        SELECT
            website
            ,date::date as date
            ,sell_country
            ,count(1) as count
        FROM your_table
        -- Positional on purpose: position 2 is the date::date output column,
        -- whereas a bare "date" here would resolve to the raw input column.
        GROUP BY 1,2,3
    ) country_counts  -- derived tables need an alias on PostgreSQL before 16
)
,categories as (
    -- Same numbering for categories, with category as the tie-breaker.
    SELECT
        category_counts.*
        ,row_number() over (partition by website,date order by count desc, category) as row_number
    FROM (
        SELECT
            website
            ,date::date as date
            ,category
            ,count(1) as count
        FROM your_table
        -- Positional for the same reason as in "countries".
        GROUP BY 1,2,3
    ) category_counts
)
-- The join pairs every country row with every category row of the same
-- website/day; each CASE yields the value only on rows carrying the wanted
-- row_number and NULL elsewhere, and min() ignores those NULLs.
SELECT
    website
    ,date
    ,coalesce(min(case when t1.row_number=1 then t1.sell_country end),'NA') as top1_sell_country
    ,coalesce(min(case when t1.row_number=2 then t1.sell_country end),'NA') as top2_sell_country
    ,coalesce(min(case when t1.row_number=3 then t1.sell_country end),'NA') as top3_sell_country
    ,coalesce(min(case when t2.row_number=1 then t2.category end),'NA') as top1_sell_category
    ,coalesce(min(case when t2.row_number=2 then t2.category end),'NA') as top2_sell_category
    ,coalesce(min(case when t2.row_number=3 then t2.category end),'NA') as top3_sell_category
FROM countries t1
FULL JOIN categories t2
USING (website,date)
GROUP BY website, date
ORDER BY website, date