Vertica 动态透视(pivot)/转换(transform)
Vertica dynamic pivot/transform
我在 Vertica 中有一张表:
id Timestamp Mask1 Mask2
-------------------------------------------
1 11:30 50 100
1 11:35 52 101
2 12:00 53 102
3 09:00 50 100
3 22:10 52 105
. . . .
. . . .
我想转换成:
id rows 09:00 11:30 11:35 12:00 22:10 .......
--------------------------------------------------------------
1 Mask1 Null 50 52 Null Null .......
Mask2 Null 100 101 Null Null .......
2 Mask1 Null Null Null 53 Null .......
Mask2 Null Null Null 102 Null .......
3 Mask1 50 Null Null Null 52 .......
Mask2 100 Null Null Null 105 .......
省略号 (...) 表示我还有很多记录。
- 时间戳覆盖一整天,格式为 hours:minutes:seconds,从 00:00:00 到 24:00:00(为了简便,示例中我只写了 hours:minutes)。
- 示例中我只列出了 Mask1 和 Mask2 两个列,实际上我有大约 200 个 Mask 列需要处理。
- 我只展示了 5 条记录,但实际上我有大约一百万条记录。
到目前为止我尝试过的:
- 按 id 将每条记录导出到 csv 文件中。
- 在 python pandas 中进行转置。
- 将转置后的表连接(join)起来。
一种通用的解决方案可能是在 Vertica 中(或借助 UDTF)进行透视,但我对这个数据库还很陌生。
几天来我一直在为这个逻辑而苦苦挣扎。有人能帮帮我吗?非常感谢。
您可以使用 union all 对数据进行逆透视(unpivot),然后进行条件聚合:
-- Unpivot the mask columns of t into (id, timestamp, which, mask) rows with
-- UNION ALL, then pivot back with conditional aggregation: one output row per
-- (id, which) and one output column per half-hour bucket.
WITH unpivoted AS (
    SELECT id, timestamp, 'Mask1' AS which, Mask1 AS mask
    FROM t
    UNION ALL
    SELECT id, timestamp, 'Mask2' AS which, Mask2 AS mask
    FROM t
)
SELECT
    id,
    which,
    -- MAX() collapses each bucket to one value; a bucket with no rows yields NULL.
    MAX(CASE WHEN timestamp >= '09:00' AND timestamp < '09:30' THEN mask END) AS "09:00",
    MAX(CASE WHEN timestamp >= '09:30' AND timestamp < '10:00' THEN mask END) AS "09:30",
    MAX(CASE WHEN timestamp >= '10:00' AND timestamp < '10:30' THEN mask END) AS "10:00"
    -- . . . add one MAX(CASE ...) column (preceded by a comma) per remaining bucket
FROM unpivoted
GROUP BY id, which;
注意:这样每一行都会包含 id。我强烈建议这样做,但您也可以使用:
-- Emit id only where which = 'Mask1'; every other row gets NULL in this column.
SELECT CASE WHEN which = 'Mask1' THEN id END AS id
如果你真的想的话。
下面是我针对您示例数据中的时间值编写的解决方案。
如果您真的想把 '00:00:00' 到 '23:59:59' 的全部 86400 个时间值都显示为列,那是做不到的:Vertica 的最大列数为 1600。
不过,您可以使用 Vertica 函数 TIME_SLICE(timestamp::TIMESTAMP,1,'MINUTE')::TIME(TIME_SLICE 的输入是时间戳,返回值也是时间戳,所以必须用 :: 来回转换类型),把不同时间值的数量(即最终的列数)减少到 1440 ……
无论如何,我会先执行 SELECT DISTINCT timestamp FROM input ORDER BY 1;
然后为查到的每个时间戳在最终查询中生成一行表达式(希望它们不会超过 1598 个……),就像下面这些针对您的数据的表达式一样,放进您的查询:
-- Select-list fragment: one pivoted column per distinct timestamp value.
-- Each SUM only sees val on rows whose timestamp matches; otherwise it is NULL.
, SUM(CASE WHEN timestamp = '09:00' THEN val END) AS "09:00"
, SUM(CASE WHEN timestamp = '11:30' THEN val END) AS "11:30"
, SUM(CASE WHEN timestamp = '11:35' THEN val END) AS "11:35"
, SUM(CASE WHEN timestamp = '12:00' THEN val END) AS "12:00"
, SUM(CASE WHEN timestamp = '22:10' THEN val END) AS "22:10"
SQL 查询的输出列数通常是固定的,不能随数据而变化。如果最终列数因数据而异,您必须先根据数据生成最终查询,然后再运行它。
欢迎来到 SQL 和关系数据库的世界。
这是针对您的数据的完整脚本。我先按 "Mask-n" 列名把数据纵向展开(逆透视),再按时间戳横向透视。
\pset null Null
-- ^ vsql meta-command: render NULL values as the string "Null"
WITH
-- Sample rows standing in for the real table; not part of the final query.
input(id, Timestamp, Mask1, Mask2) AS (
              SELECT 1, TIME '11:30', 50, 100
    UNION ALL SELECT 1, TIME '11:35', 52, 101
    UNION ALL SELECT 2, TIME '12:00', 53, 102
    UNION ALL SELECT 3, TIME '09:00', 50, 100
    UNION ALL SELECT 3, TIME '22:10', 52, 105
),
-- The real WITH clause starts here.
-- Index for the 200 masks: two boundary timestamps 1 and 200 microseconds
-- after 2000-01-01 are gap-filled by TIMESERIES in 1-microsecond steps, and
-- MICROSECOND() converts each generated timestamp back to an integer.
mask_index(i) AS (
    SELECT MICROSECOND(ts)
    FROM (
        SELECT TIMESTAMPADD(MICROSECOND, 1, TIMESTAMP '2000-01-01') AS tm
        UNION ALL
        SELECT TIMESTAMPADD(MICROSECOND, 200, TIMESTAMP '2000-01-01') AS tm
    ) bounds
    TIMESERIES ts AS '1 MICROSECOND' OVER (ORDER BY tm)
),
-- Verticalised masks: one row per (input row, mask index), carrying the mask
-- label in "rows" and the matching mask value in "val".
vertical AS (
    SELECT
        id,
        i,
        CASE
            WHEN i = 1   THEN 'Mask001'
            WHEN i = 2   THEN 'Mask002'
            WHEN i = 200 THEN 'Mask200'
        END AS rows,
        timestamp,
        CASE
            WHEN i = 1   THEN Mask1
            WHEN i = 2   THEN Mask2
            WHEN i = 200 THEN 0  -- no mask200 present
        END AS val
    FROM input
    CROSS JOIN mask_index
    WHERE i <= 2  -- only 2 masks present currently
)
-- To test the vertical CTE on its own, run:
-- SELECT * FROM vertical order by id,rows,timestamp;
-- out  id | i |  rows   | timestamp | val
-- out ----+---+---------+-----------+-----
-- out   1 | 1 | Mask001 | 11:30:00  |  50
-- out   1 | 1 | Mask001 | 11:35:00  |  52
-- out   1 | 2 | Mask002 | 11:30:00  | 100
-- out   1 | 2 | Mask002 | 11:35:00  | 101
-- out   2 | 1 | Mask001 | 12:00:00  |  53
-- out   2 | 2 | Mask002 | 12:00:00  | 102
-- out   3 | 1 | Mask001 | 09:00:00  |  50
-- out   3 | 1 | Mask001 | 22:10:00  |  52
-- out   3 | 2 | Mask002 | 09:00:00  | 100
-- out   3 | 2 | Mask002 | 22:10:00  | 105
-- Horizontal pivot: one output column per timestamp value, one row per (id, rows).
SELECT
    id,
    rows,
    SUM(CASE WHEN timestamp = '09:00' THEN val END) AS "09:00",
    SUM(CASE WHEN timestamp = '11:30' THEN val END) AS "11:30",
    SUM(CASE WHEN timestamp = '11:35' THEN val END) AS "11:35",
    SUM(CASE WHEN timestamp = '12:00' THEN val END) AS "12:00",
    SUM(CASE WHEN timestamp = '22:10' THEN val END) AS "22:10"
FROM vertical
GROUP BY id, rows
ORDER BY id, rows
;
-- out Null display is "Null".
-- out id | rows | 09:00 | 11:30 | 11:35 | 12:00 | 22:10
-- out ----+---------+-------+-------+-------+-------+-------
-- out 1 | Mask001 | Null | 50 | 52 | Null | Null
-- out 1 | Mask002 | Null | 100 | 101 | Null | Null
-- out 2 | Mask001 | Null | Null | Null | 53 | Null
-- out 2 | Mask002 | Null | Null | Null | 102 | Null
-- out 3 | Mask001 | 50 | Null | Null | Null | 52
-- out 3 | Mask002 | 100 | Null | Null | Null | 105
-- out (6 rows)
-- out
-- out Time: First fetch (6 rows): 28.143 ms. All rows formatted: 28.205 ms
我在 Vertica 中有一张表:
id Timestamp Mask1 Mask2
-------------------------------------------
1 11:30 50 100
1 11:35 52 101
2 12:00 53 102
3 09:00 50 100
3 22:10 52 105
. . . .
. . . .
我想转换成:
id rows 09:00 11:30 11:35 12:00 22:10 .......
--------------------------------------------------------------
1 Mask1 Null 50 52 Null Null .......
Mask2 Null 100 101 Null Null .......
2 Mask1 Null Null Null 53 Null .......
Mask2 Null Null Null 102 Null .......
3 Mask1 50 Null Null Null 52 .......
Mask2 100 Null Null Null 105 .......
省略号 (...) 表示我还有很多记录。
- 时间戳覆盖一整天,格式为 hours:minutes:seconds,从 00:00:00 到 24:00:00(为了简便,示例中我只写了 hours:minutes)。
- 示例中我只列出了 Mask1 和 Mask2 两个列,实际上我有大约 200 个 Mask 列需要处理。
- 我只展示了 5 条记录,但实际上我有大约一百万条记录。
到目前为止我尝试过的:
- 按 id 将每条记录导出到 csv 文件中。
- 在 python pandas 中进行转置。
- 将转置后的表连接(join)起来。
一种通用的解决方案可能是在 Vertica 中(或借助 UDTF)进行透视,但我对这个数据库还很陌生。
几天来我一直在为这个逻辑而苦苦挣扎。有人能帮帮我吗?非常感谢。
您可以使用 union all 对数据进行逆透视(unpivot),然后进行条件聚合:
-- Unpivot the mask columns of t into (id, timestamp, which, mask) rows with
-- UNION ALL, then pivot back with conditional aggregation: one output row per
-- (id, which) and one output column per half-hour bucket.
WITH unpivoted AS (
    SELECT id, timestamp, 'Mask1' AS which, Mask1 AS mask
    FROM t
    UNION ALL
    SELECT id, timestamp, 'Mask2' AS which, Mask2 AS mask
    FROM t
)
SELECT
    id,
    which,
    -- MAX() collapses each bucket to one value; a bucket with no rows yields NULL.
    MAX(CASE WHEN timestamp >= '09:00' AND timestamp < '09:30' THEN mask END) AS "09:00",
    MAX(CASE WHEN timestamp >= '09:30' AND timestamp < '10:00' THEN mask END) AS "09:30",
    MAX(CASE WHEN timestamp >= '10:00' AND timestamp < '10:30' THEN mask END) AS "10:00"
    -- . . . add one MAX(CASE ...) column (preceded by a comma) per remaining bucket
FROM unpivoted
GROUP BY id, which;
注意:这样每一行都会包含 id。我强烈建议这样做,但您也可以使用:
-- Emit id only where which = 'Mask1'; every other row gets NULL in this column.
SELECT CASE WHEN which = 'Mask1' THEN id END AS id
如果你真的想的话。
下面是我针对您示例数据中的时间值编写的解决方案。
如果您真的想把 '00:00:00' 到 '23:59:59' 的全部 86400 个时间值都显示为列,那是做不到的:Vertica 的最大列数为 1600。
不过,您可以使用 Vertica 函数 TIME_SLICE(timestamp::TIMESTAMP,1,'MINUTE')::TIME(TIME_SLICE 的输入是时间戳,返回值也是时间戳,所以必须用 :: 来回转换类型),把不同时间值的数量(即最终的列数)减少到 1440 ……
无论如何,我会先执行 SELECT DISTINCT timestamp FROM input ORDER BY 1;
然后为查到的每个时间戳在最终查询中生成一行表达式(希望它们不会超过 1598 个……),就像下面这些针对您的数据的表达式一样,放进您的查询:
-- Select-list fragment: one pivoted column per distinct timestamp value.
-- Each SUM only sees val on rows whose timestamp matches; otherwise it is NULL.
, SUM(CASE WHEN timestamp = '09:00' THEN val END) AS "09:00"
, SUM(CASE WHEN timestamp = '11:30' THEN val END) AS "11:30"
, SUM(CASE WHEN timestamp = '11:35' THEN val END) AS "11:35"
, SUM(CASE WHEN timestamp = '12:00' THEN val END) AS "12:00"
, SUM(CASE WHEN timestamp = '22:10' THEN val END) AS "22:10"
SQL 查询的输出列数通常是固定的,不能随数据而变化。如果最终列数因数据而异,您必须先根据数据生成最终查询,然后再运行它。
欢迎来到 SQL 和关系数据库的世界。
这是针对您的数据的完整脚本。我先按 "Mask-n" 列名把数据纵向展开(逆透视),再按时间戳横向透视。
\pset null Null
-- ^ vsql meta-command: render NULL values as the string "Null"
WITH
-- Sample rows standing in for the real table; not part of the final query.
input(id, Timestamp, Mask1, Mask2) AS (
              SELECT 1, TIME '11:30', 50, 100
    UNION ALL SELECT 1, TIME '11:35', 52, 101
    UNION ALL SELECT 2, TIME '12:00', 53, 102
    UNION ALL SELECT 3, TIME '09:00', 50, 100
    UNION ALL SELECT 3, TIME '22:10', 52, 105
),
-- The real WITH clause starts here.
-- Index for the 200 masks: two boundary timestamps 1 and 200 microseconds
-- after 2000-01-01 are gap-filled by TIMESERIES in 1-microsecond steps, and
-- MICROSECOND() converts each generated timestamp back to an integer.
mask_index(i) AS (
    SELECT MICROSECOND(ts)
    FROM (
        SELECT TIMESTAMPADD(MICROSECOND, 1, TIMESTAMP '2000-01-01') AS tm
        UNION ALL
        SELECT TIMESTAMPADD(MICROSECOND, 200, TIMESTAMP '2000-01-01') AS tm
    ) bounds
    TIMESERIES ts AS '1 MICROSECOND' OVER (ORDER BY tm)
),
-- Verticalised masks: one row per (input row, mask index), carrying the mask
-- label in "rows" and the matching mask value in "val".
vertical AS (
    SELECT
        id,
        i,
        CASE
            WHEN i = 1   THEN 'Mask001'
            WHEN i = 2   THEN 'Mask002'
            WHEN i = 200 THEN 'Mask200'
        END AS rows,
        timestamp,
        CASE
            WHEN i = 1   THEN Mask1
            WHEN i = 2   THEN Mask2
            WHEN i = 200 THEN 0  -- no mask200 present
        END AS val
    FROM input
    CROSS JOIN mask_index
    WHERE i <= 2  -- only 2 masks present currently
)
-- To test the vertical CTE on its own, run:
-- SELECT * FROM vertical order by id,rows,timestamp;
-- out  id | i |  rows   | timestamp | val
-- out ----+---+---------+-----------+-----
-- out   1 | 1 | Mask001 | 11:30:00  |  50
-- out   1 | 1 | Mask001 | 11:35:00  |  52
-- out   1 | 2 | Mask002 | 11:30:00  | 100
-- out   1 | 2 | Mask002 | 11:35:00  | 101
-- out   2 | 1 | Mask001 | 12:00:00  |  53
-- out   2 | 2 | Mask002 | 12:00:00  | 102
-- out   3 | 1 | Mask001 | 09:00:00  |  50
-- out   3 | 1 | Mask001 | 22:10:00  |  52
-- out   3 | 2 | Mask002 | 09:00:00  | 100
-- out   3 | 2 | Mask002 | 22:10:00  | 105
-- Horizontal pivot: one output column per timestamp value, one row per (id, rows).
SELECT
    id,
    rows,
    SUM(CASE WHEN timestamp = '09:00' THEN val END) AS "09:00",
    SUM(CASE WHEN timestamp = '11:30' THEN val END) AS "11:30",
    SUM(CASE WHEN timestamp = '11:35' THEN val END) AS "11:35",
    SUM(CASE WHEN timestamp = '12:00' THEN val END) AS "12:00",
    SUM(CASE WHEN timestamp = '22:10' THEN val END) AS "22:10"
FROM vertical
GROUP BY id, rows
ORDER BY id, rows
;
-- out Null display is "Null".
-- out id | rows | 09:00 | 11:30 | 11:35 | 12:00 | 22:10
-- out ----+---------+-------+-------+-------+-------+-------
-- out 1 | Mask001 | Null | 50 | 52 | Null | Null
-- out 1 | Mask002 | Null | 100 | 101 | Null | Null
-- out 2 | Mask001 | Null | Null | Null | 53 | Null
-- out 2 | Mask002 | Null | Null | Null | 102 | Null
-- out 3 | Mask001 | 50 | Null | Null | Null | 52
-- out 3 | Mask002 | 100 | Null | Null | Null | 105
-- out (6 rows)
-- out
-- out Time: First fetch (6 rows): 28.143 ms. All rows formatted: 28.205 ms