Aggregating row values in MySQL or Snowflake
I want to calculate the standard deviation, minimum and maximum of the mer_data array into 3 other fields called std_dev, min_mer and max_mer, grouped by mac and timestamp.
This needs to be done without flattening the data, as each mer_data row consists of 4000 float values, and multiplying that by 700k rows gives a very high-dimensional table.
The mer_data field is currently saved as a varchar(30000); maybe JSON format might help, but I'm not sure.
Input:
Output:
This can be done in either Snowflake or MySQL.
Also, the query needs to be optimized so that it does not take too much compute time.
If the numbers were in rows, rather than squished together like that, we could discuss how to do it in SQL.
With one value per row, GROUP_CONCAT(...) can construct a commalist like the one you show, and MIN(), STDDEV(), etc. can handle the rest.
If you are stuck with the commalist, the remaining work can be done in your application programming language. (Having SQL pick apart an array is ugly.)
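To make that concrete, here is a minimal MySQL sketch of the row-per-value layout this answer describes, using a hypothetical normalized table mer_values (all names are made up for illustration). STDDEV() gives the population standard deviation in MySQL, and GROUP_CONCAT() can rebuild the commalist if the string form is still needed (for 4000 values you would need to raise group_concat_max_len):

-- Hypothetical normalized layout: one float per row instead of a 30000-char commalist.
CREATE TABLE mer_values (
  mac       VARCHAR(20),
  ts        DATE,
  mer_value FLOAT
);

SELECT mac,
       ts,
       STDDEV(mer_value)       AS std_dev,
       MIN(mer_value)          AS min_mer,
       MAX(mer_value)          AS max_mer,
       GROUP_CONCAT(mer_value) AS mer_data   -- optional: rebuild the commalist
FROM mer_values
GROUP BY mac, ts;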
While you don't want to split the data apart, you will need to if you want to do this in pure SQL. Snowflake has no problem with aggregations like this.
WITH fake_data(mac, mer_data) AS (
SELECT * FROM VALUES
('abc','43,44.25,44.5,42.75,44,44.25,42.75,43'),
('def','32.75,33.25,34.25,34.5,32.75,34,34.25,32.75,43')
)
SELECT f.mac,
avg(d.value::float) as avg_dev,
stddev(d.value::float) as std_dev,
MIN(d.value::float) as MIN_MER,
Max(d.value::float) as Max_MER
FROM fake_data f, table(split_to_table(f.mer_data,',')) d
GROUP BY 1
ORDER BY 1;
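Since the question mentions that JSON might help: an alternative (an untested sketch, reusing the fake_data CTE above) is to wrap the commalist in brackets, parse it as a JSON array, and expand it with LATERAL FLATTEN instead of SPLIT_TO_TABLE; the aggregation itself stays the same:

SELECT f.mac,
    avg(d.value::float) as avg_dev,
    stddev(d.value::float) as std_dev,
    MIN(d.value::float) as MIN_MER,
    Max(d.value::float) as Max_MER
FROM fake_data f,
    LATERAL FLATTEN(input => PARSE_JSON('[' || f.mer_data || ']')) d
GROUP BY 1
ORDER BY 1;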
However, I would discourage grouping on strings, so I would split it apart like this:
WITH fake_data(mac, mer_data, timestamp) AS (
SELECT * FROM VALUES
('abc','43,44.25,44.5,42.75,44,44.25,42.75,43', '01-01-22'),
('def','32.75,33.25,34.25,34.5,32.75,34,34.25,32.75,43', '02-01-22')
), boost_data AS (
SELECT seq8() as seq, *
FROM fake_data
), math_step AS (
SELECT f.seq,
avg(d.value::float) as avg_dev,
stddev(d.value::float) as std_dev,
MIN(d.value::float) as MIN_MER,
Max(d.value::float) as Max_MER
FROM boost_data f, table(split_to_table(f.mer_data,',')) d
GROUP BY 1
)
SELECT b.mac,
m.avg_dev,
m.std_dev,
m.MIN_MER,
m.Max_MER,
b.timestamp
FROM boost_data b
JOIN math_step m
ON b.seq = m.seq
ORDER BY 1;
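One detail worth noting: Snowflake's STDDEV is an alias for STDDEV_SAMP, the sample standard deviation, which is what the 0.7529703087 for 'abc' in the output below reflects. If the population standard deviation is wanted instead, swap that expression for:

stddev_pop(d.value::float) as std_dev,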
MAC | AVG_DEV | STD_DEV | MIN_MER | MAX_MER | TIMESTAMP |
---|---|---|---|---|---|
abc | 43.5625 | 0.7529703087 | 42.75 | 44.5 | 01-01-22 |
def | 34.611111111 | 3.226141056 | 32.75 | 43 | 02-01-22 |
Performance testing:
So using this SQL to make 70K rows of 4000 values each:
create table fake_data_tab AS
WITH cte_a AS (
SELECT SEQ8() as s
FROM TABLE(GENERATOR(ROWCOUNT =>70000))
), cte_b AS (
SELECT a.s, uniform(20::float, 50::float, random()) as v
FROM TABLE(GENERATOR(ROWCOUNT =>4000))
CROSS JOIN cte_a a
)
SELECT s::text as mac
,LISTAGG(v,',') AS mer_data
,dateadd(day,s,'2020-01-01')::date as timestamp
FROM cte_b
GROUP BY 1,3;
On an XTRA_SMALL warehouse this takes 79 seconds.
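As a quick, optional sanity check on the generated table (just an illustrative query, not part of the benchmark):

SELECT COUNT(*) AS row_cnt,               -- expect 70000 rows
       AVG(LENGTH(mer_data)) AS avg_len   -- average size of the commalist per row
FROM fake_data_tab;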
Now we can test the two solutions.
The second block of code (grouping on a number, with a join):
WITH boost_data AS (
SELECT seq8() as seq, *
FROM fake_data_tab
), math_step AS (
SELECT f.seq,
avg(d.value::float) as avg_dev,
stddev(d.value::float) as std_dev,
MIN(d.value::float) as MIN_MER,
Max(d.value::float) as Max_MER
FROM boost_data f, table(split_to_table(f.mer_data,',')) d
GROUP BY 1
)
SELECT b.mac,
m.avg_dev,
m.std_dev,
m.MIN_MER,
m.Max_MER,
b.timestamp
FROM boost_data b
JOIN math_step m
ON b.seq = m.seq
ORDER BY 1;
This takes 1 minute 47 seconds.
The original version, grouping on the strings/dates:
SELECT f.mac,
avg(d.value::float) as avg_dev,
stddev(d.value::float) as std_dev,
MIN(d.value::float) as MIN_MER,
Max(d.value::float) as Max_MER,
f.timestamp
FROM fake_data_tab f, table(split_to_table(f.mer_data,',')) d
GROUP BY 1,6
ORDER BY 1;
This takes 1 minute 46 seconds.
Hmm, so keeping 'mac' as a number makes the code very fast (~3 seconds), while handling the strings either way increases the data processed from roughly 150MB for the numbers to 1.5GB for the strings.
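For reference, the ~3 second numeric-mac variant would presumably look like the query below; fake_data_num stands in for a hypothetical copy of the test table built with `s as mac` (a NUMBER) instead of `s::text as mac`. This is a sketch of that variant, not a query quoted from the benchmark:

-- Hypothetical: fake_data_num is the same test table but with mac kept as a NUMBER.
SELECT f.mac,
    avg(d.value::float) as avg_dev,
    stddev(d.value::float) as std_dev,
    MIN(d.value::float) as MIN_MER,
    Max(d.value::float) as Max_MER,
    f.timestamp
FROM fake_data_num f, table(split_to_table(f.mer_data,',')) d
GROUP BY 1,6
ORDER BY 1;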