Snowflake 查询结果中的精度问题
Snowflake Precision Problems in Query Result
问题:我该如何解决这个问题?我尝试过类型转换(cast)和各种技巧,但似乎没有任何办法能在保持表不变的前提下修复它。
附加问题:为什么会发生这种情况?其他 RDBMS 中也会出现吗?我应该后悔选择 Snowflake 吗?
我的查询有一个问题,本应等于 0 的内容显示为非常非常小的数字。我可以在示例数据集上重现该问题:
设置自己试试:
-- Reproduction fixture: 20 rows in "TESTBUG"; every val is 0 except id = 3 (0.001).
-- val is a FLOAT column, id is a NUMBER column.
drop table if exists "TESTBUG";
create table "TESTBUG" (id number, val float);
-- One multi-row insert with an explicit column list, so the statement does not
-- depend on the table's physical column order and loads all rows in one statement.
insert into "TESTBUG" (id, val) values
    (1, 0.000),
    (2, 0.000),
    (3, 0.001),
    (4, 0.000),
    (5, 0.000),
    (6, 0.000),
    (7, 0.000),
    (8, 0.000),
    (9, 0.000),
    (10, 0.000),
    (11, 0.000),
    (12, 0.000),
    (13, 0.000),
    (14, 0.000),
    (15, 0.000),
    (16, 0.000),
    (17, 0.000),
    (18, 0.000),
    (19, 0.000),
    (20, 0.000);
我们这里处理的是20行假数据:
ID | VAL |
---|---|
1 | 0 |
2 | 0 |
3 | 0.001 |
4 | 0 |
5 | 0 |
6 | 0 |
7 | 0 |
8 | 0 |
9 | 0 |
10 | 0 |
11 | 0 |
12 | 0 |
13 | 0 |
14 | 0 |
15 | 0 |
16 | 0 |
17 | 0 |
18 | 0 |
19 | 0 |
20 | 0 |
下面是产生非零结果的 SQL。看起来 MOVING_AVG 是罪魁祸首?我不确定。
同样奇怪的是,查看结果时只有 ID=18 的结果非零。第 19 行按理说应该得到相同的结果,因为窗口只有 14 个周期长。
-- Moving average of the row-to-row change in val, filtered to the rows after
-- id 14 whose moving average is negative.
--
-- Reads the "TESTBUG" table created by the setup script. The earlier reference
-- "RASGO.PUBLIC.TESTBUG" was a single quoted identifier that contains dots, not a
-- database.schema.table path. To qualify the name, quote each part separately:
-- "RASGO"."PUBLIC"."TESTBUG".
WITH lagged AS (
    -- Previous row's val in id order.
    SELECT
        id,
        val,
        LAG(val, 1) OVER (ORDER BY id) AS lag_val
    FROM "TESTBUG"
),
diff AS (
    -- Change in val relative to the previous row.
    SELECT
        id,
        val,
        lag_val,
        val - lag_val AS diff_lag_val
    FROM lagged
),
moving_avg AS (
    -- The frame is the 14 preceding rows plus the current row, i.e. up to 15 rows.
    SELECT
        id,
        val,
        lag_val,
        diff_lag_val,
        AVG(diff_lag_val) OVER (
            ORDER BY id
            ROWS BETWEEN 14 PRECEDING AND CURRENT ROW
        ) AS moving_avg_diff
    FROM diff
)
SELECT
    id,
    val,
    lag_val,
    diff_lag_val,
    moving_avg_diff
FROM moving_avg
WHERE id > 14
    AND moving_avg_diff < 0;
ID | VAL | LAG_VAL | DIFF_LAG_VAL | MOVING_AVG_DIFF |
---|---|---|---|---|
18 | 0 | 0 | 0 | -6.666666667e-05 |
所以打包成一个块:
-- Self-contained repro: the 20 rows are inlined as a VALUES list, so no table
-- has to exist. Returns every row with its lag, its row-to-row difference and
-- the moving average of that difference.
WITH testbug (id, val) AS (
    SELECT *
    FROM VALUES
        (1, 0.000::float),
        (2, 0.000::float),
        (3, 0.001::float),
        (4, 0.000::float),
        (5, 0.000::float),
        (6, 0.000::float),
        (7, 0.000::float),
        (8, 0.000::float),
        (9, 0.000::float),
        (10, 0.000::float),
        (11, 0.000::float),
        (12, 0.000::float),
        (13, 0.000::float),
        (14, 0.000::float),
        (15, 0.000::float),
        (16, 0.000::float),
        (17, 0.000::float),
        (18, 0.000::float),
        (19, 0.000::float),
        (20, 0.000::float)
),
lagged AS (
    -- Previous row's val in id order.
    SELECT
        id,
        val,
        LAG(val, 1) OVER (ORDER BY id) AS lag_val
    FROM testbug
),
diff AS (
    -- Change in val relative to the previous row.
    SELECT
        id,
        val,
        lag_val,
        val - lag_val AS diff_lag_val
    FROM lagged
)
SELECT
    id,
    val,
    lag_val,
    diff_lag_val,
    -- Frame: 14 preceding rows plus the current row.
    AVG(diff_lag_val) OVER (
        ORDER BY id
        ROWS BETWEEN 14 PRECEDING AND CURRENT ROW
    ) AS moving_avg_diff
FROM diff
-- Filter left disabled so that all 20 rows are shown:
-- QUALIFY id > 14 AND moving_avg_diff < 0
ORDER BY id;
给出:
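ID | VAL | LAG_VAL | DIFF_LAG_VAL | MOVING_AVG_DIFF |
---|---|---|---|---|
1 | 0 | | | |
2 | 0 | 0 | 0 | 0 |
3 | 0.001 | 0 | 0.001 | 0.0005 |
4 | 0 | 0.001 | -0.001 | 0 |
5 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | 0 |
7 | 0 | 0 | 0 | 0 |
8 | 0 | 0 | 0 | 0 |
9 | 0 | 0 | 0 | 0 |
10 | 0 | 0 | 0 | 0 |
11 | 0 | 0 | 0 | 0 |
12 | 0 | 0 | 0 | 0 |
13 | 0 | 0 | 0 | 0 |
14 | 0 | 0 | 0 | 0 |
15 | 0 | 0 | 0 | 0 |
16 | 0 | 0 | 0 | 0 |
17 | 0 | 0 | 0 | 0 |
18 | 0 | 0 | 0 | -0.000066 |
19 | 0 | 0 | 0 | 0 |
20 | 0 | 0 | 0 | 0 |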
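所以问题出在第 4 行:那里的差值为负(-0.001)。在第 18 行之前,窗口同时包含第 3 行的 +0.001 和第 4 行的 -0.001,正负相互抵消;到第 18 行时,窗口(14 PRECEDING 加当前行,共 15 行,即第 4~18 行)不再包含第 3 行,只剩下 -0.001,于是平均值为 -0.001/15 ≈ -0.0000667。到第 19 行,窗口(第 5~19 行)里全是 0,结果又回到 0。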
可以用一个更小的玩具问题来展示:
-- Toy version of the repro: 6 inlined rows and a 3-row frame
-- (2 preceding rows plus the current row).
WITH testbug (id, val) AS (
    SELECT *
    FROM VALUES
        (1, 0.000::float),
        (2, 0.000::float),
        (3, 0.001::float),
        (4, 0.000::float),
        (5, 0.000::float),
        (6, 0.000::float)
),
lagged AS (
    -- Previous row's val in id order.
    SELECT
        id,
        val,
        LAG(val, 1) OVER (ORDER BY id) AS lag_val
    FROM testbug
),
diff AS (
    -- Change in val relative to the previous row.
    SELECT
        id,
        val,
        lag_val,
        val - lag_val AS diff_lag_val
    FROM lagged
)
SELECT
    id,
    val,
    lag_val,
    diff_lag_val,
    AVG(diff_lag_val) OVER (
        ORDER BY id
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS moving_avg_diff
FROM diff
ORDER BY id;
ID | VAL | LAG_VAL | DIFF_LAG_VAL | MOVING_AVG_DIFF |
---|---|---|---|---|
1 | 0 | | | |
2 | 0 | 0 | 0 | 0 |
3 | 0.001 | 0 | 0.001 | 0.0005 |
4 | 0 | 0.001 | -0.001 | 0 |
5 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | -0.0003333333333 |
检查数学:
-- Cross-check of AVG() on the 6-row toy data: the same 3-row frame is also
-- summed and counted, and the average is recomputed by hand as sum / count.
-- The testbug and diff CTEs are repeated here so the statement runs on its own.
WITH testbug (id, val) AS (
    SELECT *
    FROM VALUES
        (1, 0.000::float),
        (2, 0.000::float),
        (3, 0.001::float),
        (4, 0.000::float),
        (5, 0.000::float),
        (6, 0.000::float)
),
diff AS (
    SELECT
        *,
        LAG(val, 1) OVER (ORDER BY id) AS lag_val,
        val - lag_val AS diff_lag_val
    FROM testbug
)
SELECT
    *,
    SUM(diff_lag_val) OVER (ORDER BY id ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_sum_diff,
    -- COUNT(col) counts only the non-NULL differences inside the frame.
    COUNT(diff_lag_val) OVER (ORDER BY id ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_countdiff,
    AVG(diff_lag_val) OVER (ORDER BY id ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg_diff,
    -- Manual average built from the two aliases defined just above.
    DIV0(moving_sum_diff, moving_countdiff) AS man_avg
FROM diff
ORDER BY id;
我们现在添加手动步骤来手动计算平均值,我们得到:
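ID | VAL | LAG_VAL | DIFF_LAG_VAL | MOVING_SUM_DIFF | MOVING_COUNTDIFF | MOVING_AVG_DIFF | MAN_AVG |
---|---|---|---|---|---|---|---|
1 | 0 | | | | 0 | | |
2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 0.001 | 0 | 0.001 | 0.001 | 2 | 0.0005 | 0.0005 |
4 | 0 | 0.001 | -0.001 | 0 | 3 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 |
6 | 0 | 0 | 0 | -0.001 | 3 | -0.0003333333333 | -0.0003333333333 |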
所以数学是正确的。
最后的话:
所以这不是浮点数表示问题:数据库只是对窗口内实际存在的数值做了正确的运算。结果虽然不是您所期望的数字,但它是合理的——只是这个查询计算的可能并不是您真正想要的东西。
也就是正在发生的事情:
-- The same arithmetic in isolation. Each tuple can be read as
-- (sum over the frame, number of values in the frame); DIV0 divides the first
-- by the second.
select column1, column2, div0(column1,column2)
from values
-- A zero sum gives 0 whatever the count is.
( 0, 18 ),
-- Matches id = 3 in the 20-row example: 0.001 / 2 = 0.0005.
( 0.001, 2 ),
-- Matches id = 18 in the 20-row example: -0.001 spread over the 15-row frame.
( -0.001, 15 );
COLUMN1 | COLUMN2 | DIV0(COLUMN1,COLUMN2) |
---|---|---|
0 | 18 | 0 |
0.001 | 2 | 0.0005 |
-0.001 | 15 | -0.000066666 |
问题:我该如何解决这个问题?我尝试过类型转换(cast)和各种技巧,但似乎没有任何办法能在保持表不变的前提下修复它。
附加问题:为什么会发生这种情况?其他 RDBMS 中也会出现吗?我应该后悔选择 Snowflake 吗?
我的查询有一个问题,本应等于 0 的内容显示为非常非常小的数字。我可以在示例数据集上重现该问题:
设置自己试试:
-- Reproduction fixture: 20 rows in "TESTBUG"; every val is 0 except id = 3 (0.001).
-- val is a FLOAT column, id is a NUMBER column.
drop table if exists "TESTBUG";
create table "TESTBUG" (id number, val float);
-- One multi-row insert with an explicit column list, so the statement does not
-- depend on the table's physical column order and loads all rows in one statement.
insert into "TESTBUG" (id, val) values
    (1, 0.000),
    (2, 0.000),
    (3, 0.001),
    (4, 0.000),
    (5, 0.000),
    (6, 0.000),
    (7, 0.000),
    (8, 0.000),
    (9, 0.000),
    (10, 0.000),
    (11, 0.000),
    (12, 0.000),
    (13, 0.000),
    (14, 0.000),
    (15, 0.000),
    (16, 0.000),
    (17, 0.000),
    (18, 0.000),
    (19, 0.000),
    (20, 0.000);
我们这里处理的是20行假数据:
ID | VAL |
---|---|
1 | 0 |
2 | 0 |
3 | 0.001 |
4 | 0 |
5 | 0 |
6 | 0 |
7 | 0 |
8 | 0 |
9 | 0 |
10 | 0 |
11 | 0 |
12 | 0 |
13 | 0 |
14 | 0 |
15 | 0 |
16 | 0 |
17 | 0 |
18 | 0 |
19 | 0 |
20 | 0 |
下面是产生非零结果的 SQL。看起来 MOVING_AVG 是罪魁祸首?我不确定。
同样奇怪的是,查看结果时只有 ID=18 的结果非零。第 19 行按理说应该得到相同的结果,因为窗口只有 14 个周期长。
-- Moving average of the row-to-row change in val, filtered to the rows after
-- id 14 whose moving average is negative.
--
-- Reads the "TESTBUG" table created by the setup script. The earlier reference
-- "RASGO.PUBLIC.TESTBUG" was a single quoted identifier that contains dots, not a
-- database.schema.table path. To qualify the name, quote each part separately:
-- "RASGO"."PUBLIC"."TESTBUG".
WITH lagged AS (
    -- Previous row's val in id order.
    SELECT
        id,
        val,
        LAG(val, 1) OVER (ORDER BY id) AS lag_val
    FROM "TESTBUG"
),
diff AS (
    -- Change in val relative to the previous row.
    SELECT
        id,
        val,
        lag_val,
        val - lag_val AS diff_lag_val
    FROM lagged
),
moving_avg AS (
    -- The frame is the 14 preceding rows plus the current row, i.e. up to 15 rows.
    SELECT
        id,
        val,
        lag_val,
        diff_lag_val,
        AVG(diff_lag_val) OVER (
            ORDER BY id
            ROWS BETWEEN 14 PRECEDING AND CURRENT ROW
        ) AS moving_avg_diff
    FROM diff
)
SELECT
    id,
    val,
    lag_val,
    diff_lag_val,
    moving_avg_diff
FROM moving_avg
WHERE id > 14
    AND moving_avg_diff < 0;
ID | VAL | LAG_VAL | DIFF_LAG_VAL | MOVING_AVG_DIFF |
---|---|---|---|---|
18 | 0 | 0 | 0 | -6.666666667e-05 |
所以打包成一个块:
-- Self-contained repro: the 20 rows are inlined as a VALUES list, so no table
-- has to exist. Returns every row with its lag, its row-to-row difference and
-- the moving average of that difference.
WITH testbug (id, val) AS (
    SELECT *
    FROM VALUES
        (1, 0.000::float),
        (2, 0.000::float),
        (3, 0.001::float),
        (4, 0.000::float),
        (5, 0.000::float),
        (6, 0.000::float),
        (7, 0.000::float),
        (8, 0.000::float),
        (9, 0.000::float),
        (10, 0.000::float),
        (11, 0.000::float),
        (12, 0.000::float),
        (13, 0.000::float),
        (14, 0.000::float),
        (15, 0.000::float),
        (16, 0.000::float),
        (17, 0.000::float),
        (18, 0.000::float),
        (19, 0.000::float),
        (20, 0.000::float)
),
lagged AS (
    -- Previous row's val in id order.
    SELECT
        id,
        val,
        LAG(val, 1) OVER (ORDER BY id) AS lag_val
    FROM testbug
),
diff AS (
    -- Change in val relative to the previous row.
    SELECT
        id,
        val,
        lag_val,
        val - lag_val AS diff_lag_val
    FROM lagged
)
SELECT
    id,
    val,
    lag_val,
    diff_lag_val,
    -- Frame: 14 preceding rows plus the current row.
    AVG(diff_lag_val) OVER (
        ORDER BY id
        ROWS BETWEEN 14 PRECEDING AND CURRENT ROW
    ) AS moving_avg_diff
FROM diff
-- Filter left disabled so that all 20 rows are shown:
-- QUALIFY id > 14 AND moving_avg_diff < 0
ORDER BY id;
给出:
ID | VAL | LAG_VAL | DIFF_LAG_VAL | MOVING_AVG_DIFF |
---|---|---|---|---|
1 | 0 | | | |
2 | 0 | 0 | 0 | 0 |
3 | 0.001 | 0 | 0.001 | 0.0005 |
4 | 0 | 0.001 | -0.001 | 0 |
5 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | 0 |
7 | 0 | 0 | 0 | 0 |
8 | 0 | 0 | 0 | 0 |
9 | 0 | 0 | 0 | 0 |
10 | 0 | 0 | 0 | 0 |
11 | 0 | 0 | 0 | 0 |
12 | 0 | 0 | 0 | 0 |
13 | 0 | 0 | 0 | 0 |
14 | 0 | 0 | 0 | 0 |
15 | 0 | 0 | 0 | 0 |
16 | 0 | 0 | 0 | 0 |
17 | 0 | 0 | 0 | 0 |
18 | 0 | 0 | 0 | -0.000066 |
19 | 0 | 0 | 0 | 0 |
20 | 0 | 0 | 0 | 0 |
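所以问题出在第 4 行:那里的差值为负(-0.001)。在第 18 行之前,窗口同时包含第 3 行的 +0.001 和第 4 行的 -0.001,正负相互抵消;到第 18 行时,窗口(14 PRECEDING 加当前行,共 15 行,即第 4~18 行)不再包含第 3 行,只剩下 -0.001,于是平均值为 -0.001/15 ≈ -0.0000667。到第 19 行,窗口(第 5~19 行)里全是 0,结果又回到 0。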
可以用一个更小的玩具问题来展示:
-- Toy version of the repro: 6 inlined rows and a 3-row frame
-- (2 preceding rows plus the current row).
WITH testbug (id, val) AS (
    SELECT *
    FROM VALUES
        (1, 0.000::float),
        (2, 0.000::float),
        (3, 0.001::float),
        (4, 0.000::float),
        (5, 0.000::float),
        (6, 0.000::float)
),
lagged AS (
    -- Previous row's val in id order.
    SELECT
        id,
        val,
        LAG(val, 1) OVER (ORDER BY id) AS lag_val
    FROM testbug
),
diff AS (
    -- Change in val relative to the previous row.
    SELECT
        id,
        val,
        lag_val,
        val - lag_val AS diff_lag_val
    FROM lagged
)
SELECT
    id,
    val,
    lag_val,
    diff_lag_val,
    AVG(diff_lag_val) OVER (
        ORDER BY id
        ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS moving_avg_diff
FROM diff
ORDER BY id;
ID | VAL | LAG_VAL | DIFF_LAG_VAL | MOVING_AVG_DIFF |
---|---|---|---|---|
1 | 0 | | | |
2 | 0 | 0 | 0 | 0 |
3 | 0.001 | 0 | 0.001 | 0.0005 |
4 | 0 | 0.001 | -0.001 | 0 |
5 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | -0.0003333333333 |
检查数学:
-- Cross-check of AVG() on the 6-row toy data: the same 3-row frame is also
-- summed and counted, and the average is recomputed by hand as sum / count.
-- The testbug and diff CTEs are repeated here so the statement runs on its own.
WITH testbug (id, val) AS (
    SELECT *
    FROM VALUES
        (1, 0.000::float),
        (2, 0.000::float),
        (3, 0.001::float),
        (4, 0.000::float),
        (5, 0.000::float),
        (6, 0.000::float)
),
diff AS (
    SELECT
        *,
        LAG(val, 1) OVER (ORDER BY id) AS lag_val,
        val - lag_val AS diff_lag_val
    FROM testbug
)
SELECT
    *,
    SUM(diff_lag_val) OVER (ORDER BY id ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_sum_diff,
    -- COUNT(col) counts only the non-NULL differences inside the frame.
    COUNT(diff_lag_val) OVER (ORDER BY id ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_countdiff,
    AVG(diff_lag_val) OVER (ORDER BY id ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg_diff,
    -- Manual average built from the two aliases defined just above.
    DIV0(moving_sum_diff, moving_countdiff) AS man_avg
FROM diff
ORDER BY id;
我们现在添加手动步骤来手动计算平均值,我们得到:
ID | VAL | LAG_VAL | DIFF_LAG_VAL | MOVING_SUM_DIFF | MOVING_COUNTDIFF | MOVING_AVG_DIFF | MAN_AVG |
---|---|---|---|---|---|---|---|
1 | 0 | | | | 0 | | |
2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 0.001 | 0 | 0.001 | 0.001 | 2 | 0.0005 | 0.0005 |
4 | 0 | 0.001 | -0.001 | 0 | 3 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 |
6 | 0 | 0 | 0 | -0.001 | 3 | -0.0003333333333 | -0.0003333333333 |
所以数学是正确的。
最后的话:
所以这不是浮点数表示问题:数据库只是对窗口内实际存在的数值做了正确的运算。结果虽然不是您所期望的数字,但它是合理的——只是这个查询计算的可能并不是您真正想要的东西。
也就是正在发生的事情:
-- The same arithmetic in isolation. Each tuple can be read as
-- (sum over the frame, number of values in the frame); DIV0 divides the first
-- by the second.
select column1, column2, div0(column1,column2)
from values
-- A zero sum gives 0 whatever the count is.
( 0, 18 ),
-- Matches id = 3 in the 20-row example: 0.001 / 2 = 0.0005.
( 0.001, 2 ),
-- Matches id = 18 in the 20-row example: -0.001 spread over the 15-row frame.
( -0.001, 15 );
COLUMN1 | COLUMN2 | DIV0(COLUMN1,COLUMN2) |
---|---|---|
0 | 18 | 0 |
0.001 | 2 | 0.0005 |
-0.001 | 15 | -0.000066666 |