给定一个数字序列如何识别缺失的数字
Given a sequence of numbers how to identify the missing numbers
我想得到一个数字序列中所有缺失的数字。
只是想知道是否有比下面更好的方法?
-- Returns every x whose predecessor (in ascending x order) is more than 1
-- smaller. Note: this yields the first value AFTER each gap (8 and 10 for the
-- sample data below), not the missing numbers themselves.
SELECT x
FROM
(
SELECT x,
-- prev_x: the previous value in ascending order (no predecessor for the smallest x).
LAG(x,1) OVER ( ORDER BY x ) prev_x
FROM
-- Inline sample data: 1..6, 8, 10, 11 (7 and 9 are absent).
-- NOTE(review): the comma-separated subqueries presumably rely on BigQuery
-- Legacy SQL, where a comma in FROM acts as UNION ALL - confirm the dialect.
( SELECT * FROM
( SELECT 1 AS x ),
( SELECT 2 AS x ),
( SELECT 3 AS x ),
( SELECT 4 AS x ),
( SELECT 5 AS x ),
( SELECT 6 AS x ),
( SELECT 8 AS x ),
( SELECT 10 AS x ),
( SELECT 11 AS x )
)
)
WHERE x-prev_x > 1;
您的查询可以像这样写得更简洁:
-- Compact Postgres form of the gap check: lists each x whose predecessor
-- (in ascending order) is more than 1 smaller, i.e. the first value after a gap.
SELECT gaps.x
FROM (
    SELECT
        v.x,
        lag(v.x) OVER (ORDER BY v.x) AS prev_x  -- offset defaults to 1
    FROM (VALUES (1), (2), (3), (4), (5), (6), (8), (10), (11)) AS v(x)
) AS gaps
WHERE gaps.x - gaps.prev_x > 1;
这将返回紧跟在缺失值之后的下一个更大的值 (8, 10),而不是缺失值本身 (7, 9)。不过,缺失值本身当然并不存在于你的数据中,无法直接取得。
如果你知道序列中值的范围,那么你可以使用这个:
-- Lists every number in the known range <<min>>..<<max>> that has no matching
-- row in my_table (anti-join); replace both placeholders with the actual bounds.
SELECT series.x
FROM generate_series(<<min>>, <<max>>) AS series(x)
LEFT JOIN my_table AS tbl
    ON tbl.x = series.x
WHERE tbl.x IS NULL;
这将返回实际的缺失值。
如果不知道取值范围,需要添加子查询:
-- Derives the range from the data itself, then lists every number between
-- min(x) and max(x) that has no matching row in my_table.
SELECT series.x
FROM (
    SELECT
        min(x) AS lo,
        max(x) AS hi
    FROM my_table
) AS bounds
CROSS JOIN generate_series(bounds.lo, bounds.hi) AS series(x)
LEFT JOIN my_table AS tbl
    ON tbl.x = series.x
WHERE tbl.x IS NULL;
或者,使用 NOT EXISTS 代替 LEFT JOIN:
-- Anti-join written with NOT EXISTS: keeps each number between min(x) and
-- max(x) of my_table for which no row with that x exists.
SELECT series.x
FROM (
    SELECT
        min(x) AS lo,
        max(x) AS hi
    FROM my_table
) AS bounds
CROSS JOIN generate_series(bounds.lo, bounds.hi) AS series(x)
WHERE NOT EXISTS (
    SELECT 1
    FROM my_table AS tbl
    WHERE tbl.x = series.x
);
老实说!
任何其他可行的解决方案都会比问题中的方案更好——原因很简单——它是错误的!它根本不会返回缺失的数字!它显示的只是每个间隙之后的下一个数字。仅此而已(希望您会感激我让您大开眼界)
现在,关于更好的解决方案 - 有很多选择供您选择。
注意:以下选项仅适用于 BigQuery!
Option 1
BigQuery 标准 SQL - 请参阅 How to Enable Standard SQL
-- BigQuery Standard SQL: lists the numbers between MIN(x) and MAX(x) that do
-- not occur in YourTable.
WITH YourTable AS (
  -- inline sample data; might be an actual table instead
  SELECT x
  FROM UNNEST([1, 2, 3, 6, 8, 10, 11]) AS x
),
nums AS (
  -- every integer from the smallest to the largest x, one per row
  SELECT num
  FROM UNNEST(
    GENERATE_ARRAY(
      (SELECT MIN(x) FROM YourTable),
      (SELECT MAX(x) FROM YourTable)
    )
  ) AS num
)
SELECT n.num
FROM nums AS n
LEFT JOIN YourTable AS t
  ON n.num = t.x
WHERE t.x IS NULL  -- keep only the numbers without a match
ORDER BY n.num
Option 2
BigQuery Legacy SQL - 您可以尝试下面的查询(这里需要在 nums 表的 select 表达式中设置 start/min 和 end/max 值):
-- BigQuery Legacy SQL: left-joins a generated number list (nums, limited to
-- 1..11) to the inline sample data and keeps the numbers with no matching x.
-- The literals 11 (RPAD length and BETWEEN upper bound) and 1 (lower bound)
-- are the hard-coded range; adjust them to your data.
SELECT num FROM (
SELECT num FROM (
-- NOTE(review): presumably SPLIT/FLATTEN turn the 11-character padded string
-- into one row per character, which ROW_NUMBER() then numbers - confirm.
SELECT ROW_NUMBER() OVER() AS num, *
FROM (FLATTEN((SELECT SPLIT(RPAD('', 11, '.'),'') AS h FROM (SELECT NULL)), h))
) WHERE num BETWEEN 1 AND 11
) AS nums
LEFT JOIN (
-- Inline sample data: 1, 2, 3, 6, 8, 10, 11.
SELECT x FROM
(SELECT 1 AS x),
(SELECT 2 AS x),
(SELECT 3 AS x),
(SELECT 6 AS x),
(SELECT 8 AS x),
(SELECT 10 AS x),
(SELECT 11 AS x)
) AS YourTable
ON num = x
-- Anti-join: keep only the generated numbers without a match.
WHERE x IS NULL
Option 3
BigQuery Legacy SQL - 如果您不想依赖最小值和最大值,也不想每次都手动设置这些值 - 您可以使用以下解决方案 - 它只需要设置一个足够高的最大值以适应您的预期增长(例如我设为 1000)
-- BigQuery Legacy SQL: generates candidate numbers up to a fixed ceiling
-- (1000 here), left-joins them to YourTable and keeps the candidates that have
-- no matching x and lie between MIN(x) and MAX(x) of YourTable. Only the
-- ceiling literal has to be large enough for the data.
SELECT num FROM (
SELECT num FROM (
-- NOTE(review): presumably SPLIT/FLATTEN turn the 1000-character padded string
-- into one row per character, which ROW_NUMBER() then numbers - confirm.
SELECT ROW_NUMBER() OVER() AS num, *
FROM (FLATTEN((SELECT SPLIT(RPAD('', 1000, '.'),'') AS h FROM (SELECT NULL)), h))
) WHERE num BETWEEN 1 AND 1000
) AS nums
LEFT JOIN YourTable
ON num = x
WHERE x IS NULL
-- Restrict the candidates to the range actually covered by the data.
AND num BETWEEN (SELECT MIN(x) FROM YourTable) AND (SELECT MAX(x) FROM YourTable)
Option 4 (for some reason - my favorite so far)
BigQuery 标准 SQL - 没有显式连接
-- BigQuery Standard SQL: expands every gap between consecutive values into the
-- missing numbers, without an explicit join against a number list.
WITH YourTable AS (
  -- inline sample data; might be an actual table instead
  SELECT 1 AS x UNION ALL
  SELECT 2 AS x UNION ALL
  SELECT 3 AS x UNION ALL
  SELECT 6 AS x UNION ALL
  SELECT 8 AS x UNION ALL
  SELECT 10 AS x UNION ALL
  SELECT 11 AS x
)
SELECT num
FROM (
  -- pair each value with its successor in ascending order
  SELECT x, LEAD(x) OVER (ORDER BY x) AS next_x
  FROM YourTable
),
-- GENERATE_ARRAY(x + 1, next_x - 1) yields the numbers strictly between a
-- value and its successor.
UNNEST(GENERATE_ARRAY(x + 1, next_x - 1)) AS num
WHERE next_x - x > 1
-- Sort by the output value itself: ordering by x alone leaves the order of the
-- numbers inside one gap (e.g. 4 and 5 between 3 and 6) unspecified.
ORDER BY num
Postgres 中最短的解决方案是使用标准 SQL 的 EXCEPT:
-- Generates every integer from min(x) to max(x) and removes the values that
-- are present in tbl; what remains are the missing numbers.
WITH tbl(x) AS (SELECT unnest ('{1,2,3,4,5,6,8,10,11}'::int[]))
-- the CTE provides a temp table - might be an actual table instead
SELECT generate_series(min(x), max(x)) FROM tbl
EXCEPT ALL
-- TABLE tbl is short syntax for SELECT * FROM tbl
TABLE tbl;
集合返回函数 unnest() 是 Postgres 特有的,这里只是用它以最短的语法把这组数字作为表提供出来。
也适用于数据中的重复值或 NULL 值。
TABLE tbl 是 SELECT * FROM tbl 的简短语法(标准 SQL!)。
相关(更多解释):
- Select rows which are not present in other table
我想得到一个数字序列中所有缺失的数字。
只是想知道是否有比下面更好的方法?
-- Returns every x whose predecessor (in ascending x order) is more than 1
-- smaller. Note: this yields the first value AFTER each gap (8 and 10 for the
-- sample data below), not the missing numbers themselves.
SELECT x
FROM
(
SELECT x,
-- prev_x: the previous value in ascending order (no predecessor for the smallest x).
LAG(x,1) OVER ( ORDER BY x ) prev_x
FROM
-- Inline sample data: 1..6, 8, 10, 11 (7 and 9 are absent).
-- NOTE(review): the comma-separated subqueries presumably rely on BigQuery
-- Legacy SQL, where a comma in FROM acts as UNION ALL - confirm the dialect.
( SELECT * FROM
( SELECT 1 AS x ),
( SELECT 2 AS x ),
( SELECT 3 AS x ),
( SELECT 4 AS x ),
( SELECT 5 AS x ),
( SELECT 6 AS x ),
( SELECT 8 AS x ),
( SELECT 10 AS x ),
( SELECT 11 AS x )
)
)
WHERE x-prev_x > 1;
您的查询可以像这样写得更简洁:
-- Compact Postgres form of the gap check: lists each x whose predecessor
-- (in ascending order) is more than 1 smaller, i.e. the first value after a gap.
SELECT gaps.x
FROM (
    SELECT
        v.x,
        lag(v.x) OVER (ORDER BY v.x) AS prev_x  -- offset defaults to 1
    FROM (VALUES (1), (2), (3), (4), (5), (6), (8), (10), (11)) AS v(x)
) AS gaps
WHERE gaps.x - gaps.prev_x > 1;
这将返回紧跟在缺失值之后的下一个更大的值 (8, 10),而不是缺失值本身 (7, 9)。不过,缺失值本身当然并不存在于你的数据中,无法直接取得。
如果你知道序列中值的范围,那么你可以使用这个:
-- Lists every number in the known range <<min>>..<<max>> that has no matching
-- row in my_table (anti-join); replace both placeholders with the actual bounds.
SELECT series.x
FROM generate_series(<<min>>, <<max>>) AS series(x)
LEFT JOIN my_table AS tbl
    ON tbl.x = series.x
WHERE tbl.x IS NULL;
这将返回实际的缺失值。
如果不知道取值范围,需要添加子查询:
-- Derives the range from the data itself, then lists every number between
-- min(x) and max(x) that has no matching row in my_table.
SELECT series.x
FROM (
    SELECT
        min(x) AS lo,
        max(x) AS hi
    FROM my_table
) AS bounds
CROSS JOIN generate_series(bounds.lo, bounds.hi) AS series(x)
LEFT JOIN my_table AS tbl
    ON tbl.x = series.x
WHERE tbl.x IS NULL;
或者,使用 NOT EXISTS 代替 LEFT JOIN:
-- Anti-join written with NOT EXISTS: keeps each number between min(x) and
-- max(x) of my_table for which no row with that x exists.
SELECT series.x
FROM (
    SELECT
        min(x) AS lo,
        max(x) AS hi
    FROM my_table
) AS bounds
CROSS JOIN generate_series(bounds.lo, bounds.hi) AS series(x)
WHERE NOT EXISTS (
    SELECT 1
    FROM my_table AS tbl
    WHERE tbl.x = series.x
);
老实说!
任何其他可行的解决方案都会比问题中的方案更好——原因很简单——它是错误的!它根本不会返回缺失的数字!它显示的只是每个间隙之后的下一个数字。仅此而已(希望您会感激我让您大开眼界)
现在,关于更好的解决方案 - 有很多选择供您选择。
注意:以下选项仅适用于 BigQuery!
Option 1
BigQuery 标准 SQL - 请参阅 How to Enable Standard SQL
-- BigQuery Standard SQL: lists the numbers between MIN(x) and MAX(x) that do
-- not occur in YourTable.
WITH YourTable AS (
  -- inline sample data; might be an actual table instead
  SELECT x
  FROM UNNEST([1, 2, 3, 6, 8, 10, 11]) AS x
),
nums AS (
  -- every integer from the smallest to the largest x, one per row
  SELECT num
  FROM UNNEST(
    GENERATE_ARRAY(
      (SELECT MIN(x) FROM YourTable),
      (SELECT MAX(x) FROM YourTable)
    )
  ) AS num
)
SELECT n.num
FROM nums AS n
LEFT JOIN YourTable AS t
  ON n.num = t.x
WHERE t.x IS NULL  -- keep only the numbers without a match
ORDER BY n.num
Option 2
BigQuery Legacy SQL - 您可以尝试下面的查询(这里需要在 nums 表的 select 表达式中设置 start/min 和 end/max 值):
SELECT num FROM (
SELECT num FROM (
SELECT ROW_NUMBER() OVER() AS num, *
FROM (FLATTEN((SELECT SPLIT(RPAD('', 11, '.'),'') AS h FROM (SELECT NULL)), h))
) WHERE num BETWEEN 1 AND 11
) AS nums
LEFT JOIN (
SELECT x FROM
(SELECT 1 AS x),
(SELECT 2 AS x),
(SELECT 3 AS x),
(SELECT 6 AS x),
(SELECT 8 AS x),
(SELECT 10 AS x),
(SELECT 11 AS x)
) AS YourTable
ON num = x
WHERE x IS NULL
Option 3
BigQuery Legacy SQL - 如果您不想依赖最小值和最大值,也不想每次都手动设置这些值 - 您可以使用以下解决方案 - 它只需要设置一个足够高的最大值以适应您的预期增长(例如我设为 1000)
-- BigQuery Legacy SQL: generates candidate numbers up to a fixed ceiling
-- (1000 here), left-joins them to YourTable and keeps the candidates that have
-- no matching x and lie between MIN(x) and MAX(x) of YourTable. Only the
-- ceiling literal has to be large enough for the data.
SELECT num FROM (
SELECT num FROM (
-- NOTE(review): presumably SPLIT/FLATTEN turn the 1000-character padded string
-- into one row per character, which ROW_NUMBER() then numbers - confirm.
SELECT ROW_NUMBER() OVER() AS num, *
FROM (FLATTEN((SELECT SPLIT(RPAD('', 1000, '.'),'') AS h FROM (SELECT NULL)), h))
) WHERE num BETWEEN 1 AND 1000
) AS nums
LEFT JOIN YourTable
ON num = x
WHERE x IS NULL
-- Restrict the candidates to the range actually covered by the data.
AND num BETWEEN (SELECT MIN(x) FROM YourTable) AND (SELECT MAX(x) FROM YourTable)
Option 4 (for some reason - my favorite so far)
BigQuery 标准 SQL - 没有显式连接
-- BigQuery Standard SQL: expands every gap between consecutive values into the
-- missing numbers, without an explicit join against a number list.
WITH YourTable AS (
  -- inline sample data; might be an actual table instead
  SELECT 1 AS x UNION ALL
  SELECT 2 AS x UNION ALL
  SELECT 3 AS x UNION ALL
  SELECT 6 AS x UNION ALL
  SELECT 8 AS x UNION ALL
  SELECT 10 AS x UNION ALL
  SELECT 11 AS x
)
SELECT num
FROM (
  -- pair each value with its successor in ascending order
  SELECT x, LEAD(x) OVER (ORDER BY x) AS next_x
  FROM YourTable
),
-- GENERATE_ARRAY(x + 1, next_x - 1) yields the numbers strictly between a
-- value and its successor.
UNNEST(GENERATE_ARRAY(x + 1, next_x - 1)) AS num
WHERE next_x - x > 1
-- Sort by the output value itself: ordering by x alone leaves the order of the
-- numbers inside one gap (e.g. 4 and 5 between 3 and 6) unspecified.
ORDER BY num
Postgres 中最短的解决方案是使用标准 SQL 的 EXCEPT:
-- Generates every integer from min(x) to max(x) and removes the values that
-- are present in tbl; what remains are the missing numbers.
WITH tbl(x) AS (SELECT unnest ('{1,2,3,4,5,6,8,10,11}'::int[]))
-- the CTE provides a temp table - might be an actual table instead
SELECT generate_series(min(x), max(x)) FROM tbl
EXCEPT ALL
-- TABLE tbl is short syntax for SELECT * FROM tbl
TABLE tbl;
集合返回函数 unnest() 是 Postgres 特有的,这里只是用它以最短的语法把这组数字作为表提供出来。
也适用于数据中的重复值或 NULL 值。
TABLE tbl 是 SELECT * FROM tbl 的简短语法(标准 SQL!)。
相关(更多解释):
- Select rows which are not present in other table