在bigquery中比较两个表的有效方法
efficient way to compare two tables in bigquery
我有兴趣比较两个 table 是否包含相同的数据。
我可以这样做:
#standardSQL
-- Lists the (key1, key2) pairs of table1 whose column1 or col2 differs from
-- the table2 row with the same keys.
-- Caveat: a key that has no partner row in table2 yields NULL differences,
-- and NULL <> 0 is not true, so such keys are not reported; keys that exist
-- only in table2 never appear because of the LEFT JOIN.
WITH column_deltas AS (
  SELECT
    t1.key1,
    t1.key2,
    t1.column1 - t2.column1 AS col1,
    t1.col2 - t2.col2 AS col2
  FROM `table1` AS t1
  LEFT JOIN `table2` AS t2
    ON t1.key1 = t2.key1
    AND t1.key2 = t2.key2
)
SELECT
  key1,
  key2
FROM column_deltas
WHERE col1 <> 0
  OR col2 <> 0
但是当我想比较所有数字列时,这有点困难,尤其是当我想对多个 table 组合进行比较时。
因此我的问题是:是否有人知道可以遍历所有数字列并将结果集限制为这些差异不为零的那些键?
您需要指定哪些是数字列,不过只要比较这些列的(JSON 字符串)表示形式,就可以快速完成比较:
#standardSQL
-- Demo data: two one-row tables that share id = 1 but disagree on n2.
WITH table_a AS (
  SELECT 1 AS id, 2 AS n1, 3 AS n2
),
table_b AS (
  SELECT 1 AS id, 2 AS n1, 4 AS n2
)
-- Serialize the numeric columns of each side into a JSON array string and
-- keep the ids whose serializations differ.
SELECT id
FROM table_a AS lhs
INNER JOIN table_b AS rhs
  USING (id)
WHERE TO_JSON_STRING([lhs.n1, lhs.n2]) <> TO_JSON_STRING([rhs.n1, rhs.n2])
首先,我想指出您原始查询中存在的问题:
主要问题是 1) 使用 LEFT JOIN ; 2) 使用 col != 0
下面是应该如何修改它以真正捕获两个表的所有差异
请分别运行您的原始查询和下面的查询——您应该能看出两者的区别:
#standardSQL
-- Returns every (key1, key2) whose row differs between table1 and table2,
-- including keys that exist in only one of the two tables.
SELECT
  COALESCE(table1.key1, table2.key1) AS key1,
  COALESCE(table1.key2, table2.key2) AS key2
FROM `table1` AS table1
-- FULL OUTER JOIN (not LEFT JOIN) so rows missing from either side survive.
FULL OUTER JOIN `table2` AS table2
  ON table1.key1 = table2.key1
  AND table1.key2 = table2.key2
WHERE
  -- A NULL key on one side means the join found no partner row on that side.
  table1.key1 IS NULL
  OR table2.key1 IS NULL
  -- NULL-safe comparison: NULL vs NULL is equal, NULL vs a value is different.
  -- Unlike subtracting the columns, this also works for non-numeric types and
  -- does not report matched rows where both sides are NULL.
  OR table1.column1 IS DISTINCT FROM table2.column1
  OR table1.col2 IS DISTINCT FROM table2.col2
或者,您也可以用下面的虚拟数据分别运行您的原始版本和上面的版本,以查看差异:
#standardSQL
-- Self-contained demo of the two-way comparison on dummy data.
-- Keys (1, 1) differ in col2, (3, 3) exists only in table1 and (4, 4) only in
-- table2, so those three keys are returned; (2, 2) is identical and is not.
WITH `table1` AS (
  SELECT 1 AS key1, 1 AS key2, 1 AS column1, 2 AS col2 UNION ALL
  SELECT 2, 2, 3, 4 UNION ALL
  SELECT 3, 3, 5, 6
), `table2` AS (
  SELECT 1 AS key1, 1 AS key2, 1 AS column1, 29 AS col2 UNION ALL
  SELECT 2, 2, 3, 4 UNION ALL
  SELECT 4, 4, 7, 8
)
SELECT
  COALESCE(table1.key1, table2.key1) AS key1,
  COALESCE(table1.key2, table2.key2) AS key2
FROM `table1` AS table1
-- FULL OUTER JOIN (not LEFT JOIN) so rows missing from either side survive.
FULL OUTER JOIN `table2` AS table2
  ON table1.key1 = table2.key1
  AND table1.key2 = table2.key2
WHERE
  -- A NULL key on one side means the join found no partner row on that side.
  table1.key1 IS NULL
  OR table2.key1 IS NULL
  -- NULL-safe comparison: NULL vs NULL is equal, NULL vs a value is different.
  OR table1.column1 IS DISTINCT FROM table2.column1
  OR table1.col2 IS DISTINCT FROM table2.col2
其次,下面的写法可以大大简化您的整个查询:
#standardSQL
-- Reports every (key1, key2) whose row differs between the two tables by
-- comparing the JSON serialization of each side's whole row.
SELECT
  COALESCE(t1.key1, t2.key1) AS key1,
  COALESCE(t1.key2, t2.key2) AS key2
FROM `table1` AS t1
FULL OUTER JOIN `table2` AS t2
  ON t1.key1 = t2.key1
  AND t1.key2 = t2.key2
WHERE TO_JSON_STRING(t1) <> TO_JSON_STRING(t2)
您可以使用与上述相同的虚拟数据示例对其进行测试
注意:在此解决方案中,您不需要选择特定的列 - 它只是比较所有列!但是如果你只需要比较特定的列——你仍然需要像下面的例子那样挑选它们
#standardSQL
-- Same key-level comparison, restricted to chosen columns: each side's
-- column1 and col2 are packed into a struct and compared as JSON strings.
SELECT
  COALESCE(t1.key1, t2.key1) AS key1,
  COALESCE(t1.key2, t2.key2) AS key2
FROM `table1` AS t1
FULL OUTER JOIN `table2` AS t2
  ON t1.key1 = t2.key1
  AND t1.key2 = t2.key2
WHERE TO_JSON_STRING((t1.column1, t1.col2)) <> TO_JSON_STRING((t2.column1, t2.col2))
在标准 SQL 中,我们发现对两个 EXCEPT DISTINCT 查询的结果使用 UNION ALL 可以满足我们的用例:
-- Symmetric difference of the two tables: rows found in exactly one of them.
WITH only_in_table1 AS (
  SELECT * FROM table1
  EXCEPT DISTINCT
  SELECT * FROM table2
),
only_in_table2 AS (
  SELECT * FROM table2
  EXCEPT DISTINCT
  SELECT * FROM table1
)
SELECT * FROM only_in_table1
UNION ALL
SELECT * FROM only_in_table2
这会在两个方向产生差异:
- table1 中有而 table2 中没有的行
- table2 中有而 table1 中没有的行
注意事项:
- table1 和 table2 必须具有相同的列数,并且列的顺序和类型必须相同。
- 这不能直接用于 STRUCT 或 ARRAY 数据类型。您应该先使用 UNNEST 展开,或使用 TO_JSON_STRING 来转换这些数据类型。
- 这也不能直接用于 GEOGRAPHY 类型,您必须先使用 ST_AsText 将其转换为文本。
我有兴趣比较两个 table 是否包含相同的数据。
我可以这样做:
#standardSQL
-- Lists the (key1, key2) pairs of table1 whose column1 or col2 differs from
-- the table2 row with the same keys.
-- Caveat: a key that has no partner row in table2 yields NULL differences,
-- and NULL <> 0 is not true, so such keys are not reported; keys that exist
-- only in table2 never appear because of the LEFT JOIN.
WITH column_deltas AS (
  SELECT
    t1.key1,
    t1.key2,
    t1.column1 - t2.column1 AS col1,
    t1.col2 - t2.col2 AS col2
  FROM `table1` AS t1
  LEFT JOIN `table2` AS t2
    ON t1.key1 = t2.key1
    AND t1.key2 = t2.key2
)
SELECT
  key1,
  key2
FROM column_deltas
WHERE col1 <> 0
  OR col2 <> 0
但是当我想比较所有数字列时,这有点困难,尤其是当我想对多个 table 组合进行比较时。
因此我的问题是:是否有人知道可以遍历所有数字列并将结果集限制为这些差异不为零的那些键?
您需要指定哪些是数字列,不过只要比较这些列的(JSON 字符串)表示形式,就可以快速完成比较:
#standardSQL
-- Demo data: two one-row tables that share id = 1 but disagree on n2.
WITH table_a AS (
  SELECT 1 AS id, 2 AS n1, 3 AS n2
),
table_b AS (
  SELECT 1 AS id, 2 AS n1, 4 AS n2
)
-- Serialize the numeric columns of each side into a JSON array string and
-- keep the ids whose serializations differ.
SELECT id
FROM table_a AS lhs
INNER JOIN table_b AS rhs
  USING (id)
WHERE TO_JSON_STRING([lhs.n1, lhs.n2]) <> TO_JSON_STRING([rhs.n1, rhs.n2])
首先,我想指出您原始查询中存在的问题:
主要问题是 1) 使用 LEFT JOIN ; 2) 使用 col != 0
下面是应该如何修改它以真正捕获两个表的所有差异
请分别运行您的原始查询和下面的查询——您应该能看出两者的区别:
#standardSQL
-- Returns every (key1, key2) whose row differs between table1 and table2,
-- including keys that exist in only one of the two tables.
SELECT
  COALESCE(table1.key1, table2.key1) AS key1,
  COALESCE(table1.key2, table2.key2) AS key2
FROM `table1` AS table1
-- FULL OUTER JOIN (not LEFT JOIN) so rows missing from either side survive.
FULL OUTER JOIN `table2` AS table2
  ON table1.key1 = table2.key1
  AND table1.key2 = table2.key2
WHERE
  -- A NULL key on one side means the join found no partner row on that side.
  table1.key1 IS NULL
  OR table2.key1 IS NULL
  -- NULL-safe comparison: NULL vs NULL is equal, NULL vs a value is different.
  -- Unlike subtracting the columns, this also works for non-numeric types and
  -- does not report matched rows where both sides are NULL.
  OR table1.column1 IS DISTINCT FROM table2.column1
  OR table1.col2 IS DISTINCT FROM table2.col2
或者,您也可以用下面的虚拟数据分别运行您的原始版本和上面的版本,以查看差异:
#standardSQL
-- Self-contained demo of the two-way comparison on dummy data.
-- Keys (1, 1) differ in col2, (3, 3) exists only in table1 and (4, 4) only in
-- table2, so those three keys are returned; (2, 2) is identical and is not.
WITH `table1` AS (
  SELECT 1 AS key1, 1 AS key2, 1 AS column1, 2 AS col2 UNION ALL
  SELECT 2, 2, 3, 4 UNION ALL
  SELECT 3, 3, 5, 6
), `table2` AS (
  SELECT 1 AS key1, 1 AS key2, 1 AS column1, 29 AS col2 UNION ALL
  SELECT 2, 2, 3, 4 UNION ALL
  SELECT 4, 4, 7, 8
)
SELECT
  COALESCE(table1.key1, table2.key1) AS key1,
  COALESCE(table1.key2, table2.key2) AS key2
FROM `table1` AS table1
-- FULL OUTER JOIN (not LEFT JOIN) so rows missing from either side survive.
FULL OUTER JOIN `table2` AS table2
  ON table1.key1 = table2.key1
  AND table1.key2 = table2.key2
WHERE
  -- A NULL key on one side means the join found no partner row on that side.
  table1.key1 IS NULL
  OR table2.key1 IS NULL
  -- NULL-safe comparison: NULL vs NULL is equal, NULL vs a value is different.
  OR table1.column1 IS DISTINCT FROM table2.column1
  OR table1.col2 IS DISTINCT FROM table2.col2
其次,下面的写法可以大大简化您的整个查询:
#standardSQL
-- Reports every (key1, key2) whose row differs between the two tables by
-- comparing the JSON serialization of each side's whole row.
SELECT
  COALESCE(t1.key1, t2.key1) AS key1,
  COALESCE(t1.key2, t2.key2) AS key2
FROM `table1` AS t1
FULL OUTER JOIN `table2` AS t2
  ON t1.key1 = t2.key1
  AND t1.key2 = t2.key2
WHERE TO_JSON_STRING(t1) <> TO_JSON_STRING(t2)
您可以使用与上述相同的虚拟数据示例对其进行测试
注意:在此解决方案中,您不需要选择特定的列 - 它只是比较所有列!但是如果你只需要比较特定的列——你仍然需要像下面的例子那样挑选它们
#standardSQL
-- Same key-level comparison, restricted to chosen columns: each side's
-- column1 and col2 are packed into a struct and compared as JSON strings.
SELECT
  COALESCE(t1.key1, t2.key1) AS key1,
  COALESCE(t1.key2, t2.key2) AS key2
FROM `table1` AS t1
FULL OUTER JOIN `table2` AS t2
  ON t1.key1 = t2.key1
  AND t1.key2 = t2.key2
WHERE TO_JSON_STRING((t1.column1, t1.col2)) <> TO_JSON_STRING((t2.column1, t2.col2))
在标准 SQL 中,我们发现对两个 EXCEPT DISTINCT 查询的结果使用 UNION ALL 可以满足我们的用例:
-- Symmetric difference of the two tables: rows found in exactly one of them.
WITH only_in_table1 AS (
  SELECT * FROM table1
  EXCEPT DISTINCT
  SELECT * FROM table2
),
only_in_table2 AS (
  SELECT * FROM table2
  EXCEPT DISTINCT
  SELECT * FROM table1
)
SELECT * FROM only_in_table1
UNION ALL
SELECT * FROM only_in_table2
这会在两个方向产生差异:
- table1 中有而 table2 中没有的行
- table2 中有而 table1 中没有的行
注意事项:
- table1 和 table2 必须具有相同的列数,并且列的顺序和类型必须相同。
- 这不能直接用于 STRUCT 或 ARRAY 数据类型。您应该先使用 UNNEST 展开,或使用 TO_JSON_STRING 来转换这些数据类型。
- 这也不能直接用于 GEOGRAPHY 类型,您必须先使用 ST_AsText 将其转换为文本。