删除 BigQuery 分区中的重复行
Deduplicate rows in a BigQuery partition
我有一个 table,其中有很多重复的行——但我只想一次对一个分区的行进行去重。
我该怎么做?
例如,您可以从按日期分区并填充 1 到 5 的随机整数的 table 开始:
-- Demo fixture: (re)create temp.many_random, partitioned by the DATE column d.
-- Produces 100 rows dated 2018-10-01 and 100 rows dated today; random_int is
-- whatever fhoffa.x.random_int(0,5) returns for each row.
CREATE OR REPLACE TABLE `temp.many_random`
PARTITION BY d
AS
SELECT
  d,
  fhoffa.x.random_int(0,5) AS random_int
FROM UNNEST([DATE('2018-10-01'), CURRENT_DATE()]) AS d
-- one output row per (date, n) pair: 2 dates x 100 sequence values
CROSS JOIN UNNEST(GENERATE_ARRAY(1, 100)) AS n
让我们看看现有的 table 中有哪些数据:
-- Count how many rows share each (d, random_int) pair; c > 1 means duplicates.
-- Columns are named in GROUP BY / ORDER BY instead of referenced by position,
-- so editing the SELECT list cannot silently change the grouping or ordering.
SELECT
  d,
  random_int,
  COUNT(*) AS c
FROM `temp.many_random`
GROUP BY d, random_int
ORDER BY d, random_int
重复了很多!
我们可以使用 MERGE 和 SELECT DISTINCT *,通过这样的查询对一个分区进行重复数据删除:
-- Deduplicate only today's partition of temp.many_random in one statement.
-- ON FALSE means no source row ever pairs with a target row, so:
--   * every target row is "not matched by source"; those dated today are deleted;
--   * every source row is "not matched by target"; the distinct rows are inserted.
-- The date predicate in the subquery and in the DELETE clause must stay identical.
MERGE INTO `temp.many_random` AS t
USING (
  SELECT DISTINCT *
  FROM `temp.many_random`
  WHERE d = CURRENT_DATE()
) AS deduped
ON FALSE
WHEN NOT MATCHED BY SOURCE AND t.d = CURRENT_DATE() THEN
  DELETE
WHEN NOT MATCHED BY TARGET THEN
  INSERT ROW
那么最终结果是这样的:
我们需要确保 SELECT 和 THEN DELETE 行中的日期相同。这将删除该分区上的所有行,并插入 SELECT DISTINCT 中的所有行。
灵感来自:
要删除整个 table,请参阅:
附加答案 - 对于不能使用 DISTINCT 的复杂行:
-- Deduplicate the 2018-10-01 partition without using SELECT DISTINCT.
-- One arbitrary whole row is kept per group, then the partition's rows are
-- deleted and the kept rows re-inserted (ON FALSE: nothing ever matches).
MERGE INTO `temp.many_random` AS t
USING (
  -- expand the kept row struct back into ordinary columns
  SELECT kept_row.*
  FROM (
    SELECT ANY_VALUE(r) AS kept_row
    FROM `temp.many_random` AS r
    WHERE r.d = '2018-10-01'
    -- grouping columns act as the row identity (the original note says "id")
    GROUP BY r.d, r.random_int
  )
) AS deduped
ON FALSE
-- remove every existing row of the partition, duplicates included
WHEN NOT MATCHED BY SOURCE AND t.d = '2018-10-01' THEN
  DELETE
WHEN NOT MATCHED BY TARGET THEN
  INSERT ROW
您还可以对一系列分区进行重复数据删除。
-- WARNING: back up the table before this operation.
-- For a large, timestamp-partitioned table.
-- -------------------------------------------
-- De-duplicates the rows whose stamp lies in the inclusive range
-- [dt_start, dt_end], using surrogate_key as the unique id.
-- NOTE(review): rows with a NULL surrogate_key form a single group and would
-- collapse to one row - confirm the key is never NULL.
-- -------------------------------------------
DECLARE dt_start DEFAULT TIMESTAMP("2019-09-17T00:00:00", "America/Los_Angeles");
DECLARE dt_end DEFAULT TIMESTAMP("2019-09-22T00:00:00", "America/Los_Angeles");

MERGE INTO `my_project`.`data_set`.`the_table` AS dest
USING (
  SELECT one_row.*
  FROM (
    -- keep one arbitrary row per surrogate_key
    SELECT ARRAY_AGG(src LIMIT 1)[OFFSET(0)] AS one_row
    FROM `my_project`.`data_set`.`the_table` AS src
    WHERE src.stamp >= dt_start AND src.stamp <= dt_end
    GROUP BY src.surrogate_key
  )
) AS deduped
ON FALSE
-- remove all existing data in the partition range
WHEN NOT MATCHED BY SOURCE
  AND dest.stamp >= dt_start AND dest.stamp <= dt_end THEN
  DELETE
WHEN NOT MATCHED BY TARGET THEN
  INSERT ROW
致谢:https://gist.github.com/hui-zheng/f7e972bcbe9cde0c6cb6318f7270b67a
我有一个 table,其中有很多重复的行——但我只想一次对一个分区的行进行去重。
我该怎么做?
例如,您可以从按日期分区并填充 1 到 5 的随机整数的 table 开始:
-- Demo fixture: (re)create temp.many_random, partitioned by the DATE column d,
-- with 100 rows for 2018-10-01 and 100 rows for today.
CREATE OR REPLACE TABLE `temp.many_random`
PARTITION BY d
AS
WITH seq AS (
  -- 100-element driver sequence; only the row count matters
  SELECT n
  FROM UNNEST(GENERATE_ARRAY(1, 100)) AS n
)
SELECT
  DATE('2018-10-01') AS d,
  fhoffa.x.random_int(0,5) AS random_int
FROM seq
UNION ALL
SELECT
  CURRENT_DATE() AS d,
  fhoffa.x.random_int(0,5) AS random_int
FROM seq
让我们看看现有的 table:
中有哪些数据
-- Count how many rows share each (d, random_int) pair; c > 1 means duplicates.
-- Grouping and ordering use column names rather than ordinal positions.
SELECT
  d,
  random_int,
  COUNT(*) AS c
FROM `temp.many_random`
GROUP BY d, random_int
ORDER BY d, random_int
重复了很多!
我们可以使用 MERGE 和 SELECT DISTINCT *,通过这样的查询对一个分区进行重复数据删除:
-- Rewrite today's partition of temp.many_random with its distinct rows only.
-- Because the merge condition is the constant FALSE, source and target rows
-- never pair up: today's target rows are deleted and every distinct source
-- row is inserted. Both CURRENT_DATE() filters must refer to the same date.
MERGE `temp.many_random` AS target
USING (
  SELECT DISTINCT *
  FROM `temp.many_random`
  WHERE d = CURRENT_DATE()
) AS distinct_rows
ON FALSE
WHEN NOT MATCHED BY SOURCE
  AND target.d = CURRENT_DATE()
  THEN DELETE
WHEN NOT MATCHED BY TARGET
  THEN INSERT ROW
那么最终结果是这样的:
我们需要确保 SELECT 和 THEN DELETE 行中的日期相同。这将删除该分区上的所有行,并插入 SELECT DISTINCT 中的所有行。
灵感来自:
要删除整个 table,请参阅:
附加答案 - 对于不能使用 DISTINCT 的复杂行:
-- Deduplicate the 2018-10-01 partition by keeping one arbitrary row per
-- (d, random_int) group, for rows where SELECT DISTINCT cannot be used.
MERGE `temp.many_random` AS target
USING (
  SELECT survivor.*
  FROM (
    -- ANY_VALUE over the table alias returns one whole row per group
    SELECT ANY_VALUE(src) AS survivor
    FROM `temp.many_random` AS src
    WHERE src.d = '2018-10-01'
    GROUP BY src.d, src.random_int  -- the columns that identify a row ("id")
  )
) AS survivors
ON FALSE
WHEN NOT MATCHED BY SOURCE
  AND target.d = '2018-10-01'
  -- delete every existing row of the partition, duplicates included
  THEN DELETE
WHEN NOT MATCHED BY TARGET
  THEN INSERT ROW
您还可以对一系列分区进行重复数据删除。
-- WARNING: back up the table before this operation.
-- For a large, timestamp-partitioned table.
-- -------------------------------------------
-- De-duplicate the rows of a given stamp range of a partitioned table,
-- using surrogate_key as the unique id. Both range bounds are inclusive.
-- -------------------------------------------
DECLARE dt_start DEFAULT TIMESTAMP("2019-09-17T00:00:00", "America/Los_Angeles");
DECLARE dt_end DEFAULT TIMESTAMP("2019-09-22T00:00:00", "America/Los_Angeles");

MERGE INTO `my_project`.`data_set`.`the_table` AS target_rows
USING (
  SELECT first_seen.*
  FROM (
    -- one arbitrary representative row per surrogate_key within the range
    SELECT ARRAY_AGG(candidate LIMIT 1)[OFFSET(0)] AS first_seen
    FROM `my_project`.`data_set`.`the_table` AS candidate
    WHERE candidate.stamp BETWEEN dt_start AND dt_end
    GROUP BY candidate.surrogate_key
  )
) AS kept_rows
ON FALSE
WHEN NOT MATCHED BY SOURCE
  -- remove all existing data in the partition range
  AND target_rows.stamp BETWEEN dt_start AND dt_end
  THEN DELETE
WHEN NOT MATCHED BY TARGET
  THEN INSERT ROW
致谢:https://gist.github.com/hui-zheng/f7e972bcbe9cde0c6cb6318f7270b67a