在 BigQuery 中使用标准 SQL 中的 DISTINCT 删除重复项
Removing duplicates with DISTINCT in standard SQL within BigQuery
我正在尝试根据在查询中创建的列 "alpha_ssc_key" 从表中去除重复项。但是,当我运行这个查询时,它只是返回所有结果(而不是消除 "alpha_ssc_key" 重复的行)。任何帮助将不胜感激!
这是在 BigQuery 中完成的。
-- Question's original attempt: build a composite key (alpha_ssc_key) from
-- several columns and try to de-duplicate on it.
-- DISTINCT applies to the whole select list, not only to alpha_ssc_key, so
-- rows that share a key but differ in any other column are all returned.
WITH ssc_test_view AS (
    SELECT DISTINCT
        CONCAT(
            CAST(date AS STRING),
            ciq_id,
            CAST(quantity AS STRING),
            CAST(cost_basis AS STRING),
            fund,
            security,
            class,
            inv_type,
            share_type
        ) AS alpha_ssc_key,
        _metadata_created_at AS file_date,
        realized_gain_loss,
        cusip,
        acq_txn_no,
        acquisition_date,
        security,
        company,
        ticker,
        ciq_id,
        class,
        inv_type,
        dis_txn_no,
        quantity,
        categorization,
        transaction_type,
        cost_basis,
        share_type,
        fund,
        net_proceeds,
        unit_cost
    FROM `fcm-dw.acquisition_ssc.ssc_log`
)
SELECT *
FROM ssc_test_view
试试这个:
-- Keep exactly one row per alpha_ssc_key: the one with the latest file_date.
-- Use this as the final SELECT after the WITH clause that defines ssc_test_view.
-- The view exposes _metadata_created_at only under the alias file_date, so the
-- window has to order by file_date; the source column name is not visible here.
SELECT stv.* EXCEPT (seqnum)  -- drop the helper ranking column from the output
FROM (
    SELECT
        v.*,
        ROW_NUMBER() OVER (
            PARTITION BY v.alpha_ssc_key
            ORDER BY v.file_date DESC
        ) AS seqnum
    FROM ssc_test_view AS v
) AS stv
WHERE stv.seqnum = 1;
DISTINCT 作用于 SELECT 列表中的所有列,而不仅仅是您的键,这就是仍然返回所有行的原因。在您的情况下,最好按组成复合键的字段进行分组,但您需要自行决定去重的具体逻辑(即每个非键列保留哪个值)。下面的示例使用了 MIN,但您可以使用任何适合您需要的聚合函数。
试试下面的查询:
#standardSQL
-- Collapse ssc_log to one row per composite key by grouping on the key's
-- component columns. Each non-key column is reduced with MIN independently,
-- so the values in one output row may originate from different source rows.
WITH ssc_test_view AS (
    SELECT
        -- composite-key components
        date,
        ciq_id,
        quantity,
        cost_basis,
        fund,
        security,
        class,
        inv_type,
        share_type,
        -- remaining columns, one representative value per key
        MIN(_metadata_created_at) AS file_date,
        MIN(realized_gain_loss) AS realized_gain_loss,
        MIN(cusip) AS cusip,
        MIN(acq_txn_no) AS acq_txn_no,
        MIN(acquisition_date) AS acquisition_date,
        MIN(company) AS company,
        MIN(ticker) AS ticker,
        MIN(dis_txn_no) AS dis_txn_no,
        MIN(categorization) AS categorization,
        MIN(transaction_type) AS transaction_type,
        MIN(net_proceeds) AS net_proceeds,
        MIN(unit_cost) AS unit_cost
    FROM `fcm-dw.acquisition_ssc.ssc_log`
    GROUP BY
        date,
        ciq_id,
        quantity,
        cost_basis,
        fund,
        security,
        class,
        inv_type,
        share_type
)
SELECT *
FROM ssc_test_view
我正在尝试根据在查询中创建的列 "alpha_ssc_key" 从表中去除重复项。但是,当我运行这个查询时,它只是返回所有结果(而不是消除 "alpha_ssc_key" 重复的行)。任何帮助将不胜感激!
这是在 BigQuery 中完成的。
-- Question's original attempt: build a composite key (alpha_ssc_key) from
-- several columns and try to de-duplicate on it.
-- DISTINCT applies to the whole select list, not only to alpha_ssc_key, so
-- rows that share a key but differ in any other column are all returned.
WITH ssc_test_view AS (
    SELECT DISTINCT
        CONCAT(
            CAST(date AS STRING),
            ciq_id,
            CAST(quantity AS STRING),
            CAST(cost_basis AS STRING),
            fund,
            security,
            class,
            inv_type,
            share_type
        ) AS alpha_ssc_key,
        _metadata_created_at AS file_date,
        realized_gain_loss,
        cusip,
        acq_txn_no,
        acquisition_date,
        security,
        company,
        ticker,
        ciq_id,
        class,
        inv_type,
        dis_txn_no,
        quantity,
        categorization,
        transaction_type,
        cost_basis,
        share_type,
        fund,
        net_proceeds,
        unit_cost
    FROM `fcm-dw.acquisition_ssc.ssc_log`
)
SELECT *
FROM ssc_test_view
试试这个:
-- Keep exactly one row per alpha_ssc_key: the one with the latest file_date.
-- Use this as the final SELECT after the WITH clause that defines ssc_test_view.
-- The view exposes _metadata_created_at only under the alias file_date, so the
-- window has to order by file_date; the source column name is not visible here.
SELECT stv.* EXCEPT (seqnum)  -- drop the helper ranking column from the output
FROM (
    SELECT
        v.*,
        ROW_NUMBER() OVER (
            PARTITION BY v.alpha_ssc_key
            ORDER BY v.file_date DESC
        ) AS seqnum
    FROM ssc_test_view AS v
) AS stv
WHERE stv.seqnum = 1;
DISTINCT 作用于 SELECT 列表中的所有列,而不仅仅是您的键,这就是仍然返回所有行的原因。在您的情况下,最好按组成复合键的字段进行分组,但您需要自行决定去重的具体逻辑(即每个非键列保留哪个值)。下面的示例使用了 MIN,但您可以使用任何适合您需要的聚合函数。
试试下面的查询:
#standardSQL
-- Collapse ssc_log to one row per composite key by grouping on the key's
-- component columns. Each non-key column is reduced with MIN independently,
-- so the values in one output row may originate from different source rows.
WITH ssc_test_view AS (
    SELECT
        -- composite-key components
        date,
        ciq_id,
        quantity,
        cost_basis,
        fund,
        security,
        class,
        inv_type,
        share_type,
        -- remaining columns, one representative value per key
        MIN(_metadata_created_at) AS file_date,
        MIN(realized_gain_loss) AS realized_gain_loss,
        MIN(cusip) AS cusip,
        MIN(acq_txn_no) AS acq_txn_no,
        MIN(acquisition_date) AS acquisition_date,
        MIN(company) AS company,
        MIN(ticker) AS ticker,
        MIN(dis_txn_no) AS dis_txn_no,
        MIN(categorization) AS categorization,
        MIN(transaction_type) AS transaction_type,
        MIN(net_proceeds) AS net_proceeds,
        MIN(unit_cost) AS unit_cost
    FROM `fcm-dw.acquisition_ssc.ssc_log`
    GROUP BY
        date,
        ciq_id,
        quantity,
        cost_basis,
        fund,
        security,
        class,
        inv_type,
        share_type
)
SELECT *
FROM ssc_test_view