标准 SQL:将显式交叉连接重写为 WITH 子句
Standard SQL: rewrite an explicit cross join to WITH clause
考虑一张表 transactions,它有两个嵌套的重复字段(结构体数组,原文称之为 JSONB 字段)outputs 和 inputs。
问题是如何使用 WITH 子句重写此查询?
-- Note: This query will process 111.85 MB when run.
--
-- Flattens each transaction into one row per (output, input) pair by cross
-- joining the transaction row with its own `outputs` and `inputs` arrays.
-- The FROM clause below is the explicit form of the comma-join shorthand:
--   FROM `bigquery-public-data.crypto_bitcoin.transactions` AS transactions,
--     transactions.outputs AS outputs,
--     transactions.inputs AS inputs
SELECT
  -- Transaction columns exposed under the output-side ("created") names.
  tx.hash AS CREATED_TX_HASH,
  tx.block_number AS CREATED_BLOCK_ID,
  tx.block_timestamp AS CREATED_BLOCK_TIME,
  tx_output.index AS CREATED_INDEX,
  -- value / 1e8: presumably converts satoshis to BTC (per the *_BTC alias);
  -- confirm against the dataset schema.
  tx_output.value / 1e8 AS OUTPUT_VALUE_BTC,
  -- The same transaction columns repeated under the input-side names.
  tx.hash AS SPENT_CREATED_TX_HASH,
  tx.block_number AS SPENDING_BLOCK_ID,
  tx.block_timestamp AS SPENDING_BLOCK_TIME,
  tx_input.index AS SPENT_CREATED_INDEX,
  tx_input.spent_transaction_hash AS SPENDING_TX_HASH,
  tx_input.spent_output_index AS SPENDING_INDEX,
  tx_input.value / 1e8 AS INPUT_VALUE_BTC
FROM `bigquery-public-data.crypto_bitcoin.transactions` AS tx
CROSS JOIN UNNEST(tx.outputs) AS tx_output
CROSS JOIN UNNEST(tx.inputs) AS tx_input
WHERE tx.block_timestamp_month < '2009-02-01'
-- CREATED_BLOCK_TIME is the third select-list column (formerly `ORDER BY 3`).
ORDER BY CREATED_BLOCK_TIME
我需要创建 CTE 以保留如下临时结果集:
-- CTE fragment only: a final SELECT over `outputs` and `inputs` must follow.
WITH
  -- One row per element of each transaction's `outputs` array.
  outputs AS (
    SELECT
      tx.hash AS CREATED_TX_HASH,
      tx.block_number AS CREATED_BLOCK_ID,
      tx.block_timestamp AS CREATED_BLOCK_TIME,
      tx_output.index AS CREATED_INDEX,
      tx_output.value / 1e8 AS OUTPUT_VALUE_BTC
    FROM `bigquery-public-data.crypto_bitcoin.transactions` AS tx
    CROSS JOIN UNNEST(tx.outputs) AS tx_output
    WHERE tx.block_timestamp_month < '2009-02-01'
  ),
  -- One row per element of each transaction's `inputs` array.
  inputs AS (
    SELECT
      tx.hash AS SPENT_CREATED_TX_HASH,
      tx.block_number AS SPENDING_BLOCK_ID,
      tx.block_timestamp AS SPENDING_BLOCK_TIME,
      tx_input.index AS SPENT_CREATED_INDEX,
      tx_input.spent_transaction_hash AS SPENDING_TX_HASH,
      tx_input.spent_output_index AS SPENDING_INDEX,
      tx_input.value / 1e8 AS INPUT_VALUE_BTC
    FROM `bigquery-public-data.crypto_bitcoin.transactions` AS tx
    CROSS JOIN UNNEST(tx.inputs) AS tx_input
    WHERE tx.block_timestamp_month < '2009-02-01'
  )
但我不知道应该对这两个 CTE 编写怎样的 SELECT 语句,才能得到与上面原始查询相同的结果。
您需要通过 CREATED_BLOCK_ID 和 SPENDING_BLOCK_ID 将它们连接(JOIN)起来,另外我使用了 ROW_NUMBER 窗口函数来避免重复值。
以下查询应该适合您:
-- Rebuilds the result of the single-table cross-join query from two CTEs:
-- one row per (output, input) pair of the same transaction.
WITH
  -- One row per element of each transaction's `outputs` array.
  outputs AS (
    SELECT
      tx.hash AS CREATED_TX_HASH,
      tx.block_number AS CREATED_BLOCK_ID,
      tx.block_timestamp AS CREATED_BLOCK_TIME,
      tx_output.index AS CREATED_INDEX,
      tx_output.value / 1e8 AS OUTPUT_VALUE_BTC
    FROM `bigquery-public-data.crypto_bitcoin.transactions` AS tx
    CROSS JOIN UNNEST(tx.outputs) AS tx_output
    WHERE tx.block_timestamp_month < '2009-02-01'
  ),
  -- One row per element of each transaction's `inputs` array.
  inputs AS (
    SELECT
      tx.hash AS SPENT_CREATED_TX_HASH,
      tx.block_number AS SPENDING_BLOCK_ID,
      tx.block_timestamp AS SPENDING_BLOCK_TIME,
      tx_input.index AS SPENT_CREATED_INDEX,
      tx_input.spent_transaction_hash AS SPENDING_TX_HASH,
      tx_input.spent_output_index AS SPENDING_INDEX,
      tx_input.value / 1e8 AS INPUT_VALUE_BTC
    FROM `bigquery-public-data.crypto_bitcoin.transactions` AS tx
    CROSS JOIN UNNEST(tx.inputs) AS tx_input
    WHERE tx.block_timestamp_month < '2009-02-01'
  ),
  -- Pair outputs with inputs of the same block AND the same transaction.
  -- The hash equality lives in the join condition so the join never builds
  -- every output x input combination of a whole block before filtering.
  paired AS (
    SELECT
      o.CREATED_TX_HASH,
      o.CREATED_BLOCK_ID,
      o.CREATED_BLOCK_TIME,
      o.CREATED_INDEX,
      o.OUTPUT_VALUE_BTC,
      i.SPENT_CREATED_TX_HASH,
      i.SPENDING_BLOCK_ID,
      i.SPENDING_BLOCK_TIME,
      i.SPENT_CREATED_INDEX,
      i.SPENDING_TX_HASH,
      i.SPENDING_INDEX,
      i.INPUT_VALUE_BTC,
      -- Dedup guard: numbers rows that share the same block, transaction and
      -- output/input index so only one row per such combination is kept.
      ROW_NUMBER() OVER (
        PARTITION BY
          o.CREATED_BLOCK_ID,
          o.CREATED_INDEX,
          i.SPENDING_BLOCK_ID,
          i.SPENT_CREATED_INDEX,
          o.CREATED_TX_HASH,
          i.SPENT_CREATED_TX_HASH
        ORDER BY o.CREATED_BLOCK_TIME DESC
      ) AS row_num
    FROM outputs AS o
    INNER JOIN inputs AS i
      ON o.CREATED_BLOCK_ID = i.SPENDING_BLOCK_ID
      AND o.CREATED_TX_HASH = i.SPENT_CREATED_TX_HASH
  )
-- Explicit column list: same 12 columns, in the same order, as the
-- cross-join query; the helper column row_num is not exposed.
SELECT
  CREATED_TX_HASH,
  CREATED_BLOCK_ID,
  CREATED_BLOCK_TIME,
  CREATED_INDEX,
  OUTPUT_VALUE_BTC,
  SPENT_CREATED_TX_HASH,
  SPENDING_BLOCK_ID,
  SPENDING_BLOCK_TIME,
  SPENT_CREATED_INDEX,
  SPENDING_TX_HASH,
  SPENDING_INDEX,
  INPUT_VALUE_BTC
FROM paired
WHERE row_num = 1
-- Ordering is applied on the outermost query so it actually reaches the
-- result, matching the cross-join query's sort on CREATED_BLOCK_TIME.
ORDER BY CREATED_BLOCK_TIME
输出如下:
最后,我建议您使用 CROSS JOIN 查询,因为这种写法比使用 WITH 子句的子查询具有更好的性能。
考虑一张表 transactions,它有两个嵌套的重复字段(结构体数组,原文称之为 JSONB 字段)outputs 和 inputs。
问题是如何使用 WITH 子句重写此查询?
-- Note: This query will process 111.85 MB when run.
--
-- Flattens each transaction into one row per (output, input) pair by cross
-- joining the transaction row with its own `outputs` and `inputs` arrays.
-- The FROM clause below is the explicit form of the comma-join shorthand:
--   FROM `bigquery-public-data.crypto_bitcoin.transactions` AS transactions,
--     transactions.outputs AS outputs,
--     transactions.inputs AS inputs
SELECT
  -- Transaction columns exposed under the output-side ("created") names.
  tx.hash AS CREATED_TX_HASH,
  tx.block_number AS CREATED_BLOCK_ID,
  tx.block_timestamp AS CREATED_BLOCK_TIME,
  tx_output.index AS CREATED_INDEX,
  -- value / 1e8: presumably converts satoshis to BTC (per the *_BTC alias);
  -- confirm against the dataset schema.
  tx_output.value / 1e8 AS OUTPUT_VALUE_BTC,
  -- The same transaction columns repeated under the input-side names.
  tx.hash AS SPENT_CREATED_TX_HASH,
  tx.block_number AS SPENDING_BLOCK_ID,
  tx.block_timestamp AS SPENDING_BLOCK_TIME,
  tx_input.index AS SPENT_CREATED_INDEX,
  tx_input.spent_transaction_hash AS SPENDING_TX_HASH,
  tx_input.spent_output_index AS SPENDING_INDEX,
  tx_input.value / 1e8 AS INPUT_VALUE_BTC
FROM `bigquery-public-data.crypto_bitcoin.transactions` AS tx
CROSS JOIN UNNEST(tx.outputs) AS tx_output
CROSS JOIN UNNEST(tx.inputs) AS tx_input
WHERE tx.block_timestamp_month < '2009-02-01'
-- CREATED_BLOCK_TIME is the third select-list column (formerly `ORDER BY 3`).
ORDER BY CREATED_BLOCK_TIME
我需要创建 CTE 以保留如下临时结果集:
-- CTE fragment only: a final SELECT over `outputs` and `inputs` must follow.
WITH
  -- One row per element of each transaction's `outputs` array.
  outputs AS (
    SELECT
      tx.hash AS CREATED_TX_HASH,
      tx.block_number AS CREATED_BLOCK_ID,
      tx.block_timestamp AS CREATED_BLOCK_TIME,
      tx_output.index AS CREATED_INDEX,
      tx_output.value / 1e8 AS OUTPUT_VALUE_BTC
    FROM `bigquery-public-data.crypto_bitcoin.transactions` AS tx
    CROSS JOIN UNNEST(tx.outputs) AS tx_output
    WHERE tx.block_timestamp_month < '2009-02-01'
  ),
  -- One row per element of each transaction's `inputs` array.
  inputs AS (
    SELECT
      tx.hash AS SPENT_CREATED_TX_HASH,
      tx.block_number AS SPENDING_BLOCK_ID,
      tx.block_timestamp AS SPENDING_BLOCK_TIME,
      tx_input.index AS SPENT_CREATED_INDEX,
      tx_input.spent_transaction_hash AS SPENDING_TX_HASH,
      tx_input.spent_output_index AS SPENDING_INDEX,
      tx_input.value / 1e8 AS INPUT_VALUE_BTC
    FROM `bigquery-public-data.crypto_bitcoin.transactions` AS tx
    CROSS JOIN UNNEST(tx.inputs) AS tx_input
    WHERE tx.block_timestamp_month < '2009-02-01'
  )
但我不知道应该对这两个 CTE 编写怎样的 SELECT 语句,才能得到与上面原始查询相同的结果。
您需要通过 CREATED_BLOCK_ID 和 SPENDING_BLOCK_ID 将它们连接(JOIN)起来,另外我使用了 ROW_NUMBER 窗口函数来避免重复值。
以下查询应该适合您:
-- Rebuilds the result of the single-table cross-join query from two CTEs:
-- one row per (output, input) pair of the same transaction.
WITH
  -- One row per element of each transaction's `outputs` array.
  outputs AS (
    SELECT
      tx.hash AS CREATED_TX_HASH,
      tx.block_number AS CREATED_BLOCK_ID,
      tx.block_timestamp AS CREATED_BLOCK_TIME,
      tx_output.index AS CREATED_INDEX,
      tx_output.value / 1e8 AS OUTPUT_VALUE_BTC
    FROM `bigquery-public-data.crypto_bitcoin.transactions` AS tx
    CROSS JOIN UNNEST(tx.outputs) AS tx_output
    WHERE tx.block_timestamp_month < '2009-02-01'
  ),
  -- One row per element of each transaction's `inputs` array.
  inputs AS (
    SELECT
      tx.hash AS SPENT_CREATED_TX_HASH,
      tx.block_number AS SPENDING_BLOCK_ID,
      tx.block_timestamp AS SPENDING_BLOCK_TIME,
      tx_input.index AS SPENT_CREATED_INDEX,
      tx_input.spent_transaction_hash AS SPENDING_TX_HASH,
      tx_input.spent_output_index AS SPENDING_INDEX,
      tx_input.value / 1e8 AS INPUT_VALUE_BTC
    FROM `bigquery-public-data.crypto_bitcoin.transactions` AS tx
    CROSS JOIN UNNEST(tx.inputs) AS tx_input
    WHERE tx.block_timestamp_month < '2009-02-01'
  ),
  -- Pair outputs with inputs of the same block AND the same transaction.
  -- The hash equality lives in the join condition so the join never builds
  -- every output x input combination of a whole block before filtering.
  paired AS (
    SELECT
      o.CREATED_TX_HASH,
      o.CREATED_BLOCK_ID,
      o.CREATED_BLOCK_TIME,
      o.CREATED_INDEX,
      o.OUTPUT_VALUE_BTC,
      i.SPENT_CREATED_TX_HASH,
      i.SPENDING_BLOCK_ID,
      i.SPENDING_BLOCK_TIME,
      i.SPENT_CREATED_INDEX,
      i.SPENDING_TX_HASH,
      i.SPENDING_INDEX,
      i.INPUT_VALUE_BTC,
      -- Dedup guard: numbers rows that share the same block, transaction and
      -- output/input index so only one row per such combination is kept.
      ROW_NUMBER() OVER (
        PARTITION BY
          o.CREATED_BLOCK_ID,
          o.CREATED_INDEX,
          i.SPENDING_BLOCK_ID,
          i.SPENT_CREATED_INDEX,
          o.CREATED_TX_HASH,
          i.SPENT_CREATED_TX_HASH
        ORDER BY o.CREATED_BLOCK_TIME DESC
      ) AS row_num
    FROM outputs AS o
    INNER JOIN inputs AS i
      ON o.CREATED_BLOCK_ID = i.SPENDING_BLOCK_ID
      AND o.CREATED_TX_HASH = i.SPENT_CREATED_TX_HASH
  )
-- Explicit column list: same 12 columns, in the same order, as the
-- cross-join query; the helper column row_num is not exposed.
SELECT
  CREATED_TX_HASH,
  CREATED_BLOCK_ID,
  CREATED_BLOCK_TIME,
  CREATED_INDEX,
  OUTPUT_VALUE_BTC,
  SPENT_CREATED_TX_HASH,
  SPENDING_BLOCK_ID,
  SPENDING_BLOCK_TIME,
  SPENT_CREATED_INDEX,
  SPENDING_TX_HASH,
  SPENDING_INDEX,
  INPUT_VALUE_BTC
FROM paired
WHERE row_num = 1
-- Ordering is applied on the outermost query so it actually reaches the
-- result, matching the cross-join query's sort on CREATED_BLOCK_TIME.
ORDER BY CREATED_BLOCK_TIME
输出如下:
最后,我建议您使用 CROSS JOIN 查询,因为这种写法比使用 WITH 子句的子查询具有更好的性能。