按最接近键的值进行 JOIN
JOIN by closer value to key
使用以下示例数据:
-- Sample fixture for the "closest year within the same shard" lookup.
--   values: candidate rows (shard, year, value); shards 1, 2 and 4 are present.
--   data:   lookup keys (id, shard, year); shards 1, 2 and 3 for ids 1 and 2.
-- Shard 3 has no candidate in values and shard 4 is never looked up, so both
-- the "no match" and the "unused candidate" cases are covered.
-- No ORDER BY inside the CTE: the row order of a CTE is not guaranteed to
-- reach the consuming query, and some engines reject ORDER BY there.
-- NOTE(review): 20012 looks like a typo for 2012, but 2012 would tie with 2008
-- (both 2 years from 2010) for the shard-1 lookup; kept as posted -- confirm.
-- NOTE(review): VALUES is a keyword in standard SQL; quote the CTE name if
-- your engine rejects it -- verify for your dialect.
WITH values AS (
    SELECT 1 AS shard, 2008 AS year, 1 AS value
    UNION ALL
    SELECT 1 AS shard, 20012 AS year, 2 AS value
    UNION ALL
    SELECT 2 AS shard, 2011 AS year, 3 AS value
    UNION ALL
    SELECT 2 AS shard, 1998 AS year, 4 AS value
    UNION ALL
    SELECT 2 AS shard, 2001 AS year, 5 AS value
    UNION ALL
    SELECT 4 AS shard, 1990 AS year, 6 AS value
),
data AS (
    SELECT 1 AS id, 1 AS shard, 2010 AS year
    UNION ALL
    SELECT 1 AS id, 2 AS shard, 2000 AS year
    UNION ALL
    SELECT 1 AS id, 3 AS shard, 1990 AS year
    UNION ALL
    SELECT 2 AS id, 1 AS shard, 2010 AS year
    UNION ALL
    SELECT 2 AS id, 2 AS shard, 2000 AS year
    UNION ALL
    SELECT 2 AS id, 3 AS shard, 1990 AS year
)
我想把 data 集合与存放取值的 values 集合做 JOIN。data 中有一个 id 字段用来区分各个进程,因此需要对每个 id 分别执行 JOIN。此外,JOIN 使用由 shard 和 year 两个字段组成的复合键:对于 data 中的每一条记录,我想在 values 集合中 shard 相同的记录里,取出 year 最接近的那一条的值。
我写出了下面这段代码,但结果不符合预期:它没有考虑 values.shard 字段,不管记录属于哪个分片,都会拿所有年份来匹配。
-- First attempt from the question, reproduced with its original behavior.
-- Every data row is paired with every values row (no shard condition), the
-- pairs are ranked per (id, shard) by distance between the two years, and
-- only the closest pair is kept. Because values.shard is never compared,
-- the winning year can come from any shard.
SELECT *
FROM (
    SELECT
        data.id,
        data.year,
        values.year AS closer_year,
        ABS(data.year - values.year) AS diff,
        values.value,
        ROW_NUMBER() OVER (
            PARTITION BY data.id, data.shard
            ORDER BY ABS(data.year - values.year)
        ) AS rn
    FROM data
    CROSS JOIN values
)
WHERE rn = 1
对于示例数据,预期输出应为:
id year closer_year diff value rn
1 2010 2008 2 1 1
1 2000 2001 1 5 1
1 1990 null null null 1
2 2010 2008 2 1 1
2 2000 2001 1 5 1
2 1990 null null null 1
我错过了什么?
我在发布问题后发现了我所缺少的东西。如果有人有类似的用例,我会回答。
重读问题时,我意识到自己漏掉的“分片匹配”条件其实就是一个 LEFT JOIN 的连接条件,所以把查询改写成下面这样就解决了问题:
-- For every data row, return the values row of the same shard whose year is
-- closest to data.year.
-- The LEFT JOIN on shard limits candidates to the matching shard while still
-- keeping data rows whose shard has no candidate (shard 3 in the sample
-- data); those rows come back with NULL closer_year / diff / value and rn = 1.
-- Output columns: id, year, closer_year, diff, value, rn (rn is always 1).
SELECT
    id,
    year,
    closer_year,
    diff,
    value,
    rn
FROM (
    SELECT
        data.id,
        data.year,
        values.year AS closer_year,
        ABS(data.year - values.year) AS diff,
        values.value,
        ROW_NUMBER() OVER (
            PARTITION BY data.id, data.shard
            -- values.year breaks ties between equally distant years (the
            -- earlier year wins), so the pick is deterministic as long as
            -- (shard, year) is unique in values.
            ORDER BY ABS(data.year - values.year), values.year
        ) AS rn
    FROM data
    LEFT JOIN values
        ON data.shard = values.shard
) AS ranked
WHERE rn = 1
使用以下示例数据:
-- Sample fixture for the "closest year within the same shard" lookup.
--   values: candidate rows (shard, year, value); shards 1, 2 and 4 are present.
--   data:   lookup keys (id, shard, year); shards 1, 2 and 3 for ids 1 and 2.
-- Shard 3 has no candidate in values and shard 4 is never looked up, so both
-- the "no match" and the "unused candidate" cases are covered.
-- No ORDER BY inside the CTE: the row order of a CTE is not guaranteed to
-- reach the consuming query, and some engines reject ORDER BY there.
-- NOTE(review): 20012 looks like a typo for 2012, but 2012 would tie with 2008
-- (both 2 years from 2010) for the shard-1 lookup; kept as posted -- confirm.
-- NOTE(review): VALUES is a keyword in standard SQL; quote the CTE name if
-- your engine rejects it -- verify for your dialect.
WITH values AS (
    SELECT 1 AS shard, 2008 AS year, 1 AS value
    UNION ALL
    SELECT 1 AS shard, 20012 AS year, 2 AS value
    UNION ALL
    SELECT 2 AS shard, 2011 AS year, 3 AS value
    UNION ALL
    SELECT 2 AS shard, 1998 AS year, 4 AS value
    UNION ALL
    SELECT 2 AS shard, 2001 AS year, 5 AS value
    UNION ALL
    SELECT 4 AS shard, 1990 AS year, 6 AS value
),
data AS (
    SELECT 1 AS id, 1 AS shard, 2010 AS year
    UNION ALL
    SELECT 1 AS id, 2 AS shard, 2000 AS year
    UNION ALL
    SELECT 1 AS id, 3 AS shard, 1990 AS year
    UNION ALL
    SELECT 2 AS id, 1 AS shard, 2010 AS year
    UNION ALL
    SELECT 2 AS id, 2 AS shard, 2000 AS year
    UNION ALL
    SELECT 2 AS id, 3 AS shard, 1990 AS year
)
我想把 data 集合与存放取值的 values 集合做 JOIN。data 中有一个 id 字段用来区分各个进程,因此需要对每个 id 分别执行 JOIN。此外,JOIN 使用由 shard 和 year 两个字段组成的复合键:对于 data 中的每一条记录,我想在 values 集合中 shard 相同的记录里,取出 year 最接近的那一条的值。
我写出了下面这段代码,但结果不符合预期:它没有考虑 values.shard 字段,不管记录属于哪个分片,都会拿所有年份来匹配。
-- First attempt from the question, reproduced with its original behavior.
-- Every data row is paired with every values row (no shard condition), the
-- pairs are ranked per (id, shard) by distance between the two years, and
-- only the closest pair is kept. Because values.shard is never compared,
-- the winning year can come from any shard.
SELECT *
FROM (
    SELECT
        data.id,
        data.year,
        values.year AS closer_year,
        ABS(data.year - values.year) AS diff,
        values.value,
        ROW_NUMBER() OVER (
            PARTITION BY data.id, data.shard
            ORDER BY ABS(data.year - values.year)
        ) AS rn
    FROM data
    CROSS JOIN values
)
WHERE rn = 1
对于示例数据,预期输出应为:
id year closer_year diff value rn
1 2010 2008 2 1 1
1 2000 2001 1 5 1
1 1990 null null null 1
2 2010 2008 2 1 1
2 2000 2001 1 5 1
2 1990 null null null 1
我错过了什么?
我在发布问题后发现了我所缺少的东西。如果有人有类似的用例,我会回答。
重读问题时,我意识到自己漏掉的“分片匹配”条件其实就是一个 LEFT JOIN 的连接条件,所以把查询改写成下面这样就解决了问题:
-- For every data row, return the values row of the same shard whose year is
-- closest to data.year.
-- The LEFT JOIN on shard limits candidates to the matching shard while still
-- keeping data rows whose shard has no candidate (shard 3 in the sample
-- data); those rows come back with NULL closer_year / diff / value and rn = 1.
-- Output columns: id, year, closer_year, diff, value, rn (rn is always 1).
SELECT
    id,
    year,
    closer_year,
    diff,
    value,
    rn
FROM (
    SELECT
        data.id,
        data.year,
        values.year AS closer_year,
        ABS(data.year - values.year) AS diff,
        values.value,
        ROW_NUMBER() OVER (
            PARTITION BY data.id, data.shard
            -- values.year breaks ties between equally distant years (the
            -- earlier year wins), so the pick is deterministic as long as
            -- (shard, year) is unique in values.
            ORDER BY ABS(data.year - values.year), values.year
        ) AS rn
    FROM data
    LEFT JOIN values
        ON data.shard = values.shard
) AS ranked
WHERE rn = 1