在 BigQuery 中进行 left join 时对多个列做模糊匹配
Fuzzy matching multiple columns in BigQuery when left-joining
例子
-- Sample data plus the naive exact-match join. Every join column must be
-- equal, so people whose names are spelled differently in the two tables
-- (e.g. 'Kay' vs 'Kayla', 'Johnson Jr.' vs 'Johnson') get a NULL playerId.
with
  -- People we want to find a playerId for. Column names come from the first
  -- select of the union, so the remaining rows omit the aliases.
  lhs_table as (
    select
      'Nia' as firstName,
      'Johnson Jr.' as lastName,
      'FEMALE' as gender,
      'UNLV' as school,
      'Mountain West Conference' as conference
    union all select 'Jana', 'Abdullah', 'FEMALE', 'Cincinnati', 'American Athletic Conference'
    union all select 'Kay', 'Sieper', 'FEMALE', 'Loyola Maryland', 'Patriot League'
    union all select 'Alessia', 'Capley', 'FEMALE', 'Presbyterian', 'Big South Conference'
    union all select 'Aaliyah', 'Stanley', 'FEMALE', 'FGCU', 'ASUN Conference'
  ),
  -- Candidate players carrying the playerId we are after.
  rhs_table as (
    select
      1611707 as playerId,
      'Kayla' as firstName,
      'Sieper' as lastName,
      'Loyola Maryland' as teamMarket,
      'Patriot League' as conferenceName
    union all select 1380430, 'Jana', 'Abdullah', 'Cincinnati', 'American Athletic Conference'
    union all select 1234567, 'Mela', 'Aravada', 'Cincinnati', 'American Athletic Conference'
    union all select 1354105, 'Aaliyah', 'Stanley', 'FGCU', 'ASUN Conference'
    union all select 1138439, 'Aaliyah', 'Stanley', 'Emory', 'ASUN Conference'
    union all select 996101, 'Nia', 'Johnson', 'Emmanuel (GA)', 'Conference Carolinas'
    union all select 977605, 'Nia', 'Johnson', 'UNLV', 'Mountain West Conference'
    union all select 1329967, 'Alessia', 'Capley', 'Presbyterian', 'Big South Conference'
    union all select 995234, 'Nia', 'Johnson', 'Delta St.', 'Gulf South Conference'
    union all select 4567890, 'Britney', 'Capley', 'Presbyterian', 'Big South Conference'
  )
select
  rhs.playerId,
  lhs.*
from lhs_table as lhs
left join rhs_table as rhs
  on lhs.firstName = rhs.firstName
  and lhs.lastName = rhs.lastName
  and lhs.school = rhs.teamMarket
  and lhs.conference = rhs.conferenceName
我们希望将 rhs_table 左连接(left join)到 lhs_table 上,以获取 playerId。lhs_table 中的每个人在 rhs_table 中都有对应的行,但连接并不那么简单:
- 对于 Nia Johnson Jr.,rhs_table 中缺少后缀 Jr.
- 对于 Kay Sieper,rhs_table 中使用的是她的全名 Kayla
- 我们希望忽略(即不进行左连接)球队不对的 RHS 球员(Emmanuel 的 Nia、Emory 的 Aaliyah)。
由于这些不匹配,我们需要模糊匹配(fuzzy match)。我们尝试过用 on a.firstName like b.firstName 替换 on a.firstName = b.firstName。请注意,联盟(conference)是两张表之间唯一完全匹配的一列;如果有帮助,我们可以手动确保球队名称一致,尽管这需要一些时间。重要的部分是处理拼写不同的名字。
正确的 5 个 playerId 依次为 977605、1380430、1611707、1329967、1354105。我们能否通过某种模糊匹配的方式得到这些 playerId?
考虑以下方法
-- For every lhs_table row, pick the rhs_table row whose four compared columns
-- are closest by edit distance. lhs_table and rhs_table are the CTEs from the
-- example query; prepend that with-clause to run this statement.
select
  -- limit 1 keeps only the closest candidate, so playerId is a one-element
  -- array. b.playerId is a secondary sort key so that candidates with equal
  -- distance resolve to the same row on every run.
  array_agg(b.playerId order by d, b.playerId limit 1) playerId,
  any_value(a).*
from (
  select
    -- d: sum over the four column pairs of the Levenshtein distance divided
    -- by the length of the longer of the two strings.
    `bqutil.fn.levenshtein`(a.firstName, b.firstName) / greatest(length(a.firstName), length(b.firstName)) +
    `bqutil.fn.levenshtein`(a.lastName, b.lastName) / greatest(length(a.lastName), length(b.lastName)) +
    `bqutil.fn.levenshtein`(a.school, b.teamMarket) / greatest(length(a.school), length(b.teamMarket)) +
    `bqutil.fn.levenshtein`(a.conference, b.conferenceName) / greatest(length(a.conference), length(b.conferenceName)) d,
    a, b
  from lhs_table as a
  -- Every lhs row is scored against every rhs row.
  cross join rhs_table as b
)
-- One output row per distinct lhs row (the whole row serialized as the key).
group by to_json_string(a)
如果应用于您问题中的示例数据,输出中每个人都会匹配到预期的 playerId(977605、1380430、1611707、1329967、1354105)。
您也可以在上述方法的基础上尝试各种变体,例如下面的例子:
-- Variant: score each lhs/rhs pair with a single edit distance over the whole
-- rows rendered as text, instead of summing per-column distances.
-- lhs_table and rhs_table are the CTEs from the example query; prepend that
-- with-clause to run this statement.
select
  -- limit 1 keeps only the closest candidate, so playerId is a one-element
  -- array. b.playerId is a secondary sort key so that candidates with equal
  -- distance resolve to the same row on every run.
  array_agg(b.playerId order by d, b.playerId limit 1) playerId,
  any_value(a).*
from (
  select
    -- format('%t', row) renders every column of the row, so columns present
    -- on only one side (gender in lhs_table, playerId in rhs_table) also
    -- contribute to the distance.
    `bqutil.fn.levenshtein`(format('%t', a), format('%t', b)) d,
    a, b
  from lhs_table as a
  -- Every lhs row is scored against every rhs row.
  cross join rhs_table as b
)
-- One output row per distinct lhs row (the whole row serialized as the key).
group by to_json_string(a)
例子
-- Sample data plus the naive exact-match join. Every join column must be
-- equal, so people whose names are spelled differently in the two tables
-- (e.g. 'Kay' vs 'Kayla', 'Johnson Jr.' vs 'Johnson') get a NULL playerId.
with
  -- People we want to find a playerId for. Column names come from the first
  -- select of the union, so the remaining rows omit the aliases.
  lhs_table as (
    select
      'Nia' as firstName,
      'Johnson Jr.' as lastName,
      'FEMALE' as gender,
      'UNLV' as school,
      'Mountain West Conference' as conference
    union all select 'Jana', 'Abdullah', 'FEMALE', 'Cincinnati', 'American Athletic Conference'
    union all select 'Kay', 'Sieper', 'FEMALE', 'Loyola Maryland', 'Patriot League'
    union all select 'Alessia', 'Capley', 'FEMALE', 'Presbyterian', 'Big South Conference'
    union all select 'Aaliyah', 'Stanley', 'FEMALE', 'FGCU', 'ASUN Conference'
  ),
  -- Candidate players carrying the playerId we are after.
  rhs_table as (
    select
      1611707 as playerId,
      'Kayla' as firstName,
      'Sieper' as lastName,
      'Loyola Maryland' as teamMarket,
      'Patriot League' as conferenceName
    union all select 1380430, 'Jana', 'Abdullah', 'Cincinnati', 'American Athletic Conference'
    union all select 1234567, 'Mela', 'Aravada', 'Cincinnati', 'American Athletic Conference'
    union all select 1354105, 'Aaliyah', 'Stanley', 'FGCU', 'ASUN Conference'
    union all select 1138439, 'Aaliyah', 'Stanley', 'Emory', 'ASUN Conference'
    union all select 996101, 'Nia', 'Johnson', 'Emmanuel (GA)', 'Conference Carolinas'
    union all select 977605, 'Nia', 'Johnson', 'UNLV', 'Mountain West Conference'
    union all select 1329967, 'Alessia', 'Capley', 'Presbyterian', 'Big South Conference'
    union all select 995234, 'Nia', 'Johnson', 'Delta St.', 'Gulf South Conference'
    union all select 4567890, 'Britney', 'Capley', 'Presbyterian', 'Big South Conference'
  )
select
  rhs.playerId,
  lhs.*
from lhs_table as lhs
left join rhs_table as rhs
  on lhs.firstName = rhs.firstName
  and lhs.lastName = rhs.lastName
  and lhs.school = rhs.teamMarket
  and lhs.conference = rhs.conferenceName
我们希望将 rhs_table 左连接(left join)到 lhs_table 上,以获取 playerId。lhs_table 中的每个人在 rhs_table 中都有对应的行,但连接并不那么简单:
- 对于 Nia Johnson Jr.,rhs_table 中缺少后缀 Jr.
- 对于 Kay Sieper,rhs_table 中使用的是她的全名 Kayla
- 我们希望忽略(即不进行左连接)球队不对的 RHS 球员(Emmanuel 的 Nia、Emory 的 Aaliyah)。
由于这些不匹配,我们需要模糊匹配(fuzzy match)。我们尝试过用 on a.firstName like b.firstName 替换 on a.firstName = b.firstName。请注意,联盟(conference)是两张表之间唯一完全匹配的一列;如果有帮助,我们可以手动确保球队名称一致,尽管这需要一些时间。重要的部分是处理拼写不同的名字。
正确的 5 个 playerId 依次为 977605、1380430、1611707、1329967、1354105。我们能否通过某种模糊匹配的方式得到这些 playerId?
考虑以下方法
-- For every lhs_table row, pick the rhs_table row whose four compared columns
-- are closest by edit distance. lhs_table and rhs_table are the CTEs from the
-- example query; prepend that with-clause to run this statement.
select
  -- limit 1 keeps only the closest candidate, so playerId is a one-element
  -- array. b.playerId is a secondary sort key so that candidates with equal
  -- distance resolve to the same row on every run.
  array_agg(b.playerId order by d, b.playerId limit 1) playerId,
  any_value(a).*
from (
  select
    -- d: sum over the four column pairs of the Levenshtein distance divided
    -- by the length of the longer of the two strings.
    `bqutil.fn.levenshtein`(a.firstName, b.firstName) / greatest(length(a.firstName), length(b.firstName)) +
    `bqutil.fn.levenshtein`(a.lastName, b.lastName) / greatest(length(a.lastName), length(b.lastName)) +
    `bqutil.fn.levenshtein`(a.school, b.teamMarket) / greatest(length(a.school), length(b.teamMarket)) +
    `bqutil.fn.levenshtein`(a.conference, b.conferenceName) / greatest(length(a.conference), length(b.conferenceName)) d,
    a, b
  from lhs_table as a
  -- Every lhs row is scored against every rhs row.
  cross join rhs_table as b
)
-- One output row per distinct lhs row (the whole row serialized as the key).
group by to_json_string(a)
如果应用于您问题中的示例数据,输出中每个人都会匹配到预期的 playerId(977605、1380430、1611707、1329967、1354105)。
您也可以在上述方法的基础上尝试各种变体,例如下面的例子:
-- Variant: score each lhs/rhs pair with a single edit distance over the whole
-- rows rendered as text, instead of summing per-column distances.
-- lhs_table and rhs_table are the CTEs from the example query; prepend that
-- with-clause to run this statement.
select
  -- limit 1 keeps only the closest candidate, so playerId is a one-element
  -- array. b.playerId is a secondary sort key so that candidates with equal
  -- distance resolve to the same row on every run.
  array_agg(b.playerId order by d, b.playerId limit 1) playerId,
  any_value(a).*
from (
  select
    -- format('%t', row) renders every column of the row, so columns present
    -- on only one side (gender in lhs_table, playerId in rhs_table) also
    -- contribute to the distance.
    `bqutil.fn.levenshtein`(format('%t', a), format('%t', b)) d,
    a, b
  from lhs_table as a
  -- Every lhs row is scored against every rhs row.
  cross join rhs_table as b
)
-- One output row per distinct lhs row (the whole row serialized as the key).
group by to_json_string(a)