在 R 中按不精确匹配的字符串进行连接
joining on inexact strings in R
我想连接(join)两个表,但用作连接键的数据并不完全匹配——连接键是 NFL 球员姓名。
以下数据集..
> dput(att75a)
structure(list(rusher_player_name = c("A.Ekeler", "A.Jones",
"A.Kamara", "A.Mattison", "A.Peterson", "B.Hill"), mean_epa = c(-0.110459963350783,
0.0334332018597805, -0.119488111742492, -0.155261835310445, -0.123485646124451,
-0.0689611296359916), success_rate = c(0.357664233576642, 0.40495867768595,
0.401129943502825, 0.283018867924528, 0.322727272727273, 0.35
), plays = c(137L, 242L, 177L, 106L, 220L, 80L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L))
> dput(rb2019capa)
structure(list(rusher_player_name = c("Aaron Jones", "Adrian Peterson",
"Alexander Mattison", "Alvin Kamara", "Austin Ekeler", "Brian Hill"
), Team = c("Packers", "Redskins", "Vikings", "Saints", "Chargers",
"Falcons"), `Salary Cap Value` = c(695487, 1780000, 700545, 1050693,
646668, 645000), `Cash Spent` = c(645000, 2530000, 1317180, 807500,
645000, 645000)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))
例如,我想把 A.Mattison 与 Alexander Mattison 匹配起来,其余依此类推。
我尝试了 stringdist 和 fuzzyjoin 但无法解决我的问题..
请注意:按照提问指南,我只贴出了每个数据集的 head()。原始数据集分别有 51 条和 168 条观测——这会影响连接的执行方式吗?
清理这些名称的最佳方法是什么?
感谢您的宝贵时间..
把点替换为 %,构成 SQL 的 LIKE 模式,然后按模式匹配进行连接。
library(sqldf)
# Left-join att75a to rb2019capa by pattern match instead of equality:
# replace() rewrites each abbreviated name ("A.Ekeler") into a LIKE
# pattern ("A%Ekeler"), where % matches any run of characters, so it
# lines up with the full name ("Austin Ekeler").
sqldf(
  x = "select *
from att75a a
left join rb2019capa r
on r.rusher_player_name like replace(a.rusher_player_name, '.', '%')"
)
结果:
rusher_player_name mean_epa success_rate plays rusher_player_name..5
1 A.Ekeler -0.11045996 0.3576642 137 Austin Ekeler
2 A.Jones 0.03343320 0.4049587 242 Aaron Jones
3 A.Kamara -0.11948811 0.4011299 177 Alvin Kamara
4 A.Mattison -0.15526184 0.2830189 106 Alexander Mattison
5 A.Peterson -0.12348565 0.3227273 220 Adrian Peterson
6 B.Hill -0.06896113 0.3500000 80 Brian Hill
Team Salary Cap Value Cash Spent
1 Chargers 646668 645000
2 Packers 695487 645000
3 Saints 1050693 807500
4 Vikings 700545 1317180
5 Redskins 1780000 2530000
6 Falcons 645000 645000
使用 sub
将名字替换为首字母。
library(dplyr)
# Collapse each full name ("Aaron Jones") to the "A.Jones" form used in
# att75a, then join the two tables on the now-identical key.
#
# Regex: keep the leading capital (group 1), drop the rest of the first
# name and the whitespace after it, keep everything that follows (group 2).
# Backslashes must be doubled inside an R string literal: a single "\S" or
# "\s" is an unrecognized escape (parse error), and a single "\1" would be
# read as an octal character escape rather than a backreference.
rb2019capa %>%
  mutate(
    rusher_player_name = sub(
      "^([A-Z])\\S+\\s([A-Za-z].*)$", "\\1.\\2", rusher_player_name
    )
  ) %>%
  inner_join(att75a, by = "rusher_player_name") # or left_join (up to you)
# A tibble: 6 x 7
rusher_player_name Team `Salary Cap Value` `Cash Spent` mean_epa success_rate plays
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <int>
1 A.Jones Packers 695487 645000 0.0334 0.405 242
2 A.Peterson Redskins 1780000 2530000 -0.123 0.323 220
3 A.Mattison Vikings 700545 1317180 -0.155 0.283 106
4 A.Kamara Saints 1050693 807500 -0.119 0.401 177
5 A.Ekeler Chargers 646668 645000 -0.110 0.358 137
6 B.Hill Falcons 645000 645000 -0.0690 0.35 80
我想连接(join)两个表,但用作连接键的数据并不完全匹配——连接键是 NFL 球员姓名。
以下数据集..
> dput(att75a)
structure(list(rusher_player_name = c("A.Ekeler", "A.Jones",
"A.Kamara", "A.Mattison", "A.Peterson", "B.Hill"), mean_epa = c(-0.110459963350783,
0.0334332018597805, -0.119488111742492, -0.155261835310445, -0.123485646124451,
-0.0689611296359916), success_rate = c(0.357664233576642, 0.40495867768595,
0.401129943502825, 0.283018867924528, 0.322727272727273, 0.35
), plays = c(137L, 242L, 177L, 106L, 220L, 80L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L))
> dput(rb2019capa)
structure(list(rusher_player_name = c("Aaron Jones", "Adrian Peterson",
"Alexander Mattison", "Alvin Kamara", "Austin Ekeler", "Brian Hill"
), Team = c("Packers", "Redskins", "Vikings", "Saints", "Chargers",
"Falcons"), `Salary Cap Value` = c(695487, 1780000, 700545, 1050693,
646668, 645000), `Cash Spent` = c(645000, 2530000, 1317180, 807500,
645000, 645000)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))
例如,我想把 A.Mattison 与 Alexander Mattison 匹配起来,其余依此类推。
我尝试了 stringdist 和 fuzzyjoin 但无法解决我的问题..
请注意:按照提问指南,我只贴出了每个数据集的 head()。原始数据集分别有 51 条和 168 条观测——这会影响连接的执行方式吗?
清理这些名称的最佳方法是什么?
感谢您的宝贵时间..
把点替换为 %,构成 SQL 的 LIKE 模式,然后按模式匹配进行连接。
library(sqldf)
# Left-join att75a to rb2019capa by pattern match instead of equality:
# replace() rewrites each abbreviated name ("A.Ekeler") into a LIKE
# pattern ("A%Ekeler"), where % matches any run of characters, so it
# lines up with the full name ("Austin Ekeler").
sqldf(
  x = "select *
from att75a a
left join rb2019capa r
on r.rusher_player_name like replace(a.rusher_player_name, '.', '%')"
)
结果:
rusher_player_name mean_epa success_rate plays rusher_player_name..5
1 A.Ekeler -0.11045996 0.3576642 137 Austin Ekeler
2 A.Jones 0.03343320 0.4049587 242 Aaron Jones
3 A.Kamara -0.11948811 0.4011299 177 Alvin Kamara
4 A.Mattison -0.15526184 0.2830189 106 Alexander Mattison
5 A.Peterson -0.12348565 0.3227273 220 Adrian Peterson
6 B.Hill -0.06896113 0.3500000 80 Brian Hill
Team Salary Cap Value Cash Spent
1 Chargers 646668 645000
2 Packers 695487 645000
3 Saints 1050693 807500
4 Vikings 700545 1317180
5 Redskins 1780000 2530000
6 Falcons 645000 645000
使用 sub
将名字替换为首字母。
library(dplyr)
# Collapse each full name ("Aaron Jones") to the "A.Jones" form used in
# att75a, then join the two tables on the now-identical key.
#
# Regex: keep the leading capital (group 1), drop the rest of the first
# name and the whitespace after it, keep everything that follows (group 2).
# Backslashes must be doubled inside an R string literal: a single "\S" or
# "\s" is an unrecognized escape (parse error), and a single "\1" would be
# read as an octal character escape rather than a backreference.
rb2019capa %>%
  mutate(
    rusher_player_name = sub(
      "^([A-Z])\\S+\\s([A-Za-z].*)$", "\\1.\\2", rusher_player_name
    )
  ) %>%
  inner_join(att75a, by = "rusher_player_name") # or left_join (up to you)
# A tibble: 6 x 7
rusher_player_name Team `Salary Cap Value` `Cash Spent` mean_epa success_rate plays
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <int>
1 A.Jones Packers 695487 645000 0.0334 0.405 242
2 A.Peterson Redskins 1780000 2530000 -0.123 0.323 220
3 A.Mattison Vikings 700545 1317180 -0.155 0.283 106
4 A.Kamara Saints 1050693 807500 -0.119 0.401 177
5 A.Ekeler Chargers 646668 645000 -0.110 0.358 137
6 B.Hill Falcons 645000 645000 -0.0690 0.35 80