使用 R 中的另一个数据框列完成列名
Complete column names with another dataframe column in R
我有这个 table:
library(rvest)
library(tidyverse)
# Download and parse the page at this URL.
tables_team_pl <- read_html(
  "https://www.win-or-lose.com/football-team-colours/"
)

# Extract the HTML tables from the page, keep the first one, and drop its
# `Away` column.
color_table <- tables_team_pl %>%
  html_table() %>%
  pluck(1) %>%
  select(-Away)
还有这个:
# One-column tibble of team names; several entries are abbreviated
# (e.g. "Manchester Utd", "QPR", "Wolves").
table_1 <- structure(
  list(
    Team = c(
      "Arsenal", "Aston Villa", "Blackburn", "Bolton", "Chelsea",
      "Everton", "Fulham", "Liverpool", "Manchester City", "Manchester Utd",
      "Newcastle Utd", "Norwich City", "QPR", "Stoke City", "Sunderland",
      "Swansea City", "Tottenham", "West Brom", "Wigan Athletic", "Wolves"
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
如您所见,第二个 table 中的名称不完整。例如,Manchester Utd 应该是 Manchester United,就像第一个 table 中那样。
所以,我只需要用第一个 table 中对应的名称来补全第二个 table。
也就是说,需要更正 table_1:Manchester Utd 应该改成 Manchester United,Blackburn 应该改成 Blackburn Rovers,等等。完整的名称应该来自第一个 table。
还有第二个 table 我有 QPR,应该是“Queens Park Rangers”。
有什么帮助吗?
我们可以使用 fuzzyjoin 包中基于 stringdist 的连接(stringdist_left_join):
library(fuzzyjoin)
library(dplyr)
# Fuzzy left join of table_1 onto color_table by `Team` using the "soundex"
# method. Prefer the name from color_table (Team.y) and fall back to the
# original name (Team.x) when there is no match, then drop duplicate rows.
table_1 %>%
  stringdist_left_join(color_table, by = "Team", method = "soundex") %>%
  transmute(Team = coalesce(Team.y, Team.x)) %>%
  distinct()
这是使用 agrep 的 base R 解决方案。它的优点是可以设置一次匹配所允许的 insertions、deletions 和 substitutions 的最大数量。
# Keep an untouched copy so the corrected names can be compared with the
# originals afterwards.
table_1_original <- table_1

# Replace each (possibly abbreviated) team name with its full spelling from
# `tables_team_pl`, which must be a character vector of full team names at
# this point. agrep() performs approximate matching; the list below forbids
# insertions and deletions and allows up to 3 substitutions.
# NOTE(review): agrep()'s default overall `all` bound may cap the number of
# substitutions further -- see ?agrep.
table_1$Team <- vapply(
  table_1$Team,
  function(short_name) {
    hits <- agrep(
      short_name, tables_team_pl,
      max.distance = list(insertions = 0, deletions = 0, substitutions = 3)
    )
    # No candidate: keep the name as it is. Several candidates: take the
    # first one so that every row still holds exactly one name.
    if (length(hits) > 0L) tables_team_pl[hits[1L]] else short_name
  },
  character(1),
  USE.NAMES = FALSE
)
结果包括与原始结果的比较:
# Show the saved original names next to the updated table_1, column by column.
cbind(table_1_original, table_1)
Team Team
1 Arsenal Arsenal
2 Aston Villa Aston Villa
3 Blackburn Blackburn Rovers
4 Bolton Bolton
5 Chelsea Chelsea
6 Everton Everton
7 Fulham Fulham
8 Liverpool Liverpool
9 Manchester City Manchester City
10 Manchester Utd Manchester United
11 Newcastle Utd Newcastle United
12 Norwich City Norwich City
13 Queens Queens Park Rangers
14 Stoke City Stoke City
15 Sunderland Sunderland
16 Swansea City Swansea City
17 Tottenham Tottenham Hotspur
18 West Brom West Bromwich Albion
19 Wigan Athletic Wigan Athletic
20 Wolves Wolverhampton Wanderers
以下是从 HTML 中提取并去掉颜色信息后的球队名称数据:
# Character vector of full team names, in the original (roughly alphabetical)
# order; used as the lookup table for the approximate matching.
tables_team_pl <- c(
  "Aberdeen", "AFC Bournemouth", "AFC Wimbledon", "Arsenal", "Aston Villa",
  "Birmingham City", "Blackburn Rovers", "Bradford City", "Brentford",
  "Brighton & Hove Albion", "Bristol City", "Burnley",
  "Cardiff City", "Celtic", "Chelsea", "Crystal Palace",
  "Derby County", "Dundee", "Dundee United",
  "Everton", "Fulham",
  "Hamilton Academical", "Heart of Midlothian", "Hibernian",
  "Huddersfield Town", "Hull City",
  "Inverness Caledonian Thistle", "Kilmarnock",
  "Leeds United", "Leicester City", "Liverpool", "Livingston",
  "Manchester City", "Manchester United", "Middlesbrough", "Millwall",
  "Motherwell",
  "Newcastle United", "Norwich City", "Nottingham Forest",
  "Partick Thistle", "Portsmouth", "Preston North End",
  "Queens Park Rangers",
  "Rangers", "Reading", "Ross County", "Rotherham",
  "Sheffield United", "Sheffield Wednesday", "Southampton", "St Johnstone",
  "St Mirren", "Stoke City", "Sunderland", "Swansea",
  "Tottenham Hotspur",
  "Watford", "West Bromwich Albion", "West Ham United",
  "Wolverhampton Wanderers", "Wycombe Wanderers"
)
我有这个 table:
library(rvest)
library(tidyverse)
# Download and parse the page at this URL.
tables_team_pl <- read_html(
  "https://www.win-or-lose.com/football-team-colours/"
)

# Extract the HTML tables from the page, keep the first one, and drop its
# `Away` column.
color_table <- tables_team_pl %>%
  html_table() %>%
  pluck(1) %>%
  select(-Away)
还有这个:
# One-column tibble of team names; several entries are abbreviated
# (e.g. "Manchester Utd", "QPR", "Wolves").
table_1 <- structure(
  list(
    Team = c(
      "Arsenal", "Aston Villa", "Blackburn", "Bolton", "Chelsea",
      "Everton", "Fulham", "Liverpool", "Manchester City", "Manchester Utd",
      "Newcastle Utd", "Norwich City", "QPR", "Stoke City", "Sunderland",
      "Swansea City", "Tottenham", "West Brom", "Wigan Athletic", "Wolves"
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)
如您所见,第二个 table 中的名称不完整。例如,Manchester Utd 应该是 Manchester United,就像第一个 table 中那样。
所以,我只需要用第一个 table 中对应的名称来补全第二个 table。
也就是说,需要更正 table_1:Manchester Utd 应该改成 Manchester United,Blackburn 应该改成 Blackburn Rovers,等等。完整的名称应该来自第一个 table。
还有第二个 table 我有 QPR,应该是“Queens Park Rangers”。
有什么帮助吗?
我们可以使用 fuzzyjoin 包中基于 stringdist 的连接(stringdist_left_join):
library(fuzzyjoin)
library(dplyr)
# Fuzzy left join of table_1 onto color_table by `Team` using the "soundex"
# method. Prefer the name from color_table (Team.y) and fall back to the
# original name (Team.x) when there is no match, then drop duplicate rows.
table_1 %>%
  stringdist_left_join(color_table, by = "Team", method = "soundex") %>%
  transmute(Team = coalesce(Team.y, Team.x)) %>%
  distinct()
这是使用 agrep 的 base R 解决方案。它的优点是可以设置一次匹配所允许的 insertions、deletions 和 substitutions 的最大数量。
# Keep an untouched copy so the corrected names can be compared with the
# originals afterwards.
table_1_original <- table_1

# Replace each (possibly abbreviated) team name with its full spelling from
# `tables_team_pl`, which must be a character vector of full team names at
# this point. agrep() performs approximate matching; the list below forbids
# insertions and deletions and allows up to 3 substitutions.
# NOTE(review): agrep()'s default overall `all` bound may cap the number of
# substitutions further -- see ?agrep.
table_1$Team <- vapply(
  table_1$Team,
  function(short_name) {
    hits <- agrep(
      short_name, tables_team_pl,
      max.distance = list(insertions = 0, deletions = 0, substitutions = 3)
    )
    # No candidate: keep the name as it is. Several candidates: take the
    # first one so that every row still holds exactly one name.
    if (length(hits) > 0L) tables_team_pl[hits[1L]] else short_name
  },
  character(1),
  USE.NAMES = FALSE
)
结果包括与原始结果的比较:
# Show the saved original names next to the updated table_1, column by column.
cbind(table_1_original, table_1)
Team Team
1 Arsenal Arsenal
2 Aston Villa Aston Villa
3 Blackburn Blackburn Rovers
4 Bolton Bolton
5 Chelsea Chelsea
6 Everton Everton
7 Fulham Fulham
8 Liverpool Liverpool
9 Manchester City Manchester City
10 Manchester Utd Manchester United
11 Newcastle Utd Newcastle United
12 Norwich City Norwich City
13 Queens Queens Park Rangers
14 Stoke City Stoke City
15 Sunderland Sunderland
16 Swansea City Swansea City
17 Tottenham Tottenham Hotspur
18 West Brom West Bromwich Albion
19 Wigan Athletic Wigan Athletic
20 Wolves Wolverhampton Wanderers
以下是从 HTML 中提取并去掉颜色信息后的球队名称数据:
# Character vector of full team names, in the original (roughly alphabetical)
# order; used as the lookup table for the approximate matching.
tables_team_pl <- c(
  "Aberdeen", "AFC Bournemouth", "AFC Wimbledon", "Arsenal", "Aston Villa",
  "Birmingham City", "Blackburn Rovers", "Bradford City", "Brentford",
  "Brighton & Hove Albion", "Bristol City", "Burnley",
  "Cardiff City", "Celtic", "Chelsea", "Crystal Palace",
  "Derby County", "Dundee", "Dundee United",
  "Everton", "Fulham",
  "Hamilton Academical", "Heart of Midlothian", "Hibernian",
  "Huddersfield Town", "Hull City",
  "Inverness Caledonian Thistle", "Kilmarnock",
  "Leeds United", "Leicester City", "Liverpool", "Livingston",
  "Manchester City", "Manchester United", "Middlesbrough", "Millwall",
  "Motherwell",
  "Newcastle United", "Norwich City", "Nottingham Forest",
  "Partick Thistle", "Portsmouth", "Preston North End",
  "Queens Park Rangers",
  "Rangers", "Reading", "Ross County", "Rotherham",
  "Sheffield United", "Sheffield Wednesday", "Southampton", "St Johnstone",
  "St Mirren", "Stoke City", "Sunderland", "Swansea",
  "Tottenham Hotspur",
  "Watford", "West Bromwich Albion", "West Ham United",
  "Wolverhampton Wanderers", "Wycombe Wanderers"
)