按列名合并两个数据框(merge() 不起作用)
Merge two data frames by column name (merge() doesn't work)
我有两个数据框(这是头部):
1:
# A tibble: 6 x 2
twitterID Username
<chr> <chr>
1 849567328899616768 AchimKessler
2 1117749912 Achim_P
3 186552155 NA
4 172269309 agnieszka_mdb
5 1127961248493129728 StegemannAlbert
6 1178640571725955073 BDobrindt
2:
# A tibble: 6 x 3
Username TwitterID Name
<chr> <dbl> <chr>
1 achimkessler NA Achim Kessler
2 achim_p NA Achim Post
3 achim_p NA Achim Post
4 achim_p NA Achim Post (Minden)
5 NA NA Adis Ahmetovic
6 NA NA Agnes Alpers
我希望按用户名(Username)把它们连接起来,以便用数据框 1 中的 twitterID 填充数据框 2 中的 TwitterID 列。
# Quoted verbatim from the question. `2` and `1` presumably stand for the
# second and first data frames shown above -- TODO confirm with the asker.
# The Username values in the two tables differ in letter case
# (e.g. "AchimKessler" vs "achimkessler"), so an exact-match merge on
# Username leaves those rows unmatched.
df <- merge(x = 2, y = 1, by = "Username", all.x = TRUE)
...应该可以完成这项工作,但事实并非如此。
在输出 df 中,有许多行的 twitterID 为 NA,而这些行的用户名实际上在两个数据框中是可以匹配上的。如果有帮助,我可以上传更大的数据集供您测试。
非常感谢任何帮助。
您可以使用 tidyverse。首先把用户名统一成相同的大小写,然后连接第二个数据框,再使用 coalesce 将 NA 替换为 ID(如果可用)。
library(tidyverse)
# Fill TwitterID for every Username by joining df1 onto df2.
#  1. Align df1 with df2: same ID column name, lower-case usernames.
#  2. Left-join on Username; the clashing ID columns become
#     TwitterID.x (from df1) and TwitterID.y (from df2).
#  3. Keep the first non-missing ID, drop the helper columns and remove
#     duplicated rows.
# NOTE(review): left_join() matches NA keys to each other by default, which is
# why the <NA> usernames pick up an ID in the printed output.
df1 %>%
  dplyr::rename(TwitterID = twitterID) %>%
  dplyr::mutate(Username = tolower(Username)) %>%
  dplyr::left_join(df2, by = "Username") %>%
  dplyr::mutate(TwitterID = dplyr::coalesce(TwitterID.x, TwitterID.y)) %>%
  dplyr::select(-c(TwitterID.x, TwitterID.y)) %>%
  dplyr::distinct()
输出
Username Name TwitterID
1 achimkessler Achim Kessler 849567328899616768
2 achim_p Achim Post 1117749912
3 achim_p Achim Post (Minden) 1117749912
4 <NA> Adis Ahmetovic 186552155
5 <NA> Agnes Alpers 186552155
6 agnieszka_mdb <NA> 172269309
7 stegemannalbert <NA> 1127961248493129728
8 bdobrindt <NA> 1178640571725955073
或者您仍然可以使用 merge,只需要确保两个数据框的格式一致(即两个用户名列均为小写)。
# Normalise df1 so it lines up with df2: lower-case usernames and the same
# ID column name. Because both inputs then carry a TwitterID column, merge()
# suffixes them as TwitterID.x (df2) and TwitterID.y (df1).
df1 <- df1 %>%
  dplyr::mutate(Username = tolower(Username)) %>%
  dplyr::rename(TwitterID = twitterID)

# Keep every row of df2 (all.x = TRUE) and attach the matching df1 columns.
# NOTE(review): NA usernames are matched to each other here as well, as the
# <NA> rows of the printed output show.
df <- merge(df2, df1, by = "Username", all.x = TRUE)
输出
Username TwitterID.x Name TwitterID.y
1 achim_p NA Achim Post 1117749912
2 achim_p NA Achim Post 1117749912
3 achim_p NA Achim Post (Minden) 1117749912
4 achimkessler NA Achim Kessler 849567328899616768
5 <NA> NA Adis Ahmetovic 186552155
6 <NA> NA Agnes Alpers 186552155
数据
# Sample data: df1 maps Twitter IDs (kept as strings) to mixed-case usernames.
df1 <- data.frame(
  twitterID = c(
    "849567328899616768", "1117749912", "186552155",
    "172269309", "1127961248493129728", "1178640571725955073"
  ),
  Username = c(
    "AchimKessler", "Achim_P", NA,
    "agnieszka_mdb", "StegemannAlbert", "BDobrindt"
  ),
  stringsAsFactors = FALSE
)

# Sample data: df2 holds lower-case usernames, an all-NA TwitterID column
# still to be filled, and the display name.
df2 <- data.frame(
  Username = c("achimkessler", "achim_p", "achim_p", "achim_p", NA, NA),
  TwitterID = rep(NA, 6L),
  Name = c(
    "Achim Kessler", "Achim Post", "Achim Post",
    "Achim Post (Minden)", "Adis Ahmetovic", "Agnes Alpers"
  ),
  stringsAsFactors = FALSE
)
如果问题出在 df1 中的大写字母上,请在 merge 中使用 tolower。
# Lower-case df1's usernames on the fly, then full outer merge with df2 on the
# only shared column (Username); all = TRUE keeps unmatched rows of both sides.
df1 |>
  transform(Username = tolower(Username)) |>
  merge(df2, all = TRUE)
# Username twitterID TwitterID Name
# 1 achim_p 1117749912 NA Achim Post
# 2 achim_p 1117749912 NA Achim Post
# 3 achim_p 1117749912 NA Achim Post (Minden)
# 4 achimkessler 849567328899616768 NA Achim Kessler
# 5 agnieszka_mdb 172269309 NA <NA>
# 6 bdobrindt 1178640571725955073 NA <NA>
# 7 stegemannalbert 1127961248493129728 NA <NA>
# 8 <NA> 186552155 NA Adis Ahmetovic
# 9 <NA> 186552155 NA Agnes Alpers
或者:
# Merge on lower-cased usernames, keeping every row of df2 (minus its empty
# TwitterID column, dropped with df2[-2]), then strip a trailing parenthesised
# suffix such as " (Minden)" from Name so that otherwise identical rows
# collapse under unique().
merge(transform(df1, Username = tolower(Username)), df2[-2], all.y = TRUE) |>
  (\(x) {
    # Backslashes must be doubled inside an R string literal: "\\s" and "\\("
    # reach the regex engine as \s and \( respectively.
    x[, "Name"] <- gsub("\\s+\\(.*", "", x[, "Name"])
    x
  })() |>
  unique()
# Username twitterID Name
# 1 achim_p 1117749912 Achim Post
# 4 achimkessler 849567328899616768 Achim Kessler
# 5 <NA> 186552155 Adis Ahmetovic
# 6 <NA> 186552155 Agnes Alpers
否则,如果我是你,我会看一下 ?agrep,用它来进行近似字符串匹配。
> R.version.string
[1] "R version 4.1.2 (2021-11-01)"
数据:
# Sample data with explicit character row names "1".."6".
# df1: Twitter IDs (as strings) and mixed-case usernames.
df1 <- structure(
  list(
    twitterID = c(
      "849567328899616768", "1117749912", "186552155",
      "172269309", "1127961248493129728", "1178640571725955073"
    ),
    Username = c(
      "AchimKessler", "Achim_P", NA,
      "agnieszka_mdb", "StegemannAlbert", "BDobrindt"
    )
  ),
  class = "data.frame",
  row.names = as.character(seq_len(6L))
)

# df2: lower-case usernames, an all-NA TwitterID column and the display name.
df2 <- structure(
  list(
    Username = c("achimkessler", rep("achim_p", 3L), NA, NA),
    TwitterID = rep(NA, 6L),
    Name = c(
      "Achim Kessler", "Achim Post", "Achim Post",
      "Achim Post (Minden)", "Adis Ahmetovic", "Agnes Alpers"
    )
  ),
  class = "data.frame",
  row.names = as.character(seq_len(6L))
)
我有两个数据框(这是头部):
1:
# A tibble: 6 x 2
twitterID Username
<chr> <chr>
1 849567328899616768 AchimKessler
2 1117749912 Achim_P
3 186552155 NA
4 172269309 agnieszka_mdb
5 1127961248493129728 StegemannAlbert
6 1178640571725955073 BDobrindt
2:
# A tibble: 6 x 3
Username TwitterID Name
<chr> <dbl> <chr>
1 achimkessler NA Achim Kessler
2 achim_p NA Achim Post
3 achim_p NA Achim Post
4 achim_p NA Achim Post (Minden)
5 NA NA Adis Ahmetovic
6 NA NA Agnes Alpers
我希望按用户名(Username)把它们连接起来,以便用数据框 1 中的 twitterID 填充数据框 2 中的 TwitterID 列。
# Quoted verbatim from the question. `2` and `1` presumably stand for the
# second and first data frames shown above -- TODO confirm with the asker.
# The Username values in the two tables differ in letter case
# (e.g. "AchimKessler" vs "achimkessler"), so an exact-match merge on
# Username leaves those rows unmatched.
df <- merge(x = 2, y = 1, by = "Username", all.x = TRUE)
...应该可以完成这项工作,但事实并非如此。
在输出 df 中,有许多行的 twitterID 为 NA,而这些行的用户名实际上在两个数据框中是可以匹配上的。如果有帮助,我可以上传更大的数据集供您测试。非常感谢任何帮助。
您可以使用 tidyverse。首先把用户名统一成相同的大小写,然后连接第二个数据框,再使用 coalesce 将 NA 替换为 ID(如果可用)。
library(tidyverse)
# Fill TwitterID for every Username by joining df1 onto df2.
#  1. Align df1 with df2: same ID column name, lower-case usernames.
#  2. Left-join on Username; the clashing ID columns become
#     TwitterID.x (from df1) and TwitterID.y (from df2).
#  3. Keep the first non-missing ID, drop the helper columns and remove
#     duplicated rows.
# NOTE(review): left_join() matches NA keys to each other by default, which is
# why the <NA> usernames pick up an ID in the printed output.
df1 %>%
  dplyr::rename(TwitterID = twitterID) %>%
  dplyr::mutate(Username = tolower(Username)) %>%
  dplyr::left_join(df2, by = "Username") %>%
  dplyr::mutate(TwitterID = dplyr::coalesce(TwitterID.x, TwitterID.y)) %>%
  dplyr::select(-c(TwitterID.x, TwitterID.y)) %>%
  dplyr::distinct()
输出
Username Name TwitterID
1 achimkessler Achim Kessler 849567328899616768
2 achim_p Achim Post 1117749912
3 achim_p Achim Post (Minden) 1117749912
4 <NA> Adis Ahmetovic 186552155
5 <NA> Agnes Alpers 186552155
6 agnieszka_mdb <NA> 172269309
7 stegemannalbert <NA> 1127961248493129728
8 bdobrindt <NA> 1178640571725955073
或者您仍然可以使用 merge,只需要确保两个数据框的格式一致(即两个用户名列均为小写)。
# Normalise df1 so it lines up with df2: lower-case usernames and the same
# ID column name. Because both inputs then carry a TwitterID column, merge()
# suffixes them as TwitterID.x (df2) and TwitterID.y (df1).
df1 <- df1 %>%
  dplyr::mutate(Username = tolower(Username)) %>%
  dplyr::rename(TwitterID = twitterID)

# Keep every row of df2 (all.x = TRUE) and attach the matching df1 columns.
# NOTE(review): NA usernames are matched to each other here as well, as the
# <NA> rows of the printed output show.
df <- merge(df2, df1, by = "Username", all.x = TRUE)
输出
Username TwitterID.x Name TwitterID.y
1 achim_p NA Achim Post 1117749912
2 achim_p NA Achim Post 1117749912
3 achim_p NA Achim Post (Minden) 1117749912
4 achimkessler NA Achim Kessler 849567328899616768
5 <NA> NA Adis Ahmetovic 186552155
6 <NA> NA Agnes Alpers 186552155
数据
# Sample data: df1 maps Twitter IDs (kept as strings) to mixed-case usernames.
df1 <- data.frame(
  twitterID = c(
    "849567328899616768", "1117749912", "186552155",
    "172269309", "1127961248493129728", "1178640571725955073"
  ),
  Username = c(
    "AchimKessler", "Achim_P", NA,
    "agnieszka_mdb", "StegemannAlbert", "BDobrindt"
  ),
  stringsAsFactors = FALSE
)

# Sample data: df2 holds lower-case usernames, an all-NA TwitterID column
# still to be filled, and the display name.
df2 <- data.frame(
  Username = c("achimkessler", "achim_p", "achim_p", "achim_p", NA, NA),
  TwitterID = rep(NA, 6L),
  Name = c(
    "Achim Kessler", "Achim Post", "Achim Post",
    "Achim Post (Minden)", "Adis Ahmetovic", "Agnes Alpers"
  ),
  stringsAsFactors = FALSE
)
如果问题出在 df1 中的大写字母上,请在 merge 中使用 tolower。
# Lower-case df1's usernames on the fly, then full outer merge with df2 on the
# only shared column (Username); all = TRUE keeps unmatched rows of both sides.
df1 |>
  transform(Username = tolower(Username)) |>
  merge(df2, all = TRUE)
# Username twitterID TwitterID Name
# 1 achim_p 1117749912 NA Achim Post
# 2 achim_p 1117749912 NA Achim Post
# 3 achim_p 1117749912 NA Achim Post (Minden)
# 4 achimkessler 849567328899616768 NA Achim Kessler
# 5 agnieszka_mdb 172269309 NA <NA>
# 6 bdobrindt 1178640571725955073 NA <NA>
# 7 stegemannalbert 1127961248493129728 NA <NA>
# 8 <NA> 186552155 NA Adis Ahmetovic
# 9 <NA> 186552155 NA Agnes Alpers
或者:
# Merge on lower-cased usernames, keeping every row of df2 (minus its empty
# TwitterID column, dropped with df2[-2]), then strip a trailing parenthesised
# suffix such as " (Minden)" from Name so that otherwise identical rows
# collapse under unique().
merge(transform(df1, Username = tolower(Username)), df2[-2], all.y = TRUE) |>
  (\(x) {
    # Backslashes must be doubled inside an R string literal: "\\s" and "\\("
    # reach the regex engine as \s and \( respectively.
    x[, "Name"] <- gsub("\\s+\\(.*", "", x[, "Name"])
    x
  })() |>
  unique()
# Username twitterID Name
# 1 achim_p 1117749912 Achim Post
# 4 achimkessler 849567328899616768 Achim Kessler
# 5 <NA> 186552155 Adis Ahmetovic
# 6 <NA> 186552155 Agnes Alpers
否则,如果我是你,我会看一下 ?agrep,用它来进行近似字符串匹配。
> R.version.string
[1] "R version 4.1.2 (2021-11-01)"
数据:
# Sample data with explicit character row names "1".."6".
# df1: Twitter IDs (as strings) and mixed-case usernames.
df1 <- structure(
  list(
    twitterID = c(
      "849567328899616768", "1117749912", "186552155",
      "172269309", "1127961248493129728", "1178640571725955073"
    ),
    Username = c(
      "AchimKessler", "Achim_P", NA,
      "agnieszka_mdb", "StegemannAlbert", "BDobrindt"
    )
  ),
  class = "data.frame",
  row.names = as.character(seq_len(6L))
)

# df2: lower-case usernames, an all-NA TwitterID column and the display name.
df2 <- structure(
  list(
    Username = c("achimkessler", rep("achim_p", 3L), NA, NA),
    TwitterID = rep(NA, 6L),
    Name = c(
      "Achim Kessler", "Achim Post", "Achim Post",
      "Achim Post (Minden)", "Adis Ahmetovic", "Agnes Alpers"
    )
  ),
  class = "data.frame",
  row.names = as.character(seq_len(6L))
)