按列名合并两个数据框(merge() 不起作用)

Merge two data frames by column name (merge() doesn't work)

我有两个数据框(以下是各自的前几行,即 head() 的输出):

1:

# A tibble: 6 x 2
  twitterID           Username       
  <chr>               <chr>          
1 849567328899616768  AchimKessler   
2 1117749912          Achim_P        
3 186552155           NA             
4 172269309           agnieszka_mdb  
5 1127961248493129728 StegemannAlbert
6 1178640571725955073 BDobrindt 

2:

# A tibble: 6 x 3
  Username     TwitterID Name                
  <chr>            <dbl> <chr>               
1 achimkessler        NA Achim Kessler       
2 achim_p             NA Achim Post          
3 achim_p             NA Achim Post          
4 achim_p             NA Achim Post  (Minden)
5 NA                  NA Adis Ahmetovic      
6 NA                  NA Agnes Alpers  

我希望按用户名(Username)将它们连接(join)起来,从而用数据框 1 中的 twitterID 填充数据框 2 中的 TwitterID 列。

# NOTE(review): `2` and `1` presumably stand in for the second and first data
# frames shown above; passed literally they are numbers, not data frames.
df <- merge(x = 2, y = 1, by = "Username", all.x = TRUE)

...应该可以完成这项工作,但事实并非如此。

在输出的 df 中,有许多行的 twitterID 为 NA,而这些行的用户名实际上是能够相互匹配的。如果有帮助,我可以上传更大的数据集供您测试。非常感谢任何帮助。

您可以使用 tidyverse。首先把用户名统一为相同的大小写(例如全部小写),然后与第二个数据框连接(join),最后使用 coalesce 在有可用 ID 时用它替换 NA。

library(tidyverse)

# Normalise df1 so it lines up with df2 (lower-case usernames, same ID column
# name), attach df2's columns, then keep whichever TwitterID is non-missing.
df1 %>%
  dplyr::rename(TwitterID = twitterID) %>%
  dplyr::mutate(Username = tolower(Username)) %>%
  dplyr::left_join(df2, by = "Username") %>%
  # The join suffixes the clashing ID columns with .x (df1) and .y (df2).
  dplyr::mutate(TwitterID = dplyr::coalesce(TwitterID.x, TwitterID.y)) %>%
  dplyr::select(-c(TwitterID.x, TwitterID.y)) %>%
  # Drop the rows that the repeated usernames in df2 turn into exact duplicates.
  dplyr::distinct()

输出

         Username                Name           TwitterID
1    achimkessler       Achim Kessler  849567328899616768
2         achim_p          Achim Post          1117749912
3         achim_p Achim Post (Minden)          1117749912
4            <NA>      Adis Ahmetovic           186552155
5            <NA>        Agnes Alpers           186552155
6   agnieszka_mdb                <NA>           172269309
7 stegemannalbert                <NA> 1127961248493129728
8       bdobrindt                <NA> 1178640571725955073

或者您仍然可以使用 merge,只需确保两个数据框的格式一致(即两个 Username 列都为小写)。

# Give df1 the same key format and ID column name as df2 before merging.
df1 <- df1 %>%
  dplyr::mutate(Username = tolower(Username)) %>%
  dplyr::rename(TwitterID = twitterID)

# Keep every row of df2; both inputs now carry a TwitterID column, so merge()
# returns them as TwitterID.x (from df2) and TwitterID.y (from df1).
df <- merge(df2, df1, by = "Username", all.x = TRUE)

输出

      Username TwitterID.x                Name        TwitterID.y
1      achim_p          NA          Achim Post         1117749912
2      achim_p          NA          Achim Post         1117749912
3      achim_p          NA Achim Post (Minden)         1117749912
4 achimkessler          NA       Achim Kessler 849567328899616768
5         <NA>          NA      Adis Ahmetovic          186552155
6         <NA>          NA        Agnes Alpers          186552155

数据

# Sample of the first data frame: Twitter IDs (stored as strings) alongside
# mixed-case usernames, one of which is missing.
df1 <- data.frame(
  twitterID = c(
    "849567328899616768", "1117749912", "186552155",
    "172269309", "1127961248493129728", "1178640571725955073"
  ),
  Username = c(
    "AchimKessler", "Achim_P", NA,
    "agnieszka_mdb", "StegemannAlbert", "BDobrindt"
  ),
  stringsAsFactors = FALSE
)

# Sample of the second data frame: lower-case usernames (with repeats and
# missing values), an all-NA TwitterID column to be filled in, and full names.
df2 <- data.frame(
  Username = c("achimkessler", "achim_p", "achim_p", "achim_p", NA, NA),
  TwitterID = rep(NA, 6L),
  Name = c(
    "Achim Kessler", "Achim Post", "Achim Post",
    "Achim Post (Minden)", "Adis Ahmetovic", "Agnes Alpers"
  ),
  stringsAsFactors = FALSE
)

如果问题出在 df1 中用户名的大写字母上,可以在 merge 中使用 tolower 将其转换为小写:

# Lower-case df1's usernames, then do a full outer merge with df2 on the
# columns the two data frames share.
df1 |>
  transform(Username = tolower(Username)) |>
  merge(df2, all = TRUE)
#          Username           twitterID TwitterID                 Name
# 1         achim_p          1117749912        NA           Achim Post
# 2         achim_p          1117749912        NA           Achim Post
# 3         achim_p          1117749912        NA Achim Post  (Minden)
# 4    achimkessler  849567328899616768        NA        Achim Kessler
# 5   agnieszka_mdb           172269309        NA                 <NA>
# 6       bdobrindt 1178640571725955073        NA                 <NA>
# 7 stegemannalbert 1127961248493129728        NA                 <NA>
# 8            <NA>           186552155        NA       Adis Ahmetovic
# 9            <NA>           186552155        NA         Agnes Alpers

或者:

# Attach df1's twitterID to df2 (without its all-NA TwitterID column), keeping
# every df2 row, then collapse name variants such as "Achim Post  (Minden)"
# so that unique() can remove the repeated rows.
merge(transform(df1, Username = tolower(Username)), df2[-2], all.y = TRUE) |>
  # Backslashes must be doubled inside an R string literal: "\\s+\\(.*" is the
  # regex \s+\(.* (whitespace followed by a parenthesised suffix). The
  # single-backslash form is rejected by the parser as an unrecognized escape.
  (\(x) {
    x[, "Name"] <- gsub("\\s+\\(.*", "", x[, "Name"])
    x
  })() |>
  unique()
#       Username          twitterID           Name
# 1      achim_p         1117749912     Achim Post
# 4 achimkessler 849567328899616768  Achim Kessler
# 5         <NA>          186552155 Adis Ahmetovic
# 6         <NA>          186552155   Agnes Alpers

否则,如果我是你,我会看一下 ?agrep 进行近似字符串匹配。

> R.version.string
[1] "R version 4.1.2 (2021-11-01)"

数据:

# Sample of the first data frame: Twitter IDs (stored as strings) alongside
# mixed-case usernames; row names are the character labels "1" to "6".
df1 <- structure(
  list(
    twitterID = c(
      "849567328899616768", "1117749912", "186552155",
      "172269309", "1127961248493129728", "1178640571725955073"
    ),
    Username = c(
      "AchimKessler", "Achim_P", NA,
      "agnieszka_mdb", "StegemannAlbert", "BDobrindt"
    )
  ),
  class = "data.frame",
  row.names = as.character(1:6)
)

# Sample of the second data frame: lower-case usernames (with repeats and
# missing values), an all-NA TwitterID column, and full names; row names are
# the character labels "1" to "6".
df2 <- structure(
  list(
    Username = c("achimkessler", "achim_p", "achim_p", "achim_p", NA, NA),
    TwitterID = c(NA, NA, NA, NA, NA, NA),
    Name = c(
      "Achim Kessler", "Achim Post", "Achim Post",
      "Achim Post  (Minden)", "Adis Ahmetovic", "Agnes Alpers"
    )
  ),
  class = "data.frame",
  row.names = as.character(1:6)
)