写入 code/function 以最高相似度匹配列名

Writing code/function that matches column names by highest similarity

我有五个数据集,随着时间的推移涵盖相同的主题。

library(data.table)
DT <- data.table(A= round(rnorm(10,10,10),2),
                 B= round(rnorm(10,10,10),2),
                 C= round(rnorm(10,10,10),2))
DT_2 <- data.table(A= round(rnorm(10,10,10),2),
                 B= round(rnorm(10,10,10),2),
                 C= round(rnorm(10,10,10),2),
                 D= round(rnorm(10,10,10),2))
DT_3 <- DT
DT_4 <- DT_2
DT_5 <- DT_2
names(DT)   <- c("something","nothing", "anything")
names(DT_2) <- c("some thing","no thing", "any thing", "number4")
names(DT_3) <- c("some thing wrong","nothing", "anything_")
names(DT_4) <- c("something","nothingg", "anything", "number_4")
names(DT_5) <- c("something","nothing", "anything happening", "number4")

然而,每一年,它们都有点不同。列的名称略有更改,添加了一些列,删除了一些列。我想“rowbind”这些数据集。然而,每个数据集大约有 100 列,手动使所有列名一致将是地狱。

编辑:请注意,列不一定具有相同的索引,例如下面编辑的列名中的情况,其中 DT_2 具有列 XXX

# EDIT
names(DT)<- c("something","nothing", "anything", "number4")
names(DT_2)<- c("some thing","no thing","XXX", "number4")
names(DT_3)<- c("some thing wrong","nothing", "anything_")
names(DT_4)<- c("something","nothingg", "anything", "number_4")
names(DT_5)<- c("something","nothing", "anything happening", "number4")

我认为编写一个函数来为我做这件事可能是个更好的主意。

我曾经就一个功能类似 的功能寻求帮助。以下函数在不指定变量名称的情况下合并具有变量名称的大写和非大写版本的列。

非常巧妙,它还指定合并了哪些 var 名称。

library(data.table)
library(magrittr) # piping is used to improve readability
names(DT_panel) %>% 
  data.table(orig = ., lc = tolower(.)) %>% 
  .[, {
    if (.N > 1L) {
      new <- toupper(.BY)
      old <- setdiff(orig, new)
      DT_panel[, (new) := fcoalesce(.SD), .SDcols = orig]
      DT_panel[, (old) := NULL]
      sprintf("Coalesced %s onto %s", toString(old), new)
    }
  }, by = lc]

此外,我发现了这个问题here,它根据列条目进行模糊连接。

library(fuzzyjoin); library(dplyr);

stringdist_join(a, b, 
                by = "name",
                mode = "left",
                ignore_case = FALSE, 
                method = "jw", 
                max_dist = 99, 
                distance_col = "dist") %>%
  group_by(name.x) %>%
  top_n(1, -dist)

问题是我对这两种解决方案的理解都不够好,无法将它们组合成一个提供我想要的解决方案的函数。

有人可以帮我开始吗?我想要的输出如下:

DT <- data.table(A= round(rnorm(10,10,10),2),
                 B= round(rnorm(10,10,10),2),
                 C= round(rnorm(10,10,10),2))
DT_2 <- data.table(A= round(rnorm(10,10,10),2),
                 B= round(rnorm(10,10,10),2),
                 C= round(rnorm(10,10,10),2),
                 D= round(rnorm(10,10,10),2))
D <- c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
DT_3 <- DT
DT_4 <- DT_2
DT_5 <- DT_2
DT <- cbind(DT, D)
DT_3 <- cbind(DT_3, D)
DT <- rbind (DT, DT_2, DT_3, DT_4, DT_5)
names(DT) <- c("something","nothing", "anything", "number4")

此方法基于fuzzyjoin::stringdist_join。它处理新的和删除的列。

从一些虚拟数据开始。

library(tidyverse)

df1 <- tibble("something" = 1,"nothing" = 2, "anything" = 3, "number4" = 4)
df2 <- tibble("some thing" = 1,"no thing" = 2,"XXX" = 99, "number4" = 4)
df3 <- tibble("some thing wrong" = 1,"nothing" = 2, "anything_" = 4)
df4 <- tibble("something" = 1,"nothingg" = 2, "anything" = 2, "number_4" = 4, "YYY" = 100)
df5 <- tibble("something" = 1,"nothing" = 2, "anything happening" = 2, "number4" = 4)

fuzzy_rowbind模糊合并两个数据框。它使用 fuzzyjoin::stringdist_join 来识别哪些列最相似。第二个数据框的列已重命名并合并。

fuzzy_rowbind <- function(a, b, method = "cosine", max_dist = 0.9999) {
  a_name_df <- tibble(name = names(a))
  b_name_df <- tibble(name = names(b))
  
  fj <- 
    fuzzyjoin::stringdist_join(
      a_name_df,
      b_name_df, 
      by = "name",
      mode = "left",
      ignore_case = FALSE, 
      method = method, 
      max_dist = max_dist, 
      distance_col = "dist"
    ) %>%
    arrange(dist)
  
  name_mapping <- NULL
  while (nrow(fj) > 0 && !all(b_name_df$name %in% name_mapping$name.y)) {
    name_mapping <- bind_rows(name_mapping, fj %>% slice(1))
    
    fj <- fj %>% filter(!name.x %in% name_mapping$name.x, !name.y %in% name_mapping$name.y)
  }
  
  new_names <- setNames(name_mapping$name.y, name_mapping$name.x)
  
  b_renamed <- rename(b, new_names[!is.na(new_names)])
  
  enframe(new_names, name = "new_name", value = "original_name") %>%
    filter(new_name != original_name, !is.na(new_name)) %>%
    as.data.frame() %>%
    print()
  cat("\n")
  
  bind_rows(a, b_renamed)
}

例如,当我们组合 df1df2 时会发生这种情况。

fuzzy_rowbind(df1, df2)
#>    new_name original_name
#> 1 something    some thing
#> 2   nothing      no thing
#> 
#> # A tibble: 2 x 5
#>   something nothing anything number4   XXX
#>       <dbl>   <dbl>    <dbl>   <dbl> <dbl>
#> 1         1       2        3       4    NA
#> 2         1       2       NA       4    99

接下来,定义 fuzzy_rowbind_all 可以获取数据帧列表并将它们组合在一起。

fuzzy_rowbind_all <- function(l) {
  last(accumulate(l, fuzzy_rowbind))
}

这是在我们的数据框中使用的fuzzy_rowbind_all

fuzzy_rowbind_all(
  lst(df1, df2, df3, df4, df5)
)
#>    new_name original_name
#> 1 something    some thing
#> 2   nothing      no thing
#> 
#>    new_name    original_name
#> 1  anything        anything_
#> 2 something some thing wrong
#> 
#>   new_name original_name
#> 1  nothing      nothingg
#> 2  number4      number_4
#> 
#>   new_name      original_name
#> 1 anything anything happening
#> 
#> # A tibble: 5 x 6
#>   something nothing anything number4   XXX   YYY
#>       <dbl>   <dbl>    <dbl>   <dbl> <dbl> <dbl>
#> 1         1       2        3       4    NA    NA
#> 2         1       2       NA       4    99    NA
#> 3         1       2        4      NA    NA    NA
#> 4         1       2        2       4    NA   100
#> 5         1       2        2       4    NA    NA