在 R 中，对 data.frames 中的 select 特定变量使用非标准评估

Question

我有几个类似关系数据库的大型 data.frames 设置，我想创建一个函数来查找我需要的任何变量并从特定的 data.frame 并将其添加到我目前正在处理的 data.frame 中。我有一种方法可以做到这一点，但它需要临时列出所有 data.frames，这似乎效率低下。我怀疑非标准评估会为我解决这个问题，但我不确定该怎么做。

以下是有效但似乎效率低下的方法：

Table1 <- data.frame(ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10),
                     ColC = rnorm(10))

Table2 <- data.frame(ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10),
                     ColF = rnorm(10))

Table3 <- data.frame(ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10),
                     ColI = rnorm(10))

Key <- data.frame(Table = rep(c("Table1", "Table2", "Table3"), each = 4),
                  ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
                                 "ID", paste0("Col", LETTERS[4:6]),
                                 "ID", paste0("Col", LETTERS[7:9])))

# function for grabbing info from other tables
grab <- function(StartDF, ColNames){

      AllDFs <- list(Table1, Table2, Table3)
      names(AllDFs) <- c("Table1", "Table2", "Table3")

      # Determine which data.frames have that column
      WhichDF <- Key %>% filter(ColumnName %in% ColNames) %>% 
            select(Table)

      TempDF <- StartDF

      for(i in 1:length(ColNames)){
            ToAdd <- AllDFs[WhichDF[i, 1]]
            ToAdd <- ToAdd[[1]] %>% 
                  select(c(ColNames[i], ID))

            TempDF <- TempDF %>% left_join(ToAdd)
            rm(ToAdd)
      }

      return(TempDF)


}

grab(Table1, c("ColE", "ColH"))

最好是这样的：

grab <- function(StartDF, ColNames){

      # Some function that returns the column names of all the data.frames
      # without me creating a new object that is a list of them

      # Some function that left_joins the correct data.frame plus the column
      # "ID" to my starting data.frame, again without needing to create that list 
      # of all the data.frames

}

Answer 1

无需手动创建 list，我们可以使用 mget[=15 直接获取从 'Key' 数据集的 'Table' 列返回的对象的值=]

library(dplyr)
library(purrr)
grab <- function(StartDF, ColNames){



     # filter the rows of Key based on the ColNames input
     # pull the Table column as a vector
     # column was factor, so convert to character class
     # return the value of the objects with mget in a list
     Tables <- Key %>% 
               filter(ColumnName %in% ColNames) %>% 
               pull(Table) %>%
               as.character %>%
               mget(envir = .GlobalEnv) 


      TempDF <- StartDF

      # use the same left_joins in a loop after selecting only the
      # ID and corresponding columns from 'ColNames'
      for(i in seq_along(ColNames)){
            ToAdd  <- Tables[[i]] %>%
                         select(ColNames[i], ID)          

            TempDF <- TempDF %>% 
                  left_join(ToAdd)
            rm(ToAdd)
      }

      TempDF


}

grab(Table1, c("ColE", "ColH"))

或者另一种选择是reduce

grab <- function(StartDF, ColNames) {
     #only change is that instead of a for loop
     # use reduce with left_join after selecting the corresponding columns
     # with map
     Key %>%
       filter(ColumnName %in% ColNames) %>% 
       pull(Table) %>%
       as.character %>%
       mget(envir = .GlobalEnv)  %>%
       map2(ColNames, ~ .x %>%
                     select(ID, .y)) %>%
       append(list(Table1), .)  %>%
       reduce(left_join)

   }

grab(Table1, c("ColE", "ColH"))
#   ID       ColA       ColB        ColC        ColE        ColH
#1   A -0.9490093  0.5177143 -1.91015491  0.07777086  1.86277670
#2   B -0.7182786 -1.1019146 -0.70802738 -0.73965230  0.18375660
#3   C  0.5064516 -1.6904354  1.11106206  2.04315508 -0.65365228
#4   D  0.9362477  0.5260682 -0.03419651 -0.51628310 -1.17104181
#5   E  0.5636047 -0.9470895  0.43303304 -2.95928629  1.86425049
#6   F  1.0598531  0.4144901  0.10239896  1.57681703 -0.05382603
#7   G  1.1335047 -0.8282173 -0.28327898  2.02917831  0.50768462
#8   H  0.2941341  0.3261185 -0.15528127 -0.46470035 -0.86561320
#9   I -2.1434905  0.6567689  0.02298549  0.90822132  0.64360337
#10  J  0.4291258  1.3410147  0.67544567  0.12466251  0.75989623

Answer 2

已接受的解决方案中存在严重错误。如果您不注意 ColNames 参数中的顺序，则该函数将不起作用。此外，我重新定义了您的数据以改为使用 tibbles。它们基本上与数据框相同，但它们的默认设置更好（例如，您不需要 StringsAsFactors = FALSE）

library(tidyverse)

Table1 <- tibble(
  ID = LETTERS[1:10], ColA = rnorm(10), ColB = rnorm(10), ColC = rnorm(10)
)
Table2 <- tibble(
  ID = LETTERS[1:10], ColD = rnorm(10), ColE = rnorm(10), ColF = rnorm(10)
)
Table3 <- tibble(
  ID = LETTERS[1:10], ColG = rnorm(10), ColH = rnorm(10), ColI = rnorm(10)
)

Key <- tibble(
  Table = rep(c("Table1", "Table2", "Table3"), each = 4),
  ColumnName = c("ID", paste0("Col", LETTERS[1:3]),
                 "ID", paste0("Col", LETTERS[4:6]),
                 "ID", paste0("Col", LETTERS[7:9]))
)

grab_akrun <- function(StartDF, ColNames) {
  #only change is that instead of a for loop
  # use reduce with left_join after selecting the corresponding columns
  # with map
  Key %>%
    filter(ColumnName %in% ColNames) %>% 
    pull(Table) %>%
    as.character %>%
    mget(envir = .GlobalEnv)  %>%
    map2(ColNames, ~ .x %>%
           select(ID, .y)) %>%
    append(list(Table1), .)  %>%
    reduce(left_join)

}

grab_akrun(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#>    ID      ColA   ColB   ColC   ColE   ColH
#>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 A     -0.658 -0.613  0.689 -0.850 -0.795
#>  2 B      0.143  0.732 -0.212 -1.74   1.99 
#>  3 C     -0.966 -0.570 -0.354  0.559 -1.11 
#>  4 D     -1.05   0.269 -0.856 -0.370 -1.35 
#>  5 E      0.255 -0.349  0.329  1.39   0.421
#>  6 F      1.51   1.38   0.707 -0.639  0.289
#>  7 G     -1.28   1.44  -1.35   1.94  -1.04 
#>  8 H     -1.56  -0.434  0.231  0.467  0.656
#>  9 I     -0.553 -1.64  -0.761  0.133  0.249
#> 10 J     -0.950  0.418 -0.843  0.593  0.343

这可行，但如果您更改顺序：

grab_akrun(Table1, c("ColH", "ColE"))
#> Error: Unknown column `ColH`

相反，您应该这样处理：

grab_new <- function(StartDF, ColNames) {
  Key %>% 
    filter(ColumnName %in% ColNames) %>% 
    pluck("Table") %>%
    mget(inherits = TRUE) %>% 
    map(~select(.x, ID, intersect(colnames(.x), ColNames))) %>% 
    reduce(left_join, .init = StartDF)
}

grab_new(Table1, c("ColE", "ColH"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#>    ID      ColA   ColB   ColC   ColE   ColH
#>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 A     -0.658 -0.613  0.689 -0.850 -0.795
#>  2 B      0.143  0.732 -0.212 -1.74   1.99 
#>  3 C     -0.966 -0.570 -0.354  0.559 -1.11 
#>  4 D     -1.05   0.269 -0.856 -0.370 -1.35 
#>  5 E      0.255 -0.349  0.329  1.39   0.421
#>  6 F      1.51   1.38   0.707 -0.639  0.289
#>  7 G     -1.28   1.44  -1.35   1.94  -1.04 
#>  8 H     -1.56  -0.434  0.231  0.467  0.656
#>  9 I     -0.553 -1.64  -0.761  0.133  0.249
#> 10 J     -0.950  0.418 -0.843  0.593  0.343
grab_new(Table1, c("ColH", "ColE"))
#> Joining, by = "ID"Joining, by = "ID"
#> # A tibble: 10 x 6
#>    ID      ColA   ColB   ColC   ColE   ColH
#>    <chr>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
#>  1 A     -0.658 -0.613  0.689 -0.850 -0.795
#>  2 B      0.143  0.732 -0.212 -1.74   1.99 
#>  3 C     -0.966 -0.570 -0.354  0.559 -1.11 
#>  4 D     -1.05   0.269 -0.856 -0.370 -1.35 
#>  5 E      0.255 -0.349  0.329  1.39   0.421
#>  6 F      1.51   1.38   0.707 -0.639  0.289
#>  7 G     -1.28   1.44  -1.35   1.94  -1.04 
#>  8 H     -1.56  -0.434  0.231  0.467  0.656
#>  9 I     -0.553 -1.64  -0.761  0.133  0.249
#> 10 J     -0.950  0.418 -0.843  0.593  0.343

按预期工作。

^{由 reprex package (v0.3.0)}

于 2020-01-21 创建

在 R 中，对 data.frames 中的 select 特定变量使用非标准评估

In R, use nonstandard evaluation to select specific variables from data.frames

evaluation

r

tidyverse