在 for 循环中按名称引用列

Question

我在 R 中有一个循环，它循环遍历数据帧 las_ref 中名称与向量 las_names 中的值匹配的那些列。由于原始 las_ref 数据框包含具有重复 incidentid 值的行，我的目标是为每个 incidentid 提取 las_names 中每一列中最常见的值，生成一个新列并使用连接生成数据框 las_ref3，每行中都有最常见的值。

las_ref 的结构：

incidentid incident var1
001             abc   45
002             abc   NA
002              NA   78
003             def   12
004             xyz   NA
004             xyz   10
004             abc   10

las_ref3 的预期结构：

incidentid incident var1 incident-new var1-new
001             abc   45          abc       45
002             abc   NA          abc       78
002              NA   78          abc       78
003             def   12          def       12 
004             xyz   NA          xyz       10
004             xyz   10          xyz       10
004             abc   10          xyz       10

下面是我正在尝试使用的循环。

# Columns for which the most common value per incidentid should be extracted.
las_names <- c("incident","var1")
# Attempted loop; it does not work as written (see the notes on individual lines).
for(i in las_names) {
  # Count how often each value of column i occurs within each incidentid.
  las_ref2 <- las_ref %>%
    group_by(across(c(incidentid, paste(i)))) %>%
    tally() %>%
    # paste(i) is just the column name as a string, which is never NA, so this
    # filter keeps every row instead of dropping NA values of the column.
    filter(!is.na(paste(i))) %>%
    arrange(incidentid, desc(n)) %>%
    # A function call cannot be used as an argument name on the left of `=`;
    # this is the line that raises "unexpected '='". first(paste(i)) would also
    # return the column name itself rather than a value from the column.
    summarize(paste(i,"-new") = first(paste(i)))
  # Restarts from las_ref on every iteration, so columns joined in earlier
  # iterations are not carried over into las_ref3.
  las_ref3 <- las_ref %>%
    # paste() separates its arguments with a space by default ("incident -new").
    left_join(select(las_ref2, incidentid, paste(i,"-new")), by = c("incidentid"))
}

目前似乎有两个问题，我认为都与使用 i 来引用上面示例中的列 incident 和 var1 有关。

首先是 !is.na() 函数没有删除 i 引用的列中的 NA 值，尽管这不会产生错误。第二个问题确实会产生错误，出在 summarize 那一行：运行该行时，我收到 unexpected '=' in 错误。

当我运行循环外的代码时，这两个问题都没有发生，单独指定列名 - 结果按预期出现。由于数据集相当大，我希望使用循环来避免单独处理每一列。

Answer 1

我们可以按以下方式编写您的循环以使其工作：

library(tidyverse)

# Example data: incidentid values repeat and both value columns contain NA.
las_ref <- tribble(
  ~incidentid, ~incident, ~var1,
  "001",       "abc",     45,
  "002",       "abc",     NA,
  "002",       NA,        78,
  "003",       "def",     12,
  "004",       "xyz",     NA,
  "004",       "xyz",     10,
  "004",       "abc",     10
)

# Columns for which the most common value per incidentid is wanted.
las_names <- c("incident", "var1")

# Start from the original data; each pass of the loop appends one
# "<column>-new" column, so the result has to accumulate in las_ref3.
las_ref3 <- las_ref

for (i in las_names) {
  # Per incidentid: the non-NA value of column i with the highest count.
  las_ref2 <- las_ref %>%
    # all_of() selects columns from a character vector of names.
    group_by(across(all_of(c("incidentid", i)))) %>%
    tally() %>%
    # .data[[i]] looks the column up by the name stored in i; a bare string
    # would be treated as a constant, not as a column.
    filter(!is.na(.data[[i]])) %>%
    arrange(incidentid, desc(n)) %>%
    # The data is still grouped by incidentid after tally(), so first() picks
    # the top-ranked value of each incidentid. "{i}-new" := builds the output
    # column name from i.
    summarize("{i}-new" := first(.data[[i]]))

  # Attach the new column to every row that shares the incidentid.
  las_ref3 <- las_ref3 %>%
    left_join(
      select(las_ref2, incidentid, all_of(paste0(i, "-new"))),
      by = "incidentid"
    )
}

las_ref3
#> # A tibble: 7 x 5
#>   incidentid incident  var1 `incident-new` `var1-new`
#>   <chr>      <chr>    <dbl> <chr>               <dbl>
#> 1 001        abc         45 abc                    45
#> 2 002        abc         NA abc                    78
#> 3 002        <NA>        78 abc                    78
#> 4 003        def         12 def                    12
#> 5 004        xyz         NA xyz                    10
#> 6 004        xyz         10 xyz                    10
#> 7 004        abc         10 xyz                    10

我们也可以在 mutate 中使用 purrr::map_dfc 和 purrr::set_names：

# For every incidentid group, add one "<column>-new" column per entry of
# las_names, holding the value that occurs most often in that column.
# NOTE(review): no explicit NA filter is applied here, so NA is counted like
# any other value -- confirm the result when NA ties with or outnumbers a value.
las_ref %>% 
  group_by(incidentid) %>% 
  # set_names() names each element "<column>-new"; map_dfc() column-binds the
  # per-column results, so those names become the new column names.
  mutate(map_dfc(set_names(las_names, paste0(las_names, "-new")),
              # Count the occurrences of each value of column .x in the
              # current group ...
              ~ count(cur_data_all(), "{.x}" := eval(sym(.x))) %>%
                # ... move the highest count to the top ...
                arrange(desc(n)) %>%
                # ... keep only that first row ...
                slice_head() %>%
                # ... and return its value for column .x as a vector.
                pull(eval(sym(.x)))
  )
  )

#> # A tibble: 7 x 5
#> # Groups:   incidentid [4]
#>   incidentid incident  var1 `incident-new` `var1-new`
#>   <chr>      <chr>    <dbl> <chr>               <dbl>
#> 1 001        abc         45 abc                    45
#> 2 002        abc         NA abc                    78
#> 3 002        <NA>        78 abc                    78
#> 4 003        def         12 def                    12
#> 5 004        xyz         NA xyz                    10
#> 6 004        xyz         10 xyz                    10
#> 7 004        abc         10 xyz                    10

我在 GitHub 上有一个包 {dplyover}，它通过 over 函数及其 .names 参数使上述方法更简单：

library(dplyover) # https://github.com/TimTeaFan/dplyover

# For every incidentid group, add one "<column>-new" column per entry of
# las_names, holding the value that occurs most often in that column.
las_ref %>% 
  group_by(incidentid) %>% 
  # over() runs the formula once per element of las_names (available as .x).
  mutate(over(las_names,
              # Count the occurrences of each value of column .x in the
              # current group ...
              ~ count(cur_data_all(), "{.x}" := eval(sym(.x))) %>%
                # ... move the highest count to the top ...
                arrange(desc(n)) %>%
                # ... keep only that first row ...
                slice_head() %>%
                # ... and return its value for column .x as a vector.
                pull(eval(sym(.x))),
              # Name each output column after the input element plus "-new".
              .names = "{x}-new")
  )

#> # A tibble: 7 x 5
#> # Groups:   incidentid [4]
#>   incidentid incident  var1 `incident-new` `var1-new`
#>   <chr>      <chr>    <dbl> <chr>               <dbl>
#> 1 001        abc         45 abc                    45
#> 2 002        abc         NA abc                    78
#> 3 002        <NA>        78 abc                    78
#> 4 003        def         12 def                    12
#> 5 004        xyz         NA xyz                    10
#> 6 004        xyz         10 xyz                    10
#> 7 004        abc         10 xyz                    10

由 reprex 包 (v2.0.1) 创建于 2022-02-11

Answer 2

避免手动循环的版本：

library(tidyverse)

# Example data: incidentid values repeat and both value columns contain NA.
las_ref <- tribble(
  ~incidentid, ~incident, ~var1,
  "001",       "abc",     45,
  "002",       "abc",     NA,
  "002",       NA,        78,
  "003",       "def",     12,
  "004",       "xyz",     NA,
  "004",       "xyz",     10,
  "004",       "abc",     10
)

# For each incidentid, add "<column>-new" columns holding the most frequent
# non-NA value of incident and var1.
las_ref3 <- las_ref %>%
  group_by(incidentid) %>%
  mutate(
    across(
      c(incident, var1),
      # table() tabulates the values without NA (useNA = "no"); after sorting
      # by decreasing count, the first name is the most frequent value.
      # names() returns character, so the new columns are character even when
      # the source column is numeric (var1).
      ~ names(sort(table(.x, useNA = "no"), decreasing = TRUE))[1],
      .names = "{.col}-new"
    )
  ) %>%
  # Drop the grouping so later operations on las_ref3 are not silently grouped.
  ungroup()

  incidentid incident  var1 incident-new var1-new
  <chr>      <chr>    <dbl> <chr>        <chr>   
1 001        abc         45 abc          45      
2 002        abc         NA abc          78      
3 002        NA          78 abc          78      
4 003        def         12 def          12      
5 004        xyz         NA xyz          10      
6 004        xyz         10 xyz          10      
7 004        abc         10 xyz          10

在 for 循环中按名称引用列

Referring to columns by name in for loop

for-loop

r

dplyr