如何根据某些条件优化粘贴 single/multiple 列名称及其值

Question

我想粘贴列名及其值。它必须基于某种条件（if语句），可以基于单个变量，也可以基于多个变量。

下面是一个小例子，展示了数据的样子。我想加快这个过程并获得与 fun2、fun3 和 fun4 相同的结果。

为了尽可能简单，如果 a、b、c 和 d 列的值大于零，则只有一个规则设置为缺失。但是，我留下了规则的名称，因为它可以不同，例如“规则 1”> 0 和“规则 2”如果不丢失。

library("data.table")
library("tidytable")
library("glue")
library("stringi")
library("benchr")

dat <- data.table(id = 1:10,
                  t1 = rnorm(10),
                  t2 = rnorm(10),
                  a  = c(0, NA,  0,  1,  0, NA,  1,  1,  0, 1),
                  b  = c(0, NA, NA,  0,  1,  0,  1, NA,  1, 1),
                  c  = c(0, NA,  0, NA,  0,  1, NA,  1,  1, 1),
                  d  = c(0, NA,  1,  1,  0,  1,  0,  1, NA, 1),
                  re = "")

这是数据的样子：

id         t1         t2  a  b  c  d re
 1  0.6883367 -0.3454049  0  0  0  0 '' 
 2 -1.0653127 -1.3035077 NA NA NA NA '' 
 3  0.5210550  0.8489376  0 NA  0  1 '' 
 4  0.3697369 -0.1135827  1  0 NA  1 '' 
 5  1.3195759 -1.5431305  0  1  0  0 '' 
 6 -0.2106836 -0.3421900 NA  0  1  1 '' 
 7 -0.2258871 -2.1644697  1  1 NA  0 '' 
 8 -0.7132686  1.7673775  1 NA  1  1 '' 
 9  0.9467068  1.8188665  0  1  1 NA '' 
10 -0.3900479  1.7306935  1  1  1  1 ''

波纹管是所需的输出。这个想法是保留一个列，其中包含一些值被设置为缺失的原因的描述。在此示例中，只有前两个人同时拥有 t1 和 t2 的记录。个体 1、2、3 有 t1 的记录，而个体 1、2、5、7、9 有 t2 的记录。

id       t1     t2     a     b     c     d    re                                      
 1  -0.182   1.43      0     0     0     0   ""                                      
 2  -1.31    0.733    NA    NA    NA    NA   ""                                      
 3  -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1);"                       
 4  NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1);"        
 5  NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
 6  NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1);"        
 7  NA      -0.345     1     1    NA     0   "Rule1:t1(a=1 b=1); "                   
 8  NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 c=1);"   
 9  NA      -1.22      0     1     1    NA   "Rule1:t1(b=1 c=1); "                   
10  NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1);"

第一次尝试 (fun1)。不是预期的结果，因为它在 mutate 中寻找单个空格。所有其他函数（fun2、fun3 和 fun4）打印正确的结果。

fun1 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id    t1     t2     a     b     c     d    re                                      
<int> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
   1    NA   1.43     0     0     0     0   "Rule1:t1(  ); "                        
   2    NA   0.733   NA    NA    NA    NA   "Rule1:t1(  ); "                        
   3    NA  NA        0    NA     0     1   "Rule2:t2(d=1); Rule1:t1(  ); "         
   4    NA  NA        1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1  ); "      
   5    NA   1.78     0     1     0     0   "Rule1:t1( b=1 ); "                     
   6    NA  NA       NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(  c=1); "      
   7    NA  -0.345    1     1    NA     0   "Rule1:t1(a=1 b=1 ); "                  
   8    NA  NA        1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
   9    NA  -1.22     0     1     1    NA   "Rule1:t1( b=1 c=1); "                  
  10    NA  NA        1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

函数 2 (fun2) 使用“trimws”。

fun2 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := trimws(do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id      t1     t2     a     b     c     d    re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
   1 -0.182   1.43      0     0     0     0   ""                                      
   2 -1.31    0.733    NA    NA    NA    NA   ""                                      
   3 -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1); "                       
   4 NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1); "        
   5 NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
   6 NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1); "        
   7 NA      -0.345     1     1    NA     0   "Rule1:t1(a=1 b=1); "                   
   8 NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
   9 NA      -1.22      0     1     1    NA   "Rule1:t1(b=1 c=1); "                   
  10 NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

函数 3 (fun3) 使用带正则表达式的“gsub”。

fun3 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := gsub("\s+","", do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id      t1     t2     a     b     c     d    re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
  1 -0.182   1.43      0     0     0     0   ""                                      
  2 -1.31    0.733    NA    NA    NA    NA   ""                                      
  3 -0.0613 NA         0    NA     0     1   "Rule2:t2(d=1); "                       
  4 NA      NA         1     0    NA     1   "Rule2:t2(d=1); Rule1:t1(a=1); "        
  5 NA       1.78      0     1     0     0   "Rule1:t1(b=1); "                       
  6 NA      NA        NA     0     1     1   "Rule2:t2(d=1); Rule1:t1(c=1); "        
  7 NA      -0.345     1     1    NA     0   "Rule1:t1(a=1b=1); "                   
  8 NA      NA         1    NA     1     1   "Rule2:t2(d=1); Rule1:t1(a=1c=1); "   
  9 NA      -1.22      0     1     1    NA   "Rule1:t1(b=1c=1); "                   
 10 NA      NA         1     1     1     1   "Rule2:t2(d=1); Rule1:t1(a=1b=1c=1); "

函数 4 (fun4) 使用 stri_detect inside mutate with regular expression.

fun4 <- function(tbl) {
  lhs0 <- c("t1", "t2")
  rhs0 <- list(c("a", "b", "c"), "d")
  rul0 <- c("Rule1", "Rule2")
  for (i in 1:length(lhs0)) {
    lhs <- lhs0[i]
    rhs <- rhs0[[i]]
    rul <- rul0[i]
    tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
    tbl <- tbl %>%
      mutate.(
        re = case_when.(!stri_detect(aux, regex = "[[:alpha:]]") ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
        !!lhs := !!rlang::parse_expr(glue("case_when.(!stri_detect(aux, regex = '[[:alpha:]]') ~ {lhs}, TRUE ~ NA_real_)"))
      ) %>%
      select.(-aux)
  }
  return(tbl)
}

  id      t1     t2     a     b     c     d re                                      
<int>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>                                   
   1 -0.182   1.43      0     0     0     0 ""                                      
   2 -1.31    0.733    NA    NA    NA    NA ""                                      
   3 -0.0613 NA         0    NA     0     1 "Rule2:t2(d=1); "                       
   4 NA      NA         1     0    NA     1 "Rule2:t2(d=1); Rule1:t1(a=1  ); "      
   5 NA       1.78      0     1     0     0 "Rule1:t1( b=1 ); "                     
   6 NA      NA        NA     0     1     1 "Rule2:t2(d=1); Rule1:t1(  c=1); "      
   7 NA      -0.345     1     1    NA     0 "Rule1:t1(a=1 b=1 ); "                  
   8 NA      NA         1    NA     1     1 "Rule2:t2(d=1); Rule1:t1(a=1  c=1); "   
   9 NA      -1.22      0     1     1    NA "Rule1:t1( b=1 c=1); "                  
  10 NA      NA         1     1     1     1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "

具有更多数据的基准

n <- 200000
dat <- data.table(id = 1:n,
                  t1 = rnorm(n),
                  t2 = rnorm(n),
                  a  = sample(c(0, NA, 1), n, replace = TRUE),
                  b  = sample(c(0, NA, 1), n, replace = TRUE),
                  c  = sample(c(0, NA, 1), n, replace = TRUE),
                  d  = sample(c(0, NA, 1), n, replace = TRUE),
                  re = "")

benchmark(fun1(dat),
          fun2(dat),
          fun3(dat),
          fun4(dat))

Benchmark summary:
  Time units : milliseconds 
     expr n.eval min lw.qu median mean up.qu  max total relative
fun1(dat)    100 642   653    660  668   666  774 66800     1.00
fun2(dat)    100 742   756    763  773   768  874 77300     1.16
fun3(dat)    100 765   779    785  794   791  903 79400     1.19
fun4(dat)    100 743   756    763  777   770 1010 77700     1.16

有没有人知道如何加快这个过程？

谢谢。

Answer 1

首先，我承认我没能击败基准测试（感谢挑战）。可能有一些方法可以提高它的速度，但让我推荐一种方法，它可以做同样的事情（小数据更快，大数据大致相同）但支持每条规则的功能。这不是你直接问的，但你暗示了每个规则的不同功能。

（我已经 更新了 代码，感谢 @Cole 找到了我早期探索的残余。）

RULES <- list(
  Rule1 = list(
    rule = "Rule1",
    lhs = "t1",
    rhs = c("a", "b", "c"),
    fun = function(z) !is.na(z) & z > 0
  ),
  Rule2 = list(
    rule = "Rule2",
    lhs = "t2",
    rhs = "d",
    fun = is.na
    )
)

fun9 <- function(dat, RULES = list()) {
  nr <- nrow(dat)
  # RE <- lapply(seq_along(RULES), function(ign) rep("", nr))
  RE <- asplit(matrix("", nrow = length(RULES), ncol = nr), 1)
  for (r in seq_along(RULES)) {
    fun <- RULES[[r]]$fun
    lhs <- RULES[[r]]$lhs
    for (rhs in RULES[[r]]$rhs) {
      lgl <- do.call(fun, list(dat[[rhs]]))
      set(dat, which(lgl), lhs, NA)
      RE[[r]][lgl] <- sprintf("%s %s=1", RE[[r]][lgl], rhs)
    }
    ind <- nzchar(RE[[r]])
    RE[[r]][ind] <- sprintf("%s:%s(%s)", RULES[[r]]$rule, lhs, RE[[r]][ind])
  }
  set(dat, j = "re", value = do.call(paste, c(RE, sep = ";")))
}

RULES和使用fun9的前提应该是不言而喻的。

用小数据进行基准测试似乎很有希望：

set.seed(2021)
dat <- data.table(id = 1:10,
                  t1 = rnorm(10),
                  t2 = rnorm(10),
                  a  = c(0, NA,  0,  1,  0, NA,  1,  1,  0, 1),
                  b  = c(0, NA, NA,  0,  1,  0,  1, NA,  1, 1),
                  c  = c(0, NA,  0, NA,  0,  1, NA,  1,  1, 1),
                  d  = c(0, NA,  1,  1,  0,  1,  0,  1, NA, 1),
                  re = "")
fun9(dat, RULES)[]
#        id         t1         t2     a     b     c     d                                re
#     <int>      <num>      <num> <num> <num> <num> <num>                            <char>
#  1:     1 -0.1224600 -1.0822049     0     0     0     0                                 ;
#  2:     2  0.5524566         NA    NA    NA    NA    NA                   ;Rule2:t2( d=1)
#  3:     3  0.3486495  0.1819954     0    NA     0     1                                 ;
#  4:     4         NA  1.5085418     1     0    NA     1                   Rule1:t1( a=1);
#  5:     5         NA  1.6044701     0     1     0     0                   Rule1:t1( b=1);
#  6:     6         NA -1.8414756    NA     0     1     1                   Rule1:t1( c=1);
#  7:     7         NA  1.6233102     1     1    NA     0               Rule1:t1( a=1 b=1);
#  8:     8         NA  0.1313890     1    NA     1     1               Rule1:t1( a=1 c=1);
#  9:     9         NA         NA     0     1     1    NA Rule1:t1( b=1 c=1);Rule2:t2( d=1)
# 10:    10         NA  1.5133183     1     1     1     1           Rule1:t1( a=1 b=1 c=1);

bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# # A tibble: 2 x 13
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                  time             gc                  
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                  <list>           <list>              
# 1 fun4(dat)          9.52ms   11.1ms      88.5     316KB     2.06    43     1      486ms <NULL> <Rprofmem[,3] [84 x 3]> <bch:tm [44]>    <tibble [44 x 3]>   
# 2 fun9(dat, RULES)   97.5us  113.5us    7760.       416B     6.24  3731     3      481ms <NULL> <Rprofmem[,3] [2 x 3]>  <bch:tm [3,734]> <tibble [3,734 x 3]>

刚刚从 `itr/sec` 开始，这个 fun9 看起来有点快。

有更大的数据：

set.seed(2021)
n <- 200000
dat <- data.table(id = 1:n,
                  t1 = rnorm(n),
                  t2 = rnorm(n),
                  a  = sample(c(0, NA, 1), n, replace = TRUE),
                  b  = sample(c(0, NA, 1), n, replace = TRUE),
                  c  = sample(c(0, NA, 1), n, replace = TRUE),
                  d  = sample(c(0, NA, 1), n, replace = TRUE),
                  re = "")
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory                   time         gc              
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>                   <list>       <list>          
# 1 fun4(dat)           1.24s    1.24s     0.806    62.9MB     1.61     1     2      1.24s <NULL> <Rprofmem[,3] [150 x 3]> <bch:tm [1]> <tibble [1 x 3]>
# 2 fun9(dat, RULES) 296.11ms  315.4ms     3.17     53.8MB     4.76     2     3    630.8ms <NULL> <Rprofmem[,3] [70 x 3]>  <bch:tm [2]> <tibble [2 x 3]>

虽然此解决方案不使用 tidytable 或其流程，但速度更快。 re 的清理是另一个步骤，可能会将这种速度降低到致命的水平:-)。

旁注：我试图使用 lapply、mget 和其他技巧在 data.table 数据环境中做事，但最终，使用 data.table::set () 和简单向量似乎是最快的。

如何根据某些条件优化粘贴 single/multiple 列名称及其值

How to optimize pasting single/multiple column names with its values based on some condition

r

dplyr

data.table

tidyverse

tidytable