如何根据某些条件优化粘贴 single/multiple 列名称及其值
How to optimize pasting single/multiple column names with its values based on some condition
我想粘贴列名及其值。它必须基于某种条件(if语句),可以基于单个变量,也可以基于多个变量。
下面是一个小例子,展示了数据的样子。我想加快这个过程并获得与 fun2、fun3 和 fun4 相同的结果。
为了尽可能简单,如果 a、b、c 和 d 列的值大于零,则只有一个规则设置为缺失。但是,我留下了规则的名称,因为它可以不同,例如“规则 1”> 0 和“规则 2”如果不丢失。
library("data.table")
library("tidytable")
library("glue")
library("stringi")
library("benchr")
dat <- data.table(id = 1:10,
t1 = rnorm(10),
t2 = rnorm(10),
a = c(0, NA, 0, 1, 0, NA, 1, 1, 0, 1),
b = c(0, NA, NA, 0, 1, 0, 1, NA, 1, 1),
c = c(0, NA, 0, NA, 0, 1, NA, 1, 1, 1),
d = c(0, NA, 1, 1, 0, 1, 0, 1, NA, 1),
re = "")
这是数据的样子:
id t1 t2 a b c d re
1 0.6883367 -0.3454049 0 0 0 0 ''
2 -1.0653127 -1.3035077 NA NA NA NA ''
3 0.5210550 0.8489376 0 NA 0 1 ''
4 0.3697369 -0.1135827 1 0 NA 1 ''
5 1.3195759 -1.5431305 0 1 0 0 ''
6 -0.2106836 -0.3421900 NA 0 1 1 ''
7 -0.2258871 -2.1644697 1 1 NA 0 ''
8 -0.7132686 1.7673775 1 NA 1 1 ''
9 0.9467068 1.8188665 0 1 1 NA ''
10 -0.3900479 1.7306935 1 1 1 1 ''
波纹管是所需的输出。这个想法是保留一个列,其中包含一些值被设置为缺失的原因的描述。
在此示例中,只有前两个人同时拥有 t1 和 t2 的记录。
个体 1、2、3 有 t1 的记录,而个体 1、2、5、7、9 有 t2 的记录。
id t1 t2 a b c d re
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1);"
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1);"
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1);"
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1);"
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1);"
第一次尝试 (fun1)。不是预期的结果,因为它在 mutate 中寻找单个空格。
所有其他函数(fun2、fun3 和 fun4)打印正确的结果。
fun1 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 NA 1.43 0 0 0 0 "Rule1:t1( ); "
2 NA 0.733 NA NA NA NA "Rule1:t1( ); "
3 NA NA 0 NA 0 1 "Rule2:t2(d=1); Rule1:t1( ); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1 ); "
5 NA 1.78 0 1 0 0 "Rule1:t1( b=1 ); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1( c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1 ); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1( b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
函数 2 (fun2) 使用“trimws”。
fun2 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := trimws(do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1); "
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
函数 3 (fun3) 使用带正则表达式的“gsub”。
fun3 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := gsub("\s+","", do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1); "
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1b=1c=1); "
函数 4 (fun4) 使用 stri_detect inside mutate with regular expression.
fun4 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(!stri_detect(aux, regex = "[[:alpha:]]") ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(!stri_detect(aux, regex = '[[:alpha:]]') ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1 ); "
5 NA 1.78 0 1 0 0 "Rule1:t1( b=1 ); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1( c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1 ); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1( b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
具有更多数据的基准
n <- 200000
dat <- data.table(id = 1:n,
t1 = rnorm(n),
t2 = rnorm(n),
a = sample(c(0, NA, 1), n, replace = TRUE),
b = sample(c(0, NA, 1), n, replace = TRUE),
c = sample(c(0, NA, 1), n, replace = TRUE),
d = sample(c(0, NA, 1), n, replace = TRUE),
re = "")
benchmark(fun1(dat),
fun2(dat),
fun3(dat),
fun4(dat))
Benchmark summary:
Time units : milliseconds
expr n.eval min lw.qu median mean up.qu max total relative
fun1(dat) 100 642 653 660 668 666 774 66800 1.00
fun2(dat) 100 742 756 763 773 768 874 77300 1.16
fun3(dat) 100 765 779 785 794 791 903 79400 1.19
fun4(dat) 100 743 756 763 777 770 1010 77700 1.16
有没有人知道如何加快这个过程?
谢谢。
首先,我承认我没能击败基准测试(感谢挑战)。可能有一些方法可以提高它的速度,但让我推荐一种方法,它可以做同样的事情(小数据更快,大数据大致相同)但支持每条规则的功能。这不是你直接问的,但你暗示了每个规则的不同功能。
(我已经 更新了 代码,感谢 @Cole 找到了我早期探索的残余。)
RULES <- list(
Rule1 = list(
rule = "Rule1",
lhs = "t1",
rhs = c("a", "b", "c"),
fun = function(z) !is.na(z) & z > 0
),
Rule2 = list(
rule = "Rule2",
lhs = "t2",
rhs = "d",
fun = is.na
)
)
fun9 <- function(dat, RULES = list()) {
nr <- nrow(dat)
# RE <- lapply(seq_along(RULES), function(ign) rep("", nr))
RE <- asplit(matrix("", nrow = length(RULES), ncol = nr), 1)
for (r in seq_along(RULES)) {
fun <- RULES[[r]]$fun
lhs <- RULES[[r]]$lhs
for (rhs in RULES[[r]]$rhs) {
lgl <- do.call(fun, list(dat[[rhs]]))
set(dat, which(lgl), lhs, NA)
RE[[r]][lgl] <- sprintf("%s %s=1", RE[[r]][lgl], rhs)
}
ind <- nzchar(RE[[r]])
RE[[r]][ind] <- sprintf("%s:%s(%s)", RULES[[r]]$rule, lhs, RE[[r]][ind])
}
set(dat, j = "re", value = do.call(paste, c(RE, sep = ";")))
}
RULES
和使用fun9
的前提应该是不言而喻的。
用小数据进行基准测试似乎很有希望:
set.seed(2021)
dat <- data.table(id = 1:10,
t1 = rnorm(10),
t2 = rnorm(10),
a = c(0, NA, 0, 1, 0, NA, 1, 1, 0, 1),
b = c(0, NA, NA, 0, 1, 0, 1, NA, 1, 1),
c = c(0, NA, 0, NA, 0, 1, NA, 1, 1, 1),
d = c(0, NA, 1, 1, 0, 1, 0, 1, NA, 1),
re = "")
fun9(dat, RULES)[]
# id t1 t2 a b c d re
# <int> <num> <num> <num> <num> <num> <num> <char>
# 1: 1 -0.1224600 -1.0822049 0 0 0 0 ;
# 2: 2 0.5524566 NA NA NA NA NA ;Rule2:t2( d=1)
# 3: 3 0.3486495 0.1819954 0 NA 0 1 ;
# 4: 4 NA 1.5085418 1 0 NA 1 Rule1:t1( a=1);
# 5: 5 NA 1.6044701 0 1 0 0 Rule1:t1( b=1);
# 6: 6 NA -1.8414756 NA 0 1 1 Rule1:t1( c=1);
# 7: 7 NA 1.6233102 1 1 NA 0 Rule1:t1( a=1 b=1);
# 8: 8 NA 0.1313890 1 NA 1 1 Rule1:t1( a=1 c=1);
# 9: 9 NA NA 0 1 1 NA Rule1:t1( b=1 c=1);Rule2:t2( d=1)
# 10: 10 NA 1.5133183 1 1 1 1 Rule1:t1( a=1 b=1 c=1);
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 fun4(dat) 9.52ms 11.1ms 88.5 316KB 2.06 43 1 486ms <NULL> <Rprofmem[,3] [84 x 3]> <bch:tm [44]> <tibble [44 x 3]>
# 2 fun9(dat, RULES) 97.5us 113.5us 7760. 416B 6.24 3731 3 481ms <NULL> <Rprofmem[,3] [2 x 3]> <bch:tm [3,734]> <tibble [3,734 x 3]>
刚刚从 `itr/sec`
开始,这个 fun9
看起来有点快。
有更大的数据:
set.seed(2021)
n <- 200000
dat <- data.table(id = 1:n,
t1 = rnorm(n),
t2 = rnorm(n),
a = sample(c(0, NA, 1), n, replace = TRUE),
b = sample(c(0, NA, 1), n, replace = TRUE),
c = sample(c(0, NA, 1), n, replace = TRUE),
d = sample(c(0, NA, 1), n, replace = TRUE),
re = "")
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 fun4(dat) 1.24s 1.24s 0.806 62.9MB 1.61 1 2 1.24s <NULL> <Rprofmem[,3] [150 x 3]> <bch:tm [1]> <tibble [1 x 3]>
# 2 fun9(dat, RULES) 296.11ms 315.4ms 3.17 53.8MB 4.76 2 3 630.8ms <NULL> <Rprofmem[,3] [70 x 3]> <bch:tm [2]> <tibble [2 x 3]>
虽然此解决方案不使用 tidytable
或其流程,但速度更快。 re
的清理是另一个步骤,可能会将这种速度降低到致命的水平:-)。
旁注:我试图使用 lapply
、mget
和其他技巧在 data.table
数据环境中做事,但最终,使用 data.table::set
() 和简单向量似乎是最快的。
我想粘贴列名及其值。它必须基于某种条件(if语句),可以基于单个变量,也可以基于多个变量。
下面是一个小例子,展示了数据的样子。我想加快这个过程并获得与 fun2、fun3 和 fun4 相同的结果。
为了尽可能简单,如果 a、b、c 和 d 列的值大于零,则只有一个规则设置为缺失。但是,我留下了规则的名称,因为它可以不同,例如“规则 1”> 0 和“规则 2”如果不丢失。
library("data.table")
library("tidytable")
library("glue")
library("stringi")
library("benchr")
dat <- data.table(id = 1:10,
t1 = rnorm(10),
t2 = rnorm(10),
a = c(0, NA, 0, 1, 0, NA, 1, 1, 0, 1),
b = c(0, NA, NA, 0, 1, 0, 1, NA, 1, 1),
c = c(0, NA, 0, NA, 0, 1, NA, 1, 1, 1),
d = c(0, NA, 1, 1, 0, 1, 0, 1, NA, 1),
re = "")
这是数据的样子:
id t1 t2 a b c d re
1 0.6883367 -0.3454049 0 0 0 0 ''
2 -1.0653127 -1.3035077 NA NA NA NA ''
3 0.5210550 0.8489376 0 NA 0 1 ''
4 0.3697369 -0.1135827 1 0 NA 1 ''
5 1.3195759 -1.5431305 0 1 0 0 ''
6 -0.2106836 -0.3421900 NA 0 1 1 ''
7 -0.2258871 -2.1644697 1 1 NA 0 ''
8 -0.7132686 1.7673775 1 NA 1 1 ''
9 0.9467068 1.8188665 0 1 1 NA ''
10 -0.3900479 1.7306935 1 1 1 1 ''
波纹管是所需的输出。这个想法是保留一个列,其中包含一些值被设置为缺失的原因的描述。 在此示例中,只有前两个人同时拥有 t1 和 t2 的记录。 个体 1、2、3 有 t1 的记录,而个体 1、2、5、7、9 有 t2 的记录。
id t1 t2 a b c d re
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1);"
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1);"
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1);"
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1);"
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1);"
第一次尝试 (fun1)。不是预期的结果,因为它在 mutate 中寻找单个空格。 所有其他函数(fun2、fun3 和 fun4)打印正确的结果。
fun1 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 NA 1.43 0 0 0 0 "Rule1:t1( ); "
2 NA 0.733 NA NA NA NA "Rule1:t1( ); "
3 NA NA 0 NA 0 1 "Rule2:t2(d=1); Rule1:t1( ); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1 ); "
5 NA 1.78 0 1 0 0 "Rule1:t1( b=1 ); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1( c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1 ); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1( b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
函数 2 (fun2) 使用“trimws”。
fun2 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := trimws(do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1); "
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
函数 3 (fun3) 使用带正则表达式的“gsub”。
fun3 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := gsub("\s+","", do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD))), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(aux == "" ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(aux == '' ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1); "
5 NA 1.78 0 1 0 0 "Rule1:t1(b=1); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1(c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1b=1); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1(b=1c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1b=1c=1); "
函数 4 (fun4) 使用 stri_detect inside mutate with regular expression.
fun4 <- function(tbl) {
lhs0 <- c("t1", "t2")
rhs0 <- list(c("a", "b", "c"), "d")
rul0 <- c("Rule1", "Rule2")
for (i in 1:length(lhs0)) {
lhs <- lhs0[i]
rhs <- rhs0[[i]]
rul <- rul0[i]
tbl[, aux := do.call(paste, Map(function(x, y) fifelse(y > 0, paste(x, y, sep = '='), "", na = ""), names(.SD), .SD)), .SDcols = rhs]
tbl <- tbl %>%
mutate.(
re = case_when.(!stri_detect(aux, regex = "[[:alpha:]]") ~ re, TRUE ~ paste0(rul, ":", lhs, "(", aux,"); ", re)),
!!lhs := !!rlang::parse_expr(glue("case_when.(!stri_detect(aux, regex = '[[:alpha:]]') ~ {lhs}, TRUE ~ NA_real_)"))
) %>%
select.(-aux)
}
return(tbl)
}
id t1 t2 a b c d re
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 -0.182 1.43 0 0 0 0 ""
2 -1.31 0.733 NA NA NA NA ""
3 -0.0613 NA 0 NA 0 1 "Rule2:t2(d=1); "
4 NA NA 1 0 NA 1 "Rule2:t2(d=1); Rule1:t1(a=1 ); "
5 NA 1.78 0 1 0 0 "Rule1:t1( b=1 ); "
6 NA NA NA 0 1 1 "Rule2:t2(d=1); Rule1:t1( c=1); "
7 NA -0.345 1 1 NA 0 "Rule1:t1(a=1 b=1 ); "
8 NA NA 1 NA 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 c=1); "
9 NA -1.22 0 1 1 NA "Rule1:t1( b=1 c=1); "
10 NA NA 1 1 1 1 "Rule2:t2(d=1); Rule1:t1(a=1 b=1 c=1); "
具有更多数据的基准
n <- 200000
dat <- data.table(id = 1:n,
t1 = rnorm(n),
t2 = rnorm(n),
a = sample(c(0, NA, 1), n, replace = TRUE),
b = sample(c(0, NA, 1), n, replace = TRUE),
c = sample(c(0, NA, 1), n, replace = TRUE),
d = sample(c(0, NA, 1), n, replace = TRUE),
re = "")
benchmark(fun1(dat),
fun2(dat),
fun3(dat),
fun4(dat))
Benchmark summary:
Time units : milliseconds
expr n.eval min lw.qu median mean up.qu max total relative
fun1(dat) 100 642 653 660 668 666 774 66800 1.00
fun2(dat) 100 742 756 763 773 768 874 77300 1.16
fun3(dat) 100 765 779 785 794 791 903 79400 1.19
fun4(dat) 100 743 756 763 777 770 1010 77700 1.16
有没有人知道如何加快这个过程?
谢谢。
首先,我承认我没能击败基准测试(感谢挑战)。可能有一些方法可以提高它的速度,但让我推荐一种方法,它可以做同样的事情(小数据更快,大数据大致相同)但支持每条规则的功能。这不是你直接问的,但你暗示了每个规则的不同功能。
(我已经 更新了 代码,感谢 @Cole 找到了我早期探索的残余。)
RULES <- list(
Rule1 = list(
rule = "Rule1",
lhs = "t1",
rhs = c("a", "b", "c"),
fun = function(z) !is.na(z) & z > 0
),
Rule2 = list(
rule = "Rule2",
lhs = "t2",
rhs = "d",
fun = is.na
)
)
fun9 <- function(dat, RULES = list()) {
nr <- nrow(dat)
# RE <- lapply(seq_along(RULES), function(ign) rep("", nr))
RE <- asplit(matrix("", nrow = length(RULES), ncol = nr), 1)
for (r in seq_along(RULES)) {
fun <- RULES[[r]]$fun
lhs <- RULES[[r]]$lhs
for (rhs in RULES[[r]]$rhs) {
lgl <- do.call(fun, list(dat[[rhs]]))
set(dat, which(lgl), lhs, NA)
RE[[r]][lgl] <- sprintf("%s %s=1", RE[[r]][lgl], rhs)
}
ind <- nzchar(RE[[r]])
RE[[r]][ind] <- sprintf("%s:%s(%s)", RULES[[r]]$rule, lhs, RE[[r]][ind])
}
set(dat, j = "re", value = do.call(paste, c(RE, sep = ";")))
}
RULES
和使用fun9
的前提应该是不言而喻的。
用小数据进行基准测试似乎很有希望:
set.seed(2021)
dat <- data.table(id = 1:10,
t1 = rnorm(10),
t2 = rnorm(10),
a = c(0, NA, 0, 1, 0, NA, 1, 1, 0, 1),
b = c(0, NA, NA, 0, 1, 0, 1, NA, 1, 1),
c = c(0, NA, 0, NA, 0, 1, NA, 1, 1, 1),
d = c(0, NA, 1, 1, 0, 1, 0, 1, NA, 1),
re = "")
fun9(dat, RULES)[]
# id t1 t2 a b c d re
# <int> <num> <num> <num> <num> <num> <num> <char>
# 1: 1 -0.1224600 -1.0822049 0 0 0 0 ;
# 2: 2 0.5524566 NA NA NA NA NA ;Rule2:t2( d=1)
# 3: 3 0.3486495 0.1819954 0 NA 0 1 ;
# 4: 4 NA 1.5085418 1 0 NA 1 Rule1:t1( a=1);
# 5: 5 NA 1.6044701 0 1 0 0 Rule1:t1( b=1);
# 6: 6 NA -1.8414756 NA 0 1 1 Rule1:t1( c=1);
# 7: 7 NA 1.6233102 1 1 NA 0 Rule1:t1( a=1 b=1);
# 8: 8 NA 0.1313890 1 NA 1 1 Rule1:t1( a=1 c=1);
# 9: 9 NA NA 0 1 1 NA Rule1:t1( b=1 c=1);Rule2:t2( d=1)
# 10: 10 NA 1.5133183 1 1 1 1 Rule1:t1( a=1 b=1 c=1);
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 fun4(dat) 9.52ms 11.1ms 88.5 316KB 2.06 43 1 486ms <NULL> <Rprofmem[,3] [84 x 3]> <bch:tm [44]> <tibble [44 x 3]>
# 2 fun9(dat, RULES) 97.5us 113.5us 7760. 416B 6.24 3731 3 481ms <NULL> <Rprofmem[,3] [2 x 3]> <bch:tm [3,734]> <tibble [3,734 x 3]>
刚刚从 `itr/sec`
开始,这个 fun9
看起来有点快。
有更大的数据:
set.seed(2021)
n <- 200000
dat <- data.table(id = 1:n,
t1 = rnorm(n),
t2 = rnorm(n),
a = sample(c(0, NA, 1), n, replace = TRUE),
b = sample(c(0, NA, 1), n, replace = TRUE),
c = sample(c(0, NA, 1), n, replace = TRUE),
d = sample(c(0, NA, 1), n, replace = TRUE),
re = "")
bench::mark(fun4(dat), fun9(dat, RULES), check = FALSE)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 fun4(dat) 1.24s 1.24s 0.806 62.9MB 1.61 1 2 1.24s <NULL> <Rprofmem[,3] [150 x 3]> <bch:tm [1]> <tibble [1 x 3]>
# 2 fun9(dat, RULES) 296.11ms 315.4ms 3.17 53.8MB 4.76 2 3 630.8ms <NULL> <Rprofmem[,3] [70 x 3]> <bch:tm [2]> <tibble [2 x 3]>
虽然此解决方案不使用 tidytable
或其流程,但速度更快。 re
的清理是另一个步骤,可能会将这种速度降低到致命的水平:-)。
旁注:我试图使用 lapply
、mget
和其他技巧在 data.table
数据环境中做事,但最终,使用 data.table::set
() 和简单向量似乎是最快的。