
Match all logic rules with a dataframe (need super fast function)


#' Check whether a sequence of rules fires, in order, over the rows of a
#' data frame.
#'
#' Rows are scanned top to bottom. The function waits for the first rule to
#' hold on some row, then waits for the second rule on a later row, and so on.
#' Each row can trigger at most one rule.
#'
#' @param dat A data frame whose column names are used inside the rules.
#' @param rule Character vector of R expressions (as strings), in the order
#'   they must fire. All rules are parsed up front, so a syntax error in any
#'   rule is reported immediately.
#' @param res.only If `TRUE` (default) return a single logical; if `FALSE`
#'   return `dat` with an extra `debug.vec` column.
#' @return With `res.only = TRUE`: `TRUE` when every rule fired in order,
#'   otherwise `FALSE` (vacuously `TRUE` for an empty `rule`). With
#'   `res.only = FALSE`: `dat` plus a character column `debug.vec` holding the
#'   rule that fired on each row, or `"no"`.
fu <- function(dat, rule, res.only = TRUE) {
  debug.vec <- rep("no", nrow(dat)) # which rule (if any) fired on each row
  # Parse every rule once instead of re-parsing it for each row.
  parsed <- lapply(rule, function(txt) parse(text = txt))
  n_rules <- length(parsed)
  # Names that are not columns of `dat` are looked up in the caller.
  caller <- parent.frame()
  rule.id <- 1L # index of the rule currently being waited for
  n_fired <- 0L

  if (n_rules > 0L) {
    # seq_len() keeps the loop from running on a zero-row data frame.
    for (i in seq_len(nrow(dat))) {
      # drop = FALSE keeps a one-column data frame from collapsing to a vector.
      current_rule <- eval(
        parsed[[rule.id]],
        envir = dat[i, , drop = FALSE],
        enclos = caller
      )
      # An NA or zero-length result counts as "rule did not fire".
      if (isTRUE(as.logical(current_rule))) {
        debug.vec[i] <- rule[rule.id]
        n_fired <- n_fired + 1L
        if (rule.id == n_rules) break # no more rules to wait for
        rule.id <- rule.id + 1L
      }
    }
  }

  if (!res.only) {
    return(cbind(dat, debug.vec = debug.vec))
  }
  n_fired == n_rules
}


# Example data: 10 rows x 3 columns of integers sampled (with replacement)
# from 1..10; columns are named x1, x2, x3.
dat <- as.data.frame(matrix(data = sample(10, 30, replace = TRUE), ncol = 3))
colnames(dat) <- paste0("x", seq_len(ncol(dat)))


   x1 x2 x3
1   3  5  9
2   3  3  3
3  10  9  4
4   2  9  1
5   6  9  7
6   5  3  5
7   4  8 10
8   6 10  7
9   9  7  9
10 10 10  9


# Rules are R expressions over the columns of dat, written as strings and tried in this order.
rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )


> fu(dat = dat, rule = rule, res.only = T)
[1] TRUE

或者您可以更改标志 res.only = F 并查看序列在 debug.vec 列中的位置

> fu(dat = dat, rule = rule, res.only = F)
   x1 x2 x3   debug.vec
1   3  5  9          no
2   3  3  3          no
3  10  9  4 x1>5 & x2>2
4   2  9  1          no
5   6  9  7          no
6   5  3  5       x1>x2
7   4  8 10       x3!=4
8   6 10  7          no
9   9  7  9          no
10 10 10  9          no

我需要这个函数的最快版本,也许可以使用 Rcpp 包或类似的工具。


Waldi 的函数与我的函数结果不一致,存在问题:

# Fixed 10-row example data used for the comparison below.
dat <- data.frame(
  x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L),
  x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L),
  x3 = c(4L, 9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)
)
# Rules as strings, in the order they are expected to fire.
rule <- c("x1>5 & x2>2", "x1>x2", "x3!=4")

my_fu(dat = dat, rule = rule, res.only = F)


> my_fu(dat = dat, rule = rule, res.only = F)
   x1 x2 x3   debug.vec
1   2  2  4          no
2   5  1  9          no
3   1  6  8          no
4   3 10  7          no
5   9  8 10 x1>5 & x2>2
6   2 10  1          no
7   6 10  2          no
8   3  4  8          no
9   3  6  3          no
10  9  4 10       x1>x2


> my_fu(dat = dat, rule = rule, res.only = T)
[1] FALSE

一种可能的简单基础 R 方式:

with(dat,sapply(rule, function(rule) eval(parse(text = rule))))

      x1>5 & x2>2 x1>x2 x3!=4
 [1,]       FALSE FALSE  TRUE
 [2,]       FALSE FALSE  TRUE
 [3,]        TRUE  TRUE FALSE
 [4,]       FALSE FALSE  TRUE
 [5,]        TRUE FALSE  TRUE
 [6,]       FALSE  TRUE  TRUE
 [7,]       FALSE FALSE  TRUE
 [8,]        TRUE FALSE  TRUE
 [9,]        TRUE  TRUE  TRUE
[10,]        TRUE FALSE  TRUE

any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule))
[1] TRUE


microbenchmark::microbenchmark(any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule)),
                               fu(dat = dat, rule = rule, res.only = T))

Unit: microseconds
                                                                  expr     min       lq    mean   median
 any(with(dat, sapply(rule, function(rule) eval(parse(text = rule)))))  93.201  97.7010 127.817 104.9010
                              fu(dat = dat, rule = rule, res.only = T) 465.902 499.7015 611.827 523.2505
       uq      max neval
 124.8010  834.201   100
 643.2015 2018.500   100


# Fixed 10-row example data (same values as the earlier structure() literal).
dat <- structure(
  list(
    x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L),
    x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L),
    x3 = c(4L, 9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)
  ),
  class = "data.frame",
  row.names = c(NA, -10L)
)


   x1 x2 x3
1   2  2  4
2   5  1  9
3   1  6  8
4   3 10  7
5   9  8 10
6   2 10  1
7   6 10  2
8   3  4  8
9   3  6  3
10  9  4 10

with(dat,sapply(rule, function(rule) eval(parse(text = rule))))
      x1>5 & x2>2 x1>x2 x3!=4
 [1,]       FALSE FALSE FALSE
 [2,]       FALSE  TRUE  TRUE
 [3,]       FALSE FALSE  TRUE
 [4,]       FALSE FALSE  TRUE
 [5,]        TRUE  TRUE  TRUE
 [6,]       FALSE FALSE  TRUE
 [7,]        TRUE FALSE  TRUE
 [8,]       FALSE FALSE  TRUE
 [9,]       FALSE FALSE  TRUE
[10,]        TRUE  TRUE  TRUE

any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule))
[1] TRUE

fu(dat = dat, rule = rule, res.only = T)
根据您的更新,我编写了一个新的 fu 函数,即 TIC_fu()

#' Vectorised check that a sequence of rules fires, in order, over the rows
#' of a data frame.
#'
#' Every rule is evaluated once over the whole data frame. The rows where the
#' rules fire are then chained: rule 1 takes its first matching row, rule 2
#' its first matching row strictly after that, and so on. The chain stops at
#' the first rule with no later match.
#'
#' @param dat A data frame whose column names are used inside the rules.
#' @param rule Character vector of R expressions (as strings), in the order
#'   they must fire. Each must evaluate to a logical vector.
#' @param res.only If `TRUE` (default) return a single logical; if `FALSE`
#'   return `dat` with an extra `debug.vec` column.
#' @return With `res.only = TRUE`: `TRUE` when every rule found a row in
#'   order, otherwise `FALSE`. With `res.only = FALSE`: `dat` plus a character
#'   column `debug.vec` holding the rule matched on each row, or `"no"`.
TIC_fu <- function(dat, rule, res.only = TRUE) {
  # One logical vector per rule, evaluated against the columns of `dat`.
  m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))

  idx <- integer(length(rule)) # row matched by each rule, filled in order
  n_hit <- 0L
  last <- 0L # row matched by the previous rule
  for (hit in m) {
    k <- which(hit) # ascending row numbers; NA entries are dropped
    k <- k[k > last]
    if (length(k) == 0L) break # this rule never fires after `last`
    last <- k[[1L]]
    n_hit <- n_hit + 1L
    idx[n_hit] <- last
  }
  idx <- idx[seq_len(n_hit)]

  if (!res.only) {
    debug.vec <- rep("no", nrow(dat))
    debug.vec[idx] <- rule[seq_along(idx)]
    return(cbind(dat, debug.vec = debug.vec))
  }
  length(idx) >= length(rule)
}


> TIC_fu(dat, rule, FALSE)
   x1 x2 x3   debug.vec
1   2  2  4          no
2   5  1  9          no
3   1  6  8          no
4   3 10  7          no
5   9  8 10 x1>5 & x2>2
6   2 10  1          no
7   6 10  2          no
8   3  4  8          no
9   3  6  3          no
10  9  4 10       x1>x2

> TIC_fu(dat,rule)
[1] FALSE


> microbenchmark(
+   TIC_fu(dat, rule, FALSE),
+   fu(dat, rule, FALSE),
+   unit = "relative"
+ )
Unit: relative
                     expr      min       lq     mean   median     uq      max
 TIC_fu(dat, rule, FALSE) 1.000000 1.000000 1.000000 1.000000 1.0000 1.000000
     fu(dat, rule, FALSE) 4.639093 4.555523 3.383911 4.450056 4.3993 1.007532



  any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule))))==length(rule))),
  any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule))))==length(rule))),
  any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule))))==length(rule))),
  any(with(dat, eval(str2expression(paste0(rule,collapse = " & ")))))


Unit: microseconds
   any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule)))) ==      length(rule)))
       any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule)))) ==      length(rule)))
 any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule)))) ==      length(rule)))
                                  any(with(dat, eval(str2expression(paste0(rule, collapse = " & ")))))
  min   lq    mean median     uq   max neval
 94.0 98.6 131.431 107.35 121.90 632.7   100
 37.5 39.2  48.887  44.05  48.50 174.1   100
 36.8 39.6  51.627  46.20  48.45 241.4   100
 12.7 15.8  19.786  17.00  19.75  97.9   100