将所有逻辑规则与数据框匹配(需要超快速功能)
Match all logic rules with a dataframe (need super fast function)
我有一个函数可以检查数据帧中是否存在逻辑序列
fu <- function(dat , rule , res.only=T){
debug.vec <- rep("no",nrow(dat)) # control of rule triggers
rule.id <- 1 # rule number in vector
for(i in 1:nrow(dat)){
# check if the rule "rule[rule.id]" has worked on this "i" index in dat[i,]
current_rule <- with(data = dat[i,] , expr = eval(parse(text = rule[rule.id])) )
if(current_rule){ # if the rule is triggered
debug.vec[i] <- rule[rule.id]
if( rule.id==length(rule) ) break # stop if there are no more rules
rule.id <- rule.id+1 # go to the next rule
}}
if(!res.only) return( cbind(dat,debug.vec) )
return( sum(debug.vec!="no")==length(rule) )
}
例如我有一些数据
set.seed(123)
dat <- as.data.frame(matrix(data = sample(10,30,replace = T),ncol = 3))
colnames(dat) <- paste0("x" ,1:ncol(dat))
..
dat
x1 x2 x3
1 3 5 9
2 3 3 3
3 10 9 4
4 2 9 1
5 6 9 7
6 5 3 5
7 4 8 10
8 6 10 7
9 9 7 9
10 10 10 9
还有一个有规则的向量
rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )
该函数检查数据帧中是否存在这样的逻辑序列并给出合乎逻辑的答案
> fu(dat = dat, rule = rule, res.only = T)
[1] TRUE
或者您可以更改标志 res.only = F
并查看序列在 debug.vec
列中的位置
> fu(dat = dat, rule = rule, res.only = F)
x1 x2 x3 debug.vec
1 3 5 9 no
2 3 3 3 no
3 10 9 4 x1>5 & x2>2
4 2 9 1 no
5 6 9 7 no
6 5 3 5 x1>x2
7 4 8 10 x3!=4
8 6 10 7 no
9 9 7 9 no
10 10 10 9 no
我需要这个函数的最快版本,也许使用 Rccp 包 或类似的东西..
UPD=======================
Waldi
功能与我的功能不一样,有问题
UPD_2_=================================== =
# Is this correct?
是的,这是正确的,如果规则[k]被触发然后搜索规则[k+1]从新的数据行开始
请原谅我的问题不够准确,这是我的错
我的函数返回了FALSE
因为最后一个规则"x3!=4"
没有生效,应该是
dat <- structure(list(x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L),
x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L),
x3 = c(4L, 9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)),
class = "data.frame", row.names = c(NA, -10L))
dat
rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )
my_fu(dat = dat, rule = rule, res.only = F)
只有两条规则有效
> my_fu(dat = dat, rule = rule, res.only = F)
x1 x2 x3 debug.vec
1 2 2 4 no
2 5 1 9 no
3 1 6 8 no
4 3 10 7 no
5 9 8 10 x1>5 & x2>2
6 2 10 1 no
7 6 10 2 no
8 3 4 8 no
9 3 6 3 no
10 9 4 10 x1>x2
应该是
> my_fu(dat = dat, rule = rule, res.only = T)
[1] FALSE
一种可能的简单基础 R 方式:
with(dat,sapply(rule, function(rule) eval(parse(text = rule))))
x1>5 & x2>2 x1>x2 x3!=4
[1,] FALSE FALSE TRUE
[2,] FALSE FALSE TRUE
[3,] TRUE TRUE FALSE
[4,] FALSE FALSE TRUE
[5,] TRUE FALSE TRUE
[6,] FALSE TRUE TRUE
[7,] FALSE FALSE TRUE
[8,] TRUE FALSE TRUE
[9,] TRUE TRUE TRUE
[10,] TRUE FALSE TRUE
any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule))
[1] TRUE
性能:
microbenchmark::microbenchmark(any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule)),
fu(dat = dat, rule = rule, res.only = T))
Unit: microseconds
expr min lq mean median
any(with(dat, sapply(rule, function(rule) eval(parse(text = rule))))) 93.201 97.7010 127.817 104.9010
fu(dat = dat, rule = rule, res.only = T) 465.902 499.7015 611.827 523.2505
uq max neval
124.8010 834.201 100
643.2015 2018.500 100
其他测试:
dat <- structure(list(x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L),
x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L), x3 = c(4L,
9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)), class = "data.frame", row.names = c(NA,
-10L))
dat
x1 x2 x3
1 2 2 4
2 5 1 9
3 1 6 8
4 3 10 7
5 9 8 10
6 2 10 1
7 6 10 2
8 3 4 8
9 3 6 3
10 9 4 10
with(dat,sapply(rule, function(rule) eval(parse(text = rule))))
x1>5 & x2>2 x1>x2 x3!=4
[1,] FALSE FALSE FALSE
[2,] FALSE TRUE TRUE
[3,] FALSE FALSE TRUE
[4,] FALSE FALSE TRUE
[5,] TRUE TRUE TRUE
[6,] FALSE FALSE TRUE
[7,] TRUE FALSE TRUE
[8,] FALSE FALSE TRUE
[9,] FALSE FALSE TRUE
[10,] TRUE TRUE TRUE
any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule))
[1] TRUE
fu(dat)
fu(dat = dat, rule = rule, res.only = T)
[1] FALSE
# Is this correct?
更新
根据您的更新,我编写了一个新的 fu
函数,即 TIC_fu()
TIC_fu <- function(dat, rule, res.only = TRUE) {
m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))
idx <- na.omit(
Reduce(
function(x, y) {
k <- which(y)
ifelse(all(k <= x), NA, min(k[k > x]))
}, m,
init = 0, accumulate = TRUE
)
)[-1]
if (!res.only) {
fidx <- head(idx, length(rule))
debug.vec <- replace(rep("no", nrow(dat)), fidx, rule[seq_along(fidx)])
return(cbind(dat, debug.vec))
}
length(idx) >= length(rule)
}
你会看到
> TIC_fu(dat, rule, FALSE)
x1 x2 x3 debug.vec
1 2 2 4 no
2 5 1 9 no
3 1 6 8 no
4 3 10 7 no
5 9 8 10 x1>5 & x2>2
6 2 10 1 no
7 6 10 2 no
8 3 4 8 no
9 3 6 3 no
10 9 4 10 x1>x2
> TIC_fu(dat,rule)
[1] FALSE
用于基准测试
> microbenchmark(
+ TIC_fu(dat, rule, FALSE),
+ fu(dat, rule, FALSE),
+ unit = "relative"
+ )
Unit: relative
expr min lq mean median uq max
TIC_fu(dat, rule, FALSE) 1.000000 1.000000 1.000000 1.000000 1.0000 1.000000
fu(dat, rule, FALSE) 4.639093 4.555523 3.383911 4.450056 4.3993 1.007532
neval
100
100
上一个答案
这里有一些选项与所做的类似,但唯一的区别在于parse
、str2lang
和str2expression
microbenchmark::microbenchmark(
any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule))))==length(rule))),
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule))))==length(rule))),
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule))))==length(rule))),
any(with(dat, eval(str2expression(paste0(rule,collapse = " & ")))))
)
你会看到
Unit: microseconds
expr
any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule)))) == length(rule)))
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule)))) == length(rule)))
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule)))) == length(rule)))
any(with(dat, eval(str2expression(paste0(rule, collapse = " & ")))))
min lq mean median uq max neval
94.0 98.6 131.431 107.35 121.90 632.7 100
37.5 39.2 48.887 44.05 48.50 174.1 100
36.8 39.6 51.627 46.20 48.45 241.4 100
12.7 15.8 19.786 17.00 19.75 97.9 100
我有一个函数可以检查数据帧中是否存在逻辑序列
fu <- function(dat , rule , res.only=T){
debug.vec <- rep("no",nrow(dat)) # control of rule triggers
rule.id <- 1 # rule number in vector
for(i in 1:nrow(dat)){
# check if the rule "rule[rule.id]" has worked on this "i" index in dat[i,]
current_rule <- with(data = dat[i,] , expr = eval(parse(text = rule[rule.id])) )
if(current_rule){ # if the rule is triggered
debug.vec[i] <- rule[rule.id]
if( rule.id==length(rule) ) break # stop if there are no more rules
rule.id <- rule.id+1 # go to the next rule
}}
if(!res.only) return( cbind(dat,debug.vec) )
return( sum(debug.vec!="no")==length(rule) )
}
例如我有一些数据
set.seed(123)
dat <- as.data.frame(matrix(data = sample(10,30,replace = T),ncol = 3))
colnames(dat) <- paste0("x" ,1:ncol(dat))
..
dat
x1 x2 x3
1 3 5 9
2 3 3 3
3 10 9 4
4 2 9 1
5 6 9 7
6 5 3 5
7 4 8 10
8 6 10 7
9 9 7 9
10 10 10 9
还有一个有规则的向量
rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )
该函数检查数据帧中是否存在这样的逻辑序列并给出合乎逻辑的答案
> fu(dat = dat, rule = rule, res.only = T)
[1] TRUE
或者您可以更改标志 res.only = F
并查看序列在 debug.vec
列中的位置
> fu(dat = dat, rule = rule, res.only = F)
x1 x2 x3 debug.vec
1 3 5 9 no
2 3 3 3 no
3 10 9 4 x1>5 & x2>2
4 2 9 1 no
5 6 9 7 no
6 5 3 5 x1>x2
7 4 8 10 x3!=4
8 6 10 7 no
9 9 7 9 no
10 10 10 9 no
我需要这个函数的最快版本,也许使用 Rccp 包 或类似的东西..
UPD=======================
Waldi
功能与我的功能不一样,有问题
UPD_2_=================================== =
# Is this correct?
是的,这是正确的,如果规则[k]被触发然后搜索规则[k+1]从新的数据行开始
我的函数返回了FALSE
因为最后一个规则"x3!=4"
没有生效,应该是
dat <- structure(list(x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L),
x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L),
x3 = c(4L, 9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)),
class = "data.frame", row.names = c(NA, -10L))
dat
rule <- c("x1>5 & x2>2" , "x1>x2" , "x3!=4" )
my_fu(dat = dat, rule = rule, res.only = F)
只有两条规则有效
> my_fu(dat = dat, rule = rule, res.only = F)
x1 x2 x3 debug.vec
1 2 2 4 no
2 5 1 9 no
3 1 6 8 no
4 3 10 7 no
5 9 8 10 x1>5 & x2>2
6 2 10 1 no
7 6 10 2 no
8 3 4 8 no
9 3 6 3 no
10 9 4 10 x1>x2
应该是
> my_fu(dat = dat, rule = rule, res.only = T)
[1] FALSE
一种可能的简单基础 R 方式:
with(dat,sapply(rule, function(rule) eval(parse(text = rule))))
x1>5 & x2>2 x1>x2 x3!=4
[1,] FALSE FALSE TRUE
[2,] FALSE FALSE TRUE
[3,] TRUE TRUE FALSE
[4,] FALSE FALSE TRUE
[5,] TRUE FALSE TRUE
[6,] FALSE TRUE TRUE
[7,] FALSE FALSE TRUE
[8,] TRUE FALSE TRUE
[9,] TRUE TRUE TRUE
[10,] TRUE FALSE TRUE
any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule))
[1] TRUE
性能:
microbenchmark::microbenchmark(any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule)),
fu(dat = dat, rule = rule, res.only = T))
Unit: microseconds
expr min lq mean median
any(with(dat, sapply(rule, function(rule) eval(parse(text = rule))))) 93.201 97.7010 127.817 104.9010
fu(dat = dat, rule = rule, res.only = T) 465.902 499.7015 611.827 523.2505
uq max neval
124.8010 834.201 100
643.2015 2018.500 100
其他测试:
dat <- structure(list(x1 = c(2L, 5L, 1L, 3L, 9L, 2L, 6L, 3L, 3L, 9L),
x2 = c(2L, 1L, 6L, 10L, 8L, 10L, 10L, 4L, 6L, 4L), x3 = c(4L,
9L, 8L, 7L, 10L, 1L, 2L, 8L, 3L, 10L)), class = "data.frame", row.names = c(NA,
-10L))
dat
x1 x2 x3
1 2 2 4
2 5 1 9
3 1 6 8
4 3 10 7
5 9 8 10
6 2 10 1
7 6 10 2
8 3 4 8
9 3 6 3
10 9 4 10
with(dat,sapply(rule, function(rule) eval(parse(text = rule))))
x1>5 & x2>2 x1>x2 x3!=4
[1,] FALSE FALSE FALSE
[2,] FALSE TRUE TRUE
[3,] FALSE FALSE TRUE
[4,] FALSE FALSE TRUE
[5,] TRUE TRUE TRUE
[6,] FALSE FALSE TRUE
[7,] TRUE FALSE TRUE
[8,] FALSE FALSE TRUE
[9,] FALSE FALSE TRUE
[10,] TRUE TRUE TRUE
any(rowSums(with(dat,sapply(rule, function(rule) eval(parse(text = rule)))))==length(rule))
[1] TRUE
fu(dat)
fu(dat = dat, rule = rule, res.only = T)
[1] FALSE
# Is this correct?
更新
根据您的更新,我编写了一个新的 fu
函数,即 TIC_fu()
TIC_fu <- function(dat, rule, res.only = TRUE) {
m <- with(dat, lapply(rule, function(r) eval(str2expression(r))))
idx <- na.omit(
Reduce(
function(x, y) {
k <- which(y)
ifelse(all(k <= x), NA, min(k[k > x]))
}, m,
init = 0, accumulate = TRUE
)
)[-1]
if (!res.only) {
fidx <- head(idx, length(rule))
debug.vec <- replace(rep("no", nrow(dat)), fidx, rule[seq_along(fidx)])
return(cbind(dat, debug.vec))
}
length(idx) >= length(rule)
}
你会看到
> TIC_fu(dat, rule, FALSE)
x1 x2 x3 debug.vec
1 2 2 4 no
2 5 1 9 no
3 1 6 8 no
4 3 10 7 no
5 9 8 10 x1>5 & x2>2
6 2 10 1 no
7 6 10 2 no
8 3 4 8 no
9 3 6 3 no
10 9 4 10 x1>x2
> TIC_fu(dat,rule)
[1] FALSE
用于基准测试
> microbenchmark(
+ TIC_fu(dat, rule, FALSE),
+ fu(dat, rule, FALSE),
+ unit = "relative"
+ )
Unit: relative
expr min lq mean median uq max
TIC_fu(dat, rule, FALSE) 1.000000 1.000000 1.000000 1.000000 1.0000 1.000000
fu(dat, rule, FALSE) 4.639093 4.555523 3.383911 4.450056 4.3993 1.007532
neval
100
100
上一个答案
这里有一些选项与parse
、str2lang
和str2expression
microbenchmark::microbenchmark(
any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule))))==length(rule))),
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule))))==length(rule))),
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule))))==length(rule))),
any(with(dat, eval(str2expression(paste0(rule,collapse = " & ")))))
)
你会看到
Unit: microseconds
expr
any(with(dat, rowSums(sapply(rule, function(rule) eval(parse(text = rule)))) == length(rule)))
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2lang(rule)))) == length(rule)))
any(with(dat, rowSums(sapply(rule, function(rule) eval(str2expression(rule)))) == length(rule)))
any(with(dat, eval(str2expression(paste0(rule, collapse = " & ")))))
min lq mean median uq max neval
94.0 98.6 131.431 107.35 121.90 632.7 100
37.5 39.2 48.887 44.05 48.50 174.1 100
36.8 39.6 51.627 46.20 48.45 241.4 100
12.7 15.8 19.786 17.00 19.75 97.9 100