根据两个或多个变量的所有可能组合对 data.table 取子集
Subset data.table based on all possible combinations of two or more variables
我想根据某些变量是全为正数、全为负数,还是介于两者之间的某种正负组合,来对 data.frame 取子集。对于 n 个变量,这应该会产生 2^n 种可能的组合。
我认为 combn 可用于实现此目的,但我一直没能把它做对。
示例数据:
library(data.table)
# Example data: 100 rows, three columns drawn uniformly from (-1, 1).
dt <- data.table(
  x = runif(100, -1, 1),
  y = runif(100, -1, 1),
  z = runif(100, -1, 1)
)
我想要的:
# The eight sign-pattern subsets wanted for three variables (2^3).
# Every condition must be joined with `&`; without it the expression
# does not parse.
dt[x < 0 & y < 0 & z < 0, ]
dt[x < 0 & y < 0 & z > 0, ]
dt[x < 0 & y > 0 & z < 0, ]
dt[x < 0 & y > 0 & z > 0, ]
dt[x > 0 & y < 0 & z < 0, ]
dt[x > 0 & y < 0 & z > 0, ]
dt[x > 0 & y > 0 & z < 0, ]
dt[x > 0 & y > 0 & z > 0, ]
到目前为止我尝试过的:
#' Build every sign-pattern filter expression for the columns of `z`.
#'
#' Each column is compared against zero with either `<` or `>`, and all
#' 2^ncol(z) combinations are produced. Choosing operators with `combn()`
#' from a pool of repeated symbols cannot do this: it only picks positions
#' in increasing order, so patterns such as "> < <" never appear while
#' others are repeated.
#'
#' @param z A data.frame-like object (data.frame or data.table) with
#'   column names.
#' @return A character vector of length 2^ncol(z); each element looks like
#'   "x < 0 & y < 0 & z < 0". The last column's operator varies fastest.
#'   Returns `character(0)` when `z` has no columns.
combinator <- function(z) {
  cnames <- colnames(z)
  if (length(cnames) == 0L) {
    return(character(0))
  }
  # expand.grid() enumerates the full Cartesian product of operators and
  # varies its first factor fastest; reversing the columns makes the last
  # variable change fastest instead.
  grid <- expand.grid(
    rep(list(c("<", ">")), length(cnames)),
    stringsAsFactors = FALSE
  )
  ops <- rev(unname(as.list(grid)))
  # One character vector of "name op 0" terms per column.
  terms <- Map(function(name, op) paste(name, op, 0), cnames, ops)
  # unname() so that a column called e.g. "sep" is not mistaken for a
  # paste() argument.
  do.call(paste, c(unname(terms), sep = " & "))
}
输出:
> l <- combinator(dt)
> l
[1] "x < 0 & y < 0 & z < 0" "x < 0 & y < 0 & z > 0" "x < 0 & y < 0 & z > 0" "x < 0 & y < 0 & z > 0"
[5] "x < 0 & y < 0 & z > 0" "x < 0 & y < 0 & z > 0" "x < 0 & y < 0 & z > 0" "x < 0 & y > 0 & z > 0"
[9] "x < 0 & y > 0 & z > 0" "x < 0 & y > 0 & z > 0" "x < 0 & y < 0 & z > 0" "x < 0 & y < 0 & z > 0"
[13] "x < 0 & y < 0 & z > 0" "x < 0 & y > 0 & z > 0" "x < 0 & y > 0 & z > 0" "x < 0 & y > 0 & z > 0"
[17] "x < 0 & y > 0 & z > 0" "x < 0 & y > 0 & z > 0" "x < 0 & y > 0 & z > 0" "x > 0 & y > 0 & z > 0"
> l[1]
[1] "x < 0 & y < 0 & z < 0"
> subset(dt, eval(l[1]))
Error in subset.data.table(dt, eval(l[1])) :
'subset' must evaluate to logical
此外,以下输出显示我没有列出所有需要的组合:
> unique(l)
[1] "x < 0 & y < 0 & z < 0" "x < 0 & y < 0 & z > 0"
[3] "x < 0 & y > 0 & z > 0" "x > 0 & y > 0 & z > 0"
输出应该有 8 个唯一结果,而不是上面显示的 4 个。
只需执行 dt[, sign_combi := do.call(paste, lapply(dt, sign))],
然后就可以根据需要按该列进行 split 或 by = 分组,例如 split(dt, dt$sign_combi)。
通过粘贴字符串来拼接代码是个坏主意。
例如:
set.seed(47) # setting seed for reproducibility
dt <- data.table(
  x = runif(100, -1, 1),
  y = runif(100, -1, 1),
  z = runif(100, -1, 1)
)
# columns whose signs define the groups; naming them explicitly keeps this
# step re-runnable (sign_combi itself is not fed back into sign()) and safe
# if dt carries other, non-numeric columns
value_cols <- c("x", "y", "z")
# create combination column (you could keep it separate if you prefer)
dt[, sign_combi := do.call(paste, lapply(.SD, sign)), .SDcols = value_cols]
# split original data by sign combinations
result <- split(dt, dt$sign_combi)
# list of 8 resulting data tables
length(result)
# [1] 8
# peeking at the first three rows of the first three tables:
lapply(head(result, 3), head, 3)
# $`-1 -1 -1`
# x y z sign_combi
# 1: -0.5713038 -0.7103555 -0.6873705 -1 -1 -1
# 2: -0.1407803 -0.8371153 -0.3686299 -1 -1 -1
# 3: -0.6478446 -0.7629461 -0.7458949 -1 -1 -1
#
# $`-1 -1 1`
# x y z sign_combi
# 1: -0.8070969 -0.3952283 0.9212030 -1 -1 1
# 2: -0.1190934 -0.4969318 0.8082232 -1 -1 1
# 3: -0.6536104 -0.3280965 0.6880454 -1 -1 1
#
# $`-1 1 -1`
# x y z sign_combi
# 1: -0.78789241 0.8577848 -0.7586369 -1 1 -1
# 2: -0.04442825 0.4736388 -0.3354734 -1 1 -1
# 3: -0.22105744 0.3012645 -0.4160631 -1 1 -1
我想根据某些变量是全为正数、全为负数,还是介于两者之间的某种正负组合,来对 data.frame 取子集。对于 n 个变量,这应该会产生 2^n 种可能的组合。
我认为 combn 可用于实现此目的,但我一直没能把它做对。
示例数据:
library(data.table)
# Example data: 100 rows, three columns drawn uniformly from (-1, 1).
dt <- data.table(
  x = runif(100, -1, 1),
  y = runif(100, -1, 1),
  z = runif(100, -1, 1)
)
我想要的:
# The eight sign-pattern subsets wanted for three variables (2^3).
# Every condition must be joined with `&`; without it the expression
# does not parse.
dt[x < 0 & y < 0 & z < 0, ]
dt[x < 0 & y < 0 & z > 0, ]
dt[x < 0 & y > 0 & z < 0, ]
dt[x < 0 & y > 0 & z > 0, ]
dt[x > 0 & y < 0 & z < 0, ]
dt[x > 0 & y < 0 & z > 0, ]
dt[x > 0 & y > 0 & z < 0, ]
dt[x > 0 & y > 0 & z > 0, ]
到目前为止我尝试过的:
#' Build every sign-pattern filter expression for the columns of `z`.
#'
#' Each column is compared against zero with either `<` or `>`, and all
#' 2^ncol(z) combinations are produced. Choosing operators with `combn()`
#' from a pool of repeated symbols cannot do this: it only picks positions
#' in increasing order, so patterns such as "> < <" never appear while
#' others are repeated.
#'
#' @param z A data.frame-like object (data.frame or data.table) with
#'   column names.
#' @return A character vector of length 2^ncol(z); each element looks like
#'   "x < 0 & y < 0 & z < 0". The last column's operator varies fastest.
#'   Returns `character(0)` when `z` has no columns.
combinator <- function(z) {
  cnames <- colnames(z)
  if (length(cnames) == 0L) {
    return(character(0))
  }
  # expand.grid() enumerates the full Cartesian product of operators and
  # varies its first factor fastest; reversing the columns makes the last
  # variable change fastest instead.
  grid <- expand.grid(
    rep(list(c("<", ">")), length(cnames)),
    stringsAsFactors = FALSE
  )
  ops <- rev(unname(as.list(grid)))
  # One character vector of "name op 0" terms per column.
  terms <- Map(function(name, op) paste(name, op, 0), cnames, ops)
  # unname() so that a column called e.g. "sep" is not mistaken for a
  # paste() argument.
  do.call(paste, c(unname(terms), sep = " & "))
}
输出:
> l <- combinator(dt)
> l
[1] "x < 0 & y < 0 & z < 0" "x < 0 & y < 0 & z > 0" "x < 0 & y < 0 & z > 0" "x < 0 & y < 0 & z > 0"
[5] "x < 0 & y < 0 & z > 0" "x < 0 & y < 0 & z > 0" "x < 0 & y < 0 & z > 0" "x < 0 & y > 0 & z > 0"
[9] "x < 0 & y > 0 & z > 0" "x < 0 & y > 0 & z > 0" "x < 0 & y < 0 & z > 0" "x < 0 & y < 0 & z > 0"
[13] "x < 0 & y < 0 & z > 0" "x < 0 & y > 0 & z > 0" "x < 0 & y > 0 & z > 0" "x < 0 & y > 0 & z > 0"
[17] "x < 0 & y > 0 & z > 0" "x < 0 & y > 0 & z > 0" "x < 0 & y > 0 & z > 0" "x > 0 & y > 0 & z > 0"
> l[1]
[1] "x < 0 & y < 0 & z < 0"
> subset(dt, eval(l[1]))
Error in subset.data.table(dt, eval(l[1])) :
'subset' must evaluate to logical
此外,以下输出显示我没有列出所有需要的组合:
> unique(l)
[1] "x < 0 & y < 0 & z < 0" "x < 0 & y < 0 & z > 0"
[3] "x < 0 & y > 0 & z > 0" "x > 0 & y > 0 & z > 0"
输出应该有 8 个唯一结果,而不是上面显示的 4 个。
只需执行 dt[, sign_combi := do.call(paste, lapply(dt, sign))],
然后就可以根据需要按该列进行 split 或 by = 分组,例如 split(dt, dt$sign_combi)。
通过粘贴字符串来拼接代码是个坏主意。
例如:
set.seed(47) # setting seed for reproducibility
dt <- data.table(
  x = runif(100, -1, 1),
  y = runif(100, -1, 1),
  z = runif(100, -1, 1)
)
# columns whose signs define the groups; naming them explicitly keeps this
# step re-runnable (sign_combi itself is not fed back into sign()) and safe
# if dt carries other, non-numeric columns
value_cols <- c("x", "y", "z")
# create combination column (you could keep it separate if you prefer)
dt[, sign_combi := do.call(paste, lapply(.SD, sign)), .SDcols = value_cols]
# split original data by sign combinations
result <- split(dt, dt$sign_combi)
# list of 8 resulting data tables
length(result)
# [1] 8
# peeking at the first three rows of the first three tables:
lapply(head(result, 3), head, 3)
# $`-1 -1 -1`
# x y z sign_combi
# 1: -0.5713038 -0.7103555 -0.6873705 -1 -1 -1
# 2: -0.1407803 -0.8371153 -0.3686299 -1 -1 -1
# 3: -0.6478446 -0.7629461 -0.7458949 -1 -1 -1
#
# $`-1 -1 1`
# x y z sign_combi
# 1: -0.8070969 -0.3952283 0.9212030 -1 -1 1
# 2: -0.1190934 -0.4969318 0.8082232 -1 -1 1
# 3: -0.6536104 -0.3280965 0.6880454 -1 -1 1
#
# $`-1 1 -1`
# x y z sign_combi
# 1: -0.78789241 0.8577848 -0.7586369 -1 1 -1
# 2: -0.04442825 0.4736388 -0.3354734 -1 1 -1
# 3: -0.22105744 0.3012645 -0.4160631 -1 1 -1