根据多个条件选择数据行

Selecting Rows of Data Based on Multiple Conditions

我正在使用 R 编程语言。

我在 R 中有这个数据集:

# Ten category values; every draw below picks each value with probability 0.1.
v <- as.numeric(1:10)

# Five factor columns, each 10,000 draws from `v` with replacement.
var_1 <- as.factor(
  sample(v, size = 10000, replace = TRUE, prob = rep(0.1, 10))
)
var_2 <- as.factor(
  sample(v, size = 10000, replace = TRUE, prob = rep(0.1, 10))
)
var_3 <- as.factor(
  sample(v, size = 10000, replace = TRUE, prob = rep(0.1, 10))
)
var_4 <- as.factor(
  sample(v, size = 10000, replace = TRUE, prob = rep(0.1, 10))
)
var_5 <- as.factor(
  sample(v, size = 10000, replace = TRUE, prob = rep(0.1, 10))
)

# Combine the five columns into a single data frame.
my_data <- data.frame(var_1, var_2, var_3, var_4, var_5)

 head(my_data)
  var_1 var_2 var_3 var_4 var_5
1    10     9     1     1     3
2     3    10     7     2     6
3     7     2     6     2     5
4     4     3     9     9     5
5    10     7     2     6     8
6     2     3     9     1     4

我在 R 中还有另一个包含“条件”的数据集:

# Table of selection rules, one row per iteration. Each cond_k column is a
# list column whose cells hold the values allowed for var_k; each n_k equals
# the number of values in the matching cond_k cell.
conditions <- structure(
  list(
    iter = 1:5,
    n_1 = c(9L, 2L, 4L, 1L, 10L),
    n_2 = c(7L, 6L, 9L, 6L, 6L),
    n_3 = c(8L, 8L, 5L, 4L, 9L),
    n_4 = c(5L, 10L, 5L, 2L, 5L),
    n_5 = c(3L, 6L, 1L, 5L, 1L),
    cond_1 = list(
      c(5L, 10L, 8L, 1L, 9L, 2L, 7L, 3L, 4L),
      7:8,
      c(5L, 8L, 3L, 1L),
      8L,
      c(10L, 2L, 9L, 3L, 5L, 8L, 4L, 6L, 7L, 1L)
    ),
    cond_2 = list(
      c(7L, 2L, 10L, 8L, 6L, 4L, 3L),
      c(5L, 1L, 7L, 8L, 6L, 10L),
      c(5L, 1L, 4L, 3L, 6L, 10L, 7L, 8L, 9L),
      c(7L, 3L, 2L, 1L, 6L, 4L),
      c(1L, 9L, 3L, 7L, 8L, 4L)
    ),
    cond_3 = list(
      c(9L, 6L, 8L, 4L, 5L, 10L, 3L, 1L),
      c(8L, 2L, 7L, 5L, 4L, 1L, 9L, 3L),
      c(9L, 3L, 10L, 1L, 4L),
      c(3L, 1L, 2L, 5L),
      c(8L, 4L, 6L, 7L, 5L, 1L, 3L, 9L, 2L)
    ),
    cond_4 = list(
      c(9L, 5L, 6L, 7L, 3L),
      c(9L, 5L, 3L, 6L, 2L, 1L, 7L, 10L, 8L, 4L),
      c(8L, 10L, 1L, 7L, 6L),
      c(1L, 7L),
      c(7L, 6L, 5L, 8L, 10L)
    ),
    cond_5 = list(
      c(2L, 5L, 9L),
      c(2L, 1L, 10L, 3L, 6L, 7L),
      8L,
      c(10L, 6L, 2L, 8L, 1L),
      6L
    )
  ),
  class = "data.frame",
  row.names = c(NA, -5L)
)

head(conditions)

  iter n_1 n_2 n_3 n_4 n_5                        cond_1                     cond_2                    cond_3                        cond_4            cond_5
1    1   9   7   8   5   3    5, 10, 8, 1, 9, 2, 7, 3, 4       7, 2, 10, 8, 6, 4, 3   9, 6, 8, 4, 5, 10, 3, 1                 9, 5, 6, 7, 3           2, 5, 9
2    2   2   6   8  10   6                          7, 8          5, 1, 7, 8, 6, 10    8, 2, 7, 5, 4, 1, 9, 3 9, 5, 3, 6, 2, 1, 7, 10, 8, 4 2, 1, 10, 3, 6, 7
3    3   4   9   5   5   1                    5, 8, 3, 1 5, 1, 4, 3, 6, 10, 7, 8, 9            9, 3, 10, 1, 4                8, 10, 1, 7, 6                 8
4    4   1   6   4   2   5                             8           7, 3, 2, 1, 6, 4                3, 1, 2, 5                          1, 7    10, 6, 2, 8, 1
5    5  10   6   9   5   1 10, 2, 9, 3, 5, 8, 4, 6, 7, 1           1, 9, 3, 7, 8, 4 8, 4, 6, 7, 5, 1, 3, 9, 2                7, 6, 5, 8, 10                 6

基于这两个数据集,我想创建以下5个数据集:

# Attempt to build one subset of `my_data` per row of `conditions`.
# data_i is meant to keep the rows where every var_k is among the values
# stored in cond_k for row i; columns 7-11 of `conditions` are cond_1..cond_5.
# The five tests are combined with `&`, and which() turns the resulting
# logical mask into row indices.

# Iteration 1 (row 1 of `conditions`).
data_1 = my_data[which(my_data$var_1 %in% conditions[1,7] & my_data$var_2 %in% conditions[1,8] & my_data$var_3 %in% conditions[1,9] & my_data$var_4 %in% conditions[1,10] & my_data$var_5 %in% conditions[1,11]), ]

# Iteration 2 (row 2 of `conditions`).
data_2 = my_data[which(my_data$var_1 %in% conditions[2,7] & my_data$var_2 %in% conditions[2,8] & my_data$var_3 %in% conditions[2,9] & my_data$var_4 %in% conditions[2,10] & my_data$var_5 %in% conditions[2,11]), ]

# Iteration 3 (row 3 of `conditions`).
data_3 = my_data[which(my_data$var_1 %in% conditions[3,7] & my_data$var_2 %in% conditions[3,8] & my_data$var_3 %in% conditions[3,9] & my_data$var_4 %in% conditions[3,10] & my_data$var_5 %in% conditions[3,11]), ]

# Iteration 4 (row 4 of `conditions`).
data_4 = my_data[which(my_data$var_1 %in% conditions[4,7] & my_data$var_2 %in% conditions[4,8] & my_data$var_3 %in% conditions[4,9] & my_data$var_4 %in% conditions[4,10] & my_data$var_5 %in% conditions[4,11]), ]

# Iteration 5 (row 5 of `conditions`).
data_5 = my_data[which(my_data$var_1 %in% conditions[5,7] & my_data$var_2 %in% conditions[5,8] & my_data$var_3 %in% conditions[5,9] & my_data$var_4 %in% conditions[5,10] & my_data$var_5 %in% conditions[5,11]), ]

我的问题:这些数据集中的每一个都返回一个空数据集:

> data_1
[1] var_1 var_2 var_3 var_4 var_5
<0 rows> (or 0-length row.names)
> data_2
[1] var_1 var_2 var_3 var_4 var_5
<0 rows> (or 0-length row.names)
> data_3
[1] var_1 var_2 var_3 var_4 var_5
<0 rows> (or 0-length row.names)
> data_4
[1] var_1 var_2 var_3 var_4 var_5
<0 rows> (or 0-length row.names)
> data_5
[1] var_1 var_2 var_3 var_4 var_5
<0 rows> (or 0-length row.names)

谁能告诉我我做错了什么,我该怎么做才能解决这个问题?

对于这些数据集(data_1、data_2、data_3、data_4、data_5)中的每一个,我想选出同时满足以下条件的行:var_1 的取值出现在 conditions 对应行的 cond_1 中,var_2 的取值出现在 cond_2 中,依此类推,直到 var_5 的取值出现在 cond_5 中。

谢谢!

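注 1:每个 cond 单元格内部的取值之间是“或”(OR)的关系。例如,基于此示例中的随机数据,对 data_1 而言,var_1 可以是 5、10、8、1、9、2、7、3 或 4 中的任意一个;而不同变量(var_1 到 var_5)的条件之间需要同时成立。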

注 2:如果可能的话,我希望尽量只使用 Base R 来实现。

问题在于,您的条件是以列表(list)的形式存储的:conditions[1,7] 返回的是一个列表,而不是普通的向量。

# Check what indexing one cell of a list column with [i, j] gives back.
class(conditions[1,7])
# [1] "list"

解决方法:用 unlist() 把列表展开成普通向量。

# Keep the rows of `my_data` that satisfy all five conditions stored in row 1
# of `conditions`. var_1..var_5 are paired with cond_1..cond_5; each cell is
# passed through unlist() so that %in% compares against a plain vector of
# allowed values, and the five masks are combined with `&`.
my_data[
  Reduce(
    `&`,
    Map(
      function(column, allowed) column %in% unlist(allowed),
      my_data[paste0("var_", 1:5)],
      conditions[1, paste0("cond_", 1:5)]
    )
  ),
]
#    var_1 var_2 var_3 var_4 var_5
# 3      9     6     5     3     9
# 6      9     2     6     7     9
# 21     2    10     5     9     5

考虑这个小例子:

# A plain integer vector.
1:5 
# [1] 1 2 3 4 5
# The vector 2:4 wrapped in a list: one element, which holds the vector.
list(2:4)
# [[1]]
# [1] 2 3 4
# unlist() removes the list wrapper and gives back the bare vector.
unlist(list(2:4))
# [1] 2 3 4

您的条件多嵌套了一层(向量被包在列表里),所以直接用 %in% 匹配不到任何值:

# Matching against the list: none of 1:5 is found.
1:5 %in% list(2:4)
# [1] FALSE FALSE FALSE FALSE FALSE
# Matching against the unlisted vector: 2, 3 and 4 are found.
1:5 %in% unlist(list(2:4))
# [1] FALSE  TRUE  TRUE  TRUE FALSE