如何过滤具有多个条件的行
How to filter rows with multiple conditions
我是 R 的新手。我正在尝试根据多个条件从 data.frame (df) 中过滤行:
我的一个例子data.frame:
image of my df
df:
SNPA SNPB value block1 block2 score_T
A1 A22 0.379927 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 12
A2 A23 0.449074 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 25
A3 A24 0.464135 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 584
A4 A22 0.328866 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 51
A5 A22 0.326026 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 64
A22 A27 0.57169 A22|A23|A24|A25 A27|A28|A29|A30|A31 77
A23 A28 0.416178 A22|A23|A24|A25 A27|A28|A29|A30|A31 90
A24 A29 0.456144 A22|A23|A24|A25 A27|A28|A29|A30|A31 103
A34 A39 0.379927 A31|A32|A33|A34 A39|A40|A41|A42 116
A34 A40 0.759074 A31|A32|A33|A34 A39|A40|A41|A42 129
A34 A41 0.562303 A31|A32|A33|A34 A39|A40|A41|A42 142
A39 A57 0.322303 A39|A40|A41|A42 A52|A53|A54|A55|A56|A57|A58|A59|A60|A61 25
A40 A57 0.372303 A39|A40|A41|A42 A52|A53|A54|A55|A56|A57|A58|A59|A60|A61 198
A41 A57 0.562303 A39|A40|A41|A42 A52|A53|A54|A55|A56|A57|A58|A59|A60|A61 356
我想要的是使用 dplyr
只保留块(block1 和 block2)至少有两个 SNP 的行(来自 block1 的 SNPA 列和 block2 的 SNPB 列),并且删除包含 1 个 SNP 的块对(示例:第 9 至 14 行)。
想要的结果:result
SNPA SNPB value block1 block2 score_T
A1 A22 0.379927 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 12
A2 A23 0.449074 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 25
A3 A24 0.464135 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 584
A4 A22 0.328866 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 51
A5 A22 0.326026 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 64
A22 A27 0.57169 A22|A23|A24|A25 A27|A28|A29|A30|A31 77
A23 A28 0.416178 A22|A23|A24|A25 A27|A28|A29|A30|A31 90
A24 A29 0.456144 A22|A23|A24|A25 A27|A28|A29|A30|A31 103
你知道我该怎么做吗?
result <- df %>% group_by(block1, block2) %>% filter(...) %>% summarise(mean_s = mean(score_T), number = n())
谢谢。
一个稍慢的 base R + dplyr 解决方案。它的一个缺点是需要在过滤函数中手动指定 "blocks" 和 "snps" 对应的列名;这一步或许可以自动化。
#' Flag rows whose block string contains more than one known SNP
#'
#' @param df A data frame holding both the block column and the SNP column.
#' @param block Name (string) of the column with "|"-separated block members.
#' @param snp Name (string) of the column whose values count as known SNPs.
#' @return A logical vector with one entry per row of `df`: `TRUE` when more
#'   than one member of that row's block appears anywhere in `df[[snp]]`.
my_filter <- function(df, block, snp) {
  # The SNP column is looked up once; membership is tested against the whole
  # column, not just the current row.
  known_snps <- df[[snp]]
  # fixed = TRUE makes "|" a literal separator instead of regex alternation.
  members <- strsplit(df[[block]], "|", fixed = TRUE)
  hits <- vapply(
    members,
    function(ids) sum(ids %in% known_snps),
    integer(1)
  )
  hits > 1
}
# Keep the rows that pass the SNP-count check on both sides: block1 against
# SNPA and block2 against SNPB. Each check is evaluated on the full table.
df %>%
  filter(
    my_filter(., "block1", "SNPA") &
      my_filter(., "block2", "SNPB")
  )
SNPA SNPB value block1 block2 score_T
1 A1 A22 0.379927 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 12
2 A2 A23 0.449074 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 25
3 A3 A24 0.464135 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 584
4 A4 A22 0.328866 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 51
5 A5 A22 0.326026 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 64
6 A22 A27 0.571690 A22|A23|A24|A25 A27|A28|A29|A30|A31 77
7 A23 A28 0.416178 A22|A23|A24|A25 A27|A28|A29|A30|A31 90
8 A24 A29 0.456144 A22|A23|A24|A25 A27|A28|A29|A30|A31 103
数据:
# Reproducible copy of the example table (dput-style): a plain data.frame with
# 14 rows and the columns SNPA, SNPB, value, block1, block2 and score_T.
# block1 / block2 hold "|"-separated SNP identifiers; score_T is integer.
df <-structure(list(SNPA = c("A1", "A2", "A3", "A4", "A5", "A22",
"A23", "A24", "A34", "A34", "A34", "A39", "A40", "A41"), SNPB = c("A22",
"A23", "A24", "A22", "A22", "A27", "A28", "A29", "A39", "A40",
"A41", "A57", "A57", "A57"), value = c(0.379927, 0.449074, 0.464135,
0.328866, 0.326026, 0.57169, 0.416178, 0.456144, 0.379927, 0.759074,
0.562303, 0.322303, 0.372303, 0.562303), block1 = c("A1|A2|A3|A4|A5|A6",
"A1|A2|A3|A4|A5|A6", "A1|A2|A3|A4|A5|A6", "A1|A2|A3|A4|A5|A6",
"A1|A2|A3|A4|A5|A6", "A22|A23|A24|A25", "A22|A23|A24|A25", "A22|A23|A24|A25",
"A31|A32|A33|A34", "A31|A32|A33|A34", "A31|A32|A33|A34", "A39|A40|A41|A42",
"A39|A40|A41|A42", "A39|A40|A41|A42"), block2 = c("A22|A23|A24|A25",
"A22|A23|A24|A25", "A22|A23|A24|A25", "A22|A23|A24|A25", "A22|A23|A24|A25",
"A27|A28|A29|A30|A31", "A27|A28|A29|A30|A31", "A27|A28|A29|A30|A31",
"A39|A40|A41|A42", "A39|A40|A41|A42", "A39|A40|A41|A42", "A52|A53|A54|A55|A56|A57|A58|A59|A60|A61",
"A52|A53|A54|A55|A56|A57|A58|A59|A60|A61", "A52|A53|A54|A55|A56|A57|A58|A59|A60|A61"
), score_T = c(12L, 25L, 584L, 51L, 64L, 77L, 90L, 103L, 116L,
129L, 142L, 25L, 198L, 356L)), class = "data.frame", row.names = c(NA,
-14L))
toString(unique(SNPA)) 会返回一个长度为 1 的字符向量,并通过 setdiff 与 block1 进行比较;然后我们使用 map2_dbl 将此过程应用于每一行。
#Here a toy example to understand setdiff, strsplit, and length
> length(strsplit(setdiff(toString(unique(df[1:5,'SNPA'])), df[1,'block1']),',')[[1]])
[1] 5
library(dplyr)
library(purrr)
# Count the distinct SNPs observed on each side of every (block1, block2)
# pair and keep only the pairs with at least two distinct SNPs on both sides.
# The result stays grouped by block1 and block2.
df %>%
  group_by(block1, block2) %>%
  mutate(
    # n_distinct() gives the per-group count directly. The previous
    # toString()/setdiff()/strsplit() round-trip yielded the same number for
    # comma-free SNP names, but setdiff() on two single strings is a no-op
    # unless they are equal, and in that case it returned character(0) and
    # the following [[1]] failed with "subscript out of bounds".
    # as.numeric() keeps A and B as doubles, as map2_dbl() returned them.
    A = as.numeric(n_distinct(SNPA)),
    B = as.numeric(n_distinct(SNPB))
  ) %>%
  # "At least two SNPs" means a count > 1 (the threshold my_filter() uses);
  # the earlier > 2 silently required three.
  filter(A > 1 & B > 1)
# A tibble: 8 x 8
# Groups: block1, block2 [2]
SNPA SNPB value block1 block2 score_T A B
<chr> <chr> <dbl> <chr> <chr> <int> <dbl> <dbl>
1 A1 A22 0.380 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 12 5 3
2 A2 A23 0.449 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 25 5 3
3 A3 A24 0.464 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 584 5 3
4 A4 A22 0.329 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 51 5 3
5 A5 A22 0.326 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 64 5 3
6 A22 A27 0.572 A22|A23|A24|A25 A27|A28|A29|A30|A31 77 3 3
7 A23 A28 0.416 A22|A23|A24|A25 A27|A28|A29|A30|A31 90 3 3
8 A24 A29 0.456 A22|A23|A24|A25 A27|A28|A29|A30|A31 103 3 3
我是 R 的新手。我正在尝试根据多个条件从 data.frame (df) 中过滤行:
我的一个例子data.frame: image of my df
df:
SNPA SNPB value block1 block2 score_T
A1 A22 0.379927 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 12
A2 A23 0.449074 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 25
A3 A24 0.464135 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 584
A4 A22 0.328866 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 51
A5 A22 0.326026 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 64
A22 A27 0.57169 A22|A23|A24|A25 A27|A28|A29|A30|A31 77
A23 A28 0.416178 A22|A23|A24|A25 A27|A28|A29|A30|A31 90
A24 A29 0.456144 A22|A23|A24|A25 A27|A28|A29|A30|A31 103
A34 A39 0.379927 A31|A32|A33|A34 A39|A40|A41|A42 116
A34 A40 0.759074 A31|A32|A33|A34 A39|A40|A41|A42 129
A34 A41 0.562303 A31|A32|A33|A34 A39|A40|A41|A42 142
A39 A57 0.322303 A39|A40|A41|A42 A52|A53|A54|A55|A56|A57|A58|A59|A60|A61 25
A40 A57 0.372303 A39|A40|A41|A42 A52|A53|A54|A55|A56|A57|A58|A59|A60|A61 198
A41 A57 0.562303 A39|A40|A41|A42 A52|A53|A54|A55|A56|A57|A58|A59|A60|A61 356
我想要的是使用 dplyr
只保留块(block1 和 block2)至少有两个 SNP 的行(来自 block1 的 SNPA 列和 block2 的 SNPB 列),并且删除包含 1 个 SNP 的块对(示例:第 9 至 14 行)。
想要的结果:result
SNPA SNPB value block1 block2 score_T
A1 A22 0.379927 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 12
A2 A23 0.449074 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 25
A3 A24 0.464135 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 584
A4 A22 0.328866 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 51
A5 A22 0.326026 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 64
A22 A27 0.57169 A22|A23|A24|A25 A27|A28|A29|A30|A31 77
A23 A28 0.416178 A22|A23|A24|A25 A27|A28|A29|A30|A31 90
A24 A29 0.456144 A22|A23|A24|A25 A27|A28|A29|A30|A31 103
你知道我该怎么做吗?
result <- df %>% group_by(block1, block2) %>% filter(...) %>% summarise(mean_s = mean(score_T), number = n())
谢谢。
一个稍慢的 base R + dplyr 解决方案。它的一个缺点是需要在过滤函数中手动指定 "blocks" 和 "snps" 对应的列名;这一步或许可以自动化。
#' Flag rows whose block string contains more than one known SNP
#'
#' @param df A data frame holding both the block column and the SNP column.
#' @param block Name (string) of the column with "|"-separated block members.
#' @param snp Name (string) of the column whose values count as known SNPs.
#' @return A logical vector with one entry per row of `df`: `TRUE` when more
#'   than one member of that row's block appears anywhere in `df[[snp]]`.
my_filter <- function(df, block, snp) {
  # The SNP column is looked up once; membership is tested against the whole
  # column, not just the current row.
  known_snps <- df[[snp]]
  # fixed = TRUE makes "|" a literal separator instead of regex alternation.
  members <- strsplit(df[[block]], "|", fixed = TRUE)
  hits <- vapply(
    members,
    function(ids) sum(ids %in% known_snps),
    integer(1)
  )
  hits > 1
}
# Keep the rows that pass the SNP-count check on both sides: block1 against
# SNPA and block2 against SNPB. Each check is evaluated on the full table.
df %>%
  filter(
    my_filter(., "block1", "SNPA") &
      my_filter(., "block2", "SNPB")
  )
SNPA SNPB value block1 block2 score_T
1 A1 A22 0.379927 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 12
2 A2 A23 0.449074 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 25
3 A3 A24 0.464135 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 584
4 A4 A22 0.328866 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 51
5 A5 A22 0.326026 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 64
6 A22 A27 0.571690 A22|A23|A24|A25 A27|A28|A29|A30|A31 77
7 A23 A28 0.416178 A22|A23|A24|A25 A27|A28|A29|A30|A31 90
8 A24 A29 0.456144 A22|A23|A24|A25 A27|A28|A29|A30|A31 103
数据:
# Reproducible copy of the example table (dput-style): a plain data.frame with
# 14 rows and the columns SNPA, SNPB, value, block1, block2 and score_T.
# block1 / block2 hold "|"-separated SNP identifiers; score_T is integer.
df <-structure(list(SNPA = c("A1", "A2", "A3", "A4", "A5", "A22",
"A23", "A24", "A34", "A34", "A34", "A39", "A40", "A41"), SNPB = c("A22",
"A23", "A24", "A22", "A22", "A27", "A28", "A29", "A39", "A40",
"A41", "A57", "A57", "A57"), value = c(0.379927, 0.449074, 0.464135,
0.328866, 0.326026, 0.57169, 0.416178, 0.456144, 0.379927, 0.759074,
0.562303, 0.322303, 0.372303, 0.562303), block1 = c("A1|A2|A3|A4|A5|A6",
"A1|A2|A3|A4|A5|A6", "A1|A2|A3|A4|A5|A6", "A1|A2|A3|A4|A5|A6",
"A1|A2|A3|A4|A5|A6", "A22|A23|A24|A25", "A22|A23|A24|A25", "A22|A23|A24|A25",
"A31|A32|A33|A34", "A31|A32|A33|A34", "A31|A32|A33|A34", "A39|A40|A41|A42",
"A39|A40|A41|A42", "A39|A40|A41|A42"), block2 = c("A22|A23|A24|A25",
"A22|A23|A24|A25", "A22|A23|A24|A25", "A22|A23|A24|A25", "A22|A23|A24|A25",
"A27|A28|A29|A30|A31", "A27|A28|A29|A30|A31", "A27|A28|A29|A30|A31",
"A39|A40|A41|A42", "A39|A40|A41|A42", "A39|A40|A41|A42", "A52|A53|A54|A55|A56|A57|A58|A59|A60|A61",
"A52|A53|A54|A55|A56|A57|A58|A59|A60|A61", "A52|A53|A54|A55|A56|A57|A58|A59|A60|A61"
), score_T = c(12L, 25L, 584L, 51L, 64L, 77L, 90L, 103L, 116L,
129L, 142L, 25L, 198L, 356L)), class = "data.frame", row.names = c(NA,
-14L))
toString(unique(SNPA)) 会返回一个长度为 1 的字符向量,并通过 setdiff 与 block1 进行比较;然后我们使用 map2_dbl 将此过程应用于每一行。
#Here a toy example to understand setdiff, strsplit, and length
> length(strsplit(setdiff(toString(unique(df[1:5,'SNPA'])), df[1,'block1']),',')[[1]])
[1] 5
library(dplyr)
library(purrr)
# Count the distinct SNPs observed on each side of every (block1, block2)
# pair and keep only the pairs with at least two distinct SNPs on both sides.
# The result stays grouped by block1 and block2.
df %>%
  group_by(block1, block2) %>%
  mutate(
    # n_distinct() gives the per-group count directly. The previous
    # toString()/setdiff()/strsplit() round-trip yielded the same number for
    # comma-free SNP names, but setdiff() on two single strings is a no-op
    # unless they are equal, and in that case it returned character(0) and
    # the following [[1]] failed with "subscript out of bounds".
    # as.numeric() keeps A and B as doubles, as map2_dbl() returned them.
    A = as.numeric(n_distinct(SNPA)),
    B = as.numeric(n_distinct(SNPB))
  ) %>%
  # "At least two SNPs" means a count > 1 (the threshold my_filter() uses);
  # the earlier > 2 silently required three.
  filter(A > 1 & B > 1)
# A tibble: 8 x 8
# Groups: block1, block2 [2]
SNPA SNPB value block1 block2 score_T A B
<chr> <chr> <dbl> <chr> <chr> <int> <dbl> <dbl>
1 A1 A22 0.380 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 12 5 3
2 A2 A23 0.449 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 25 5 3
3 A3 A24 0.464 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 584 5 3
4 A4 A22 0.329 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 51 5 3
5 A5 A22 0.326 A1|A2|A3|A4|A5|A6 A22|A23|A24|A25 64 5 3
6 A22 A27 0.572 A22|A23|A24|A25 A27|A28|A29|A30|A31 77 3 3
7 A23 A28 0.416 A22|A23|A24|A25 A27|A28|A29|A30|A31 90 3 3
8 A24 A29 0.456 A22|A23|A24|A25 A27|A28|A29|A30|A31 103 3 3