根据截止值提取值
Extract values based on the cut-off
老问题:我有一个数据框,我希望筛选并提取各列中含有介于 0.5 和 1.0 之间的值的行。有没有办法通过指定多个条件来提取?
谢谢,
MD
dput(Data)
structure(list(Genes = c("Gene_1", "Gene_2", "Gene_3",
"Gene_4", "Gene_5", "Gene_6", "Gene_7", "Gene_8",
"Gene_9", "Gene_10"), `S1` = c(0.883643926, 0.248614376,
0.518091486, 0.535221236, 0.415450436, -0.940323826, -0.723796576,
-0.824290276, NA, -0.806255146), `S2` = c(1.005757776, 1.005757776,
4.51601548, 3, 7.78620408, -0.706674058, -0.572657338, -0.686018538,
-0.514713298, -0.532390248), `S3` = c(7.798089, 9.2058061,
5.5408169, 1.52159119, 2.63042701, NA, 1.3857699, -0.152939869,
-0.050295909, -0.337659179), `S4` = c(1.41324408, 9.6038562,
1.71087962, 2.95921938, 4.82199712, 3.17140358, 1.15931318, NA,
1.58997338, 4.76858598), `S5` = c(-0.167945369, 1.41324408,
1.41324408, 0.741171721, 2.494610191, -0.532343489, -0.358607189,
-0.442774239, -0.103589789, 0.213156301)), row.names = c(NA,
10L), class = "data.frame")
重新编辑我的问题:
对造成的混淆表示歉意:之前的数据采用了不同的比例格式。我有一个数据框,我希望在所有列中筛选并提取 <= -1.0 或 > +1.0 的值。有没有办法通过指定多个条件来提取?
注意:对于每一列,响应基因是指检测到的 log2 值至少为 +/- 1 的一组基因。
Input data
dput(df)
structure(list(Genes = c("Gene_1", "Gene_2", "Gene_3", "Gene_4",
"Gene_5", "Gene_6", "Gene_7", "Gene_8", "Gene_9", "Gene_10"),
S1 = c(-8.83643926, -2.48614376, 5.18091486, 5.35221236,
0.415450436, -0.940323826, -0.723796576, -0.824290276, NA,
-8.06255146), S2 = c(1.005757776, 1.005757776, 4.51601548,
3, -7.78620408, -0.706674058, -0.572657338, -0.686018538,
-5.14713298, -0.532390248), S3 = c(7.798089, 9.2058061, 5.5408169,
-1.52159119, -2.63042701, NA, 1.3857699, -0.152939869, -0.050295909,
3.37659179), S4 = c(-1.41324408, 9.6038562, 1.71087962, 2.95921938,
4.82199712, 3.17140358, -1.15931318, NA, 1.58997338, -4.76858598
), S5 = c(-0.167945369, 1.41324408, 1.41324408, 0.741171721,
2.494610191, -0.532343489, -0.358607189, -0.442774239, -3.589789,
-2.13156301)), class = "data.frame", row.names = c(NA, -10L
))
#> Genes S1 S2 S3 S4 S5
#> 1 Gene_1 -8.8364393 1.0057578 7.79808900 -1.413244 -0.1679454
#> 2 Gene_2 -2.4861438 1.0057578 9.20580610 9.603856 1.4132441
#> 3 Gene_3 5.1809149 4.5160155 5.54081690 1.710880 1.4132441
#> 4 Gene_4 5.3522124 3.0000000 -1.52159119 2.959219 0.7411717
#> 5 Gene_5 0.4154504 -7.7862041 -2.63042701 4.821997 2.4946102
#> 6 Gene_6 -0.9403238 -0.7066741 NA 3.171404 -0.5323435
#> 7 Gene_7 -0.7237966 -0.5726573 1.38576990 -1.159313 -0.3586072
#> 8 Gene_8 -0.8242903 -0.6860185 -0.15293987 NA -0.4427742
#> 9 Gene_9 NA -5.1471330 -0.05029591 1.589973 -3.5897890
#> 10 Gene_10 -8.0625515 -0.5323902 3.37659179 -4.768586 -2.1315630
Expected results:
dput(df_filter)
structure(list(Genes = c("Gene_1", "Gene_2", "Gene_3", "Gene_4",
"Gene_5", "Gene_6", "Gene_7", "Gene_8", "Gene_9", "Gene_10"),
S1 = c(-8.83643926, -2.48614376, 5.18091486, 5.35221236,
NA, NA, NA, NA, NA, -8.06255146), S2 = c(1.005757776, 1.005757776,
4.51601548, 3, -7.78620408, NA, NA, NA, -5.14713298, NA),
S3 = c(7.798089, 9.2058061, 5.5408169, -1.52159119, -2.63042701,
NA, 1.3857699, NA, NA, 3.37659179), S4 = c(-1.41324408, 9.6038562,
1.71087962, 2.95921938, 4.82199712, 3.17140358, -1.15931318,
NA, 1.58997338, -4.76858598), S5 = c(NA, 1.41324408, 1.41324408,
NA, 2.494610191, NA, NA, NA, -3.589789, -2.13156301)), class = "data.frame", row.names = c(NA,
-10L))
#> Genes S1 S2 S3 S4 S5
#> 1 Gene_1 -8.836439 1.005758 7.798089 -1.413244 NA
#> 2 Gene_2 -2.486144 1.005758 9.205806 9.603856 1.413244
#> 3 Gene_3 5.180915 4.516015 5.540817 1.710880 1.413244
#> 4 Gene_4 5.352212 3.000000 -1.521591 2.959219 NA
#> 5 Gene_5 NA -7.786204 -2.630427 4.821997 2.494610
#> 6 Gene_6 NA NA NA 3.171404 NA
#> 7 Gene_7 NA NA 1.385770 -1.159313 NA
#> 8 Gene_8 NA NA NA NA NA
#> 9 Gene_9 NA -5.147133 NA 1.589973 -3.589789
#> 10 Gene_10 -8.062551 NA 3.376592 -4.768586 -2.131563
由 reprex 包 (v2.0.1) 创建于 2022-02-23
一种方法是删除第一列并使用 apply:
# Keep the rows in which at least one sample column (every column except the
# first) holds a value inside the closed interval [0.5, 1]. NA comparisons are
# ignored, so a row is selected only on a definite match.
Data[which(rowSums(Data[, -1] >= 0.5 & Data[, -1] <= 1, na.rm = TRUE) > 0), ]
Genes S1 S2 S3 S4 S5
1 Gene_1 0.8836439 1.005758 7.798089 1.413244 -0.1679454
3 Gene_3 0.5180915 4.516015 5.540817 1.710880 1.4132441
4 Gene_4 0.5352212 3.000000 1.521591 2.959219 0.7411717
编辑
# Work on the sample columns only (everything except the first column).
x <- Data[, -1]
# Blank out every value inside the half-open band (-1, 1]; values <= -1 or > 1
# are kept. Cells that are already NA stay NA.
x <- replace(x, x > -1 & x <= 1, NA)
# Re-attach the first column as a one-column data frame so its name survives.
cbind(Data[1], x)
使用 tidyverse:
# Reshape to long form (one row per gene/sample pair), replace every value
# inside the half-open band (-1, 1] with NA, then spread back to one column
# per sample.
Data %>%
  pivot_longer(cols = -Genes, names_to = "name", values_to = "value") %>%
  mutate(value = ifelse(-1 < value & value <= 1, NA, value)) %>%
  pivot_wider(id_cols = Genes, names_from = name, values_from = value)
这是一个 dplyr 方法。您可以在 filter 中使用 if_any,将条件应用于所有列。
library(dplyr)

# Keep the rows where any column other than `Genes` holds a value inside the
# closed interval [0.5, 1].
Data %>%
  filter(if_any(-Genes, function(v) v >= 0.5 & v <= 1))
Genes S1 S2 S3 S4 S5
1 Gene_1 0.8836439 1.005758 7.798089 1.413244 -0.1679454
2 Gene_3 0.5180915 4.516015 5.540817 1.710880 1.4132441
3 Gene_4 0.5352212 3.000000 1.521591 2.959219 0.7411717
老问题:我有一个数据框,我希望筛选并提取各列中含有介于 0.5 和 1.0 之间的值的行。有没有办法通过指定多个条件来提取?
谢谢,
MD
dput(Data)
structure(list(Genes = c("Gene_1", "Gene_2", "Gene_3",
"Gene_4", "Gene_5", "Gene_6", "Gene_7", "Gene_8",
"Gene_9", "Gene_10"), `S1` = c(0.883643926, 0.248614376,
0.518091486, 0.535221236, 0.415450436, -0.940323826, -0.723796576,
-0.824290276, NA, -0.806255146), `S2` = c(1.005757776, 1.005757776,
4.51601548, 3, 7.78620408, -0.706674058, -0.572657338, -0.686018538,
-0.514713298, -0.532390248), `S3` = c(7.798089, 9.2058061,
5.5408169, 1.52159119, 2.63042701, NA, 1.3857699, -0.152939869,
-0.050295909, -0.337659179), `S4` = c(1.41324408, 9.6038562,
1.71087962, 2.95921938, 4.82199712, 3.17140358, 1.15931318, NA,
1.58997338, 4.76858598), `S5` = c(-0.167945369, 1.41324408,
1.41324408, 0.741171721, 2.494610191, -0.532343489, -0.358607189,
-0.442774239, -0.103589789, 0.213156301)), row.names = c(NA,
10L), class = "data.frame")
重新编辑我的问题:
对造成的混淆表示歉意:之前的数据采用了不同的比例格式。我有一个数据框,我希望在所有列中筛选并提取 <= -1.0 或 > +1.0 的值。有没有办法通过指定多个条件来提取?
注意:对于每一列,响应基因是指检测到的 log2 值至少为 +/- 1 的一组基因。
Input data
dput(df)
structure(list(Genes = c("Gene_1", "Gene_2", "Gene_3", "Gene_4",
"Gene_5", "Gene_6", "Gene_7", "Gene_8", "Gene_9", "Gene_10"),
S1 = c(-8.83643926, -2.48614376, 5.18091486, 5.35221236,
0.415450436, -0.940323826, -0.723796576, -0.824290276, NA,
-8.06255146), S2 = c(1.005757776, 1.005757776, 4.51601548,
3, -7.78620408, -0.706674058, -0.572657338, -0.686018538,
-5.14713298, -0.532390248), S3 = c(7.798089, 9.2058061, 5.5408169,
-1.52159119, -2.63042701, NA, 1.3857699, -0.152939869, -0.050295909,
3.37659179), S4 = c(-1.41324408, 9.6038562, 1.71087962, 2.95921938,
4.82199712, 3.17140358, -1.15931318, NA, 1.58997338, -4.76858598
), S5 = c(-0.167945369, 1.41324408, 1.41324408, 0.741171721,
2.494610191, -0.532343489, -0.358607189, -0.442774239, -3.589789,
-2.13156301)), class = "data.frame", row.names = c(NA, -10L
))
#> Genes S1 S2 S3 S4 S5
#> 1 Gene_1 -8.8364393 1.0057578 7.79808900 -1.413244 -0.1679454
#> 2 Gene_2 -2.4861438 1.0057578 9.20580610 9.603856 1.4132441
#> 3 Gene_3 5.1809149 4.5160155 5.54081690 1.710880 1.4132441
#> 4 Gene_4 5.3522124 3.0000000 -1.52159119 2.959219 0.7411717
#> 5 Gene_5 0.4154504 -7.7862041 -2.63042701 4.821997 2.4946102
#> 6 Gene_6 -0.9403238 -0.7066741 NA 3.171404 -0.5323435
#> 7 Gene_7 -0.7237966 -0.5726573 1.38576990 -1.159313 -0.3586072
#> 8 Gene_8 -0.8242903 -0.6860185 -0.15293987 NA -0.4427742
#> 9 Gene_9 NA -5.1471330 -0.05029591 1.589973 -3.5897890
#> 10 Gene_10 -8.0625515 -0.5323902 3.37659179 -4.768586 -2.1315630
Expected results:
dput(df_filter)
structure(list(Genes = c("Gene_1", "Gene_2", "Gene_3", "Gene_4",
"Gene_5", "Gene_6", "Gene_7", "Gene_8", "Gene_9", "Gene_10"),
S1 = c(-8.83643926, -2.48614376, 5.18091486, 5.35221236,
NA, NA, NA, NA, NA, -8.06255146), S2 = c(1.005757776, 1.005757776,
4.51601548, 3, -7.78620408, NA, NA, NA, -5.14713298, NA),
S3 = c(7.798089, 9.2058061, 5.5408169, -1.52159119, -2.63042701,
NA, 1.3857699, NA, NA, 3.37659179), S4 = c(-1.41324408, 9.6038562,
1.71087962, 2.95921938, 4.82199712, 3.17140358, -1.15931318,
NA, 1.58997338, -4.76858598), S5 = c(NA, 1.41324408, 1.41324408,
NA, 2.494610191, NA, NA, NA, -3.589789, -2.13156301)), class = "data.frame", row.names = c(NA,
-10L))
#> Genes S1 S2 S3 S4 S5
#> 1 Gene_1 -8.836439 1.005758 7.798089 -1.413244 NA
#> 2 Gene_2 -2.486144 1.005758 9.205806 9.603856 1.413244
#> 3 Gene_3 5.180915 4.516015 5.540817 1.710880 1.413244
#> 4 Gene_4 5.352212 3.000000 -1.521591 2.959219 NA
#> 5 Gene_5 NA -7.786204 -2.630427 4.821997 2.494610
#> 6 Gene_6 NA NA NA 3.171404 NA
#> 7 Gene_7 NA NA 1.385770 -1.159313 NA
#> 8 Gene_8 NA NA NA NA NA
#> 9 Gene_9 NA -5.147133 NA 1.589973 -3.589789
#> 10 Gene_10 -8.062551 NA 3.376592 -4.768586 -2.131563
由 reprex 包 (v2.0.1) 创建于 2022-02-23
一种方法是删除第一列并使用 apply:
# Keep the rows in which at least one sample column (every column except the
# first) holds a value inside the closed interval [0.5, 1]. NA comparisons are
# ignored, so a row is selected only on a definite match.
Data[which(rowSums(Data[, -1] >= 0.5 & Data[, -1] <= 1, na.rm = TRUE) > 0), ]
Genes S1 S2 S3 S4 S5
1 Gene_1 0.8836439 1.005758 7.798089 1.413244 -0.1679454
3 Gene_3 0.5180915 4.516015 5.540817 1.710880 1.4132441
4 Gene_4 0.5352212 3.000000 1.521591 2.959219 0.7411717
编辑
# Work on the sample columns only (everything except the first column).
x <- Data[, -1]
# Blank out every value inside the half-open band (-1, 1]; values <= -1 or > 1
# are kept. Cells that are already NA stay NA.
x <- replace(x, x > -1 & x <= 1, NA)
# Re-attach the first column as a one-column data frame so its name survives.
cbind(Data[1], x)
使用 tidyverse:
# Reshape to long form (one row per gene/sample pair), replace every value
# inside the half-open band (-1, 1] with NA, then spread back to one column
# per sample.
Data %>%
  pivot_longer(cols = -Genes, names_to = "name", values_to = "value") %>%
  mutate(value = ifelse(-1 < value & value <= 1, NA, value)) %>%
  pivot_wider(id_cols = Genes, names_from = name, values_from = value)
这是一个 dplyr 方法。您可以在 filter 中使用 if_any,将条件应用于所有列。
library(dplyr)

# Keep the rows where any column other than `Genes` holds a value inside the
# closed interval [0.5, 1].
Data %>%
  filter(if_any(-Genes, function(v) v >= 0.5 & v <= 1))
Genes S1 S2 S3 S4 S5
1 Gene_1 0.8836439 1.005758 7.798089 1.413244 -0.1679454
2 Gene_3 0.5180915 4.516015 5.540817 1.710880 1.4132441
3 Gene_4 0.5352212 3.000000 1.521591 2.959219 0.7411717