根据截止值提取值

Extract values based on the cut-off

老问题:我有一个数据框,我希望在所有列中筛选,并提取含有介于 0.5 和 1.0 之间的值的行。有没有办法通过指定多个条件来提取?

谢谢,

MD

dput(Data)
structure(list(Genes = c("Gene_1", "Gene_2", "Gene_3", 
                         "Gene_4", "Gene_5", "Gene_6", "Gene_7", "Gene_8", 
                         "Gene_9", "Gene_10"), `S1` = c(0.883643926, 0.248614376, 
                                                        0.518091486, 0.535221236, 0.415450436, -0.940323826, -0.723796576, 
                                                        -0.824290276, NA, -0.806255146), `S2` = c(1.005757776, 1.005757776, 
                                                                                                  4.51601548, 3, 7.78620408, -0.706674058, -0.572657338, -0.686018538, 
                                                                                                  -0.514713298, -0.532390248), `S3` = c(7.798089, 9.2058061, 
                                                                                                                                        5.5408169, 1.52159119, 2.63042701, NA, 1.3857699, -0.152939869, 
                                                                                                                                        -0.050295909, -0.337659179), `S4` = c(1.41324408, 9.6038562, 
                                                                                                                                                                              1.71087962, 2.95921938, 4.82199712, 3.17140358, 1.15931318, NA, 
                                                                                                                                                                              1.58997338, 4.76858598), `S5` = c(-0.167945369, 1.41324408, 
                                                                                                                                                                                                                1.41324408, 0.741171721, 2.494610191, -0.532343489, -0.358607189, 
                                                                                                                                                                                                                -0.442774239, -0.103589789, 0.213156301)), row.names = c(NA, 
                                                                                                                                                                                                                                                                         10L), class = "data.frame")

重新编辑我的问题:

对造成的混淆表示歉意:之前的数据采用了不同的比例格式。我有一个数据框,我希望在所有列中筛选并提取 <= -1.0 或 >= +1.0 的值。有没有办法通过指定多个条件来提取?

注意:对于每一列,响应基因是指检测到的 log2 值至少为 +/- 1(即 <= -1 或 >= +1)的一组基因。

Input data
dput(df)
structure(list(Genes = c("Gene_1", "Gene_2", "Gene_3", "Gene_4", 
                         "Gene_5", "Gene_6", "Gene_7", "Gene_8", "Gene_9", "Gene_10"), 
               S1 = c(-8.83643926, -2.48614376, 5.18091486, 5.35221236, 
                      0.415450436, -0.940323826, -0.723796576, -0.824290276, NA, 
                      -8.06255146), S2 = c(1.005757776, 1.005757776, 4.51601548, 
                                           3, -7.78620408, -0.706674058, -0.572657338, -0.686018538, 
                                           -5.14713298, -0.532390248), S3 = c(7.798089, 9.2058061, 5.5408169, 
                                                                              -1.52159119, -2.63042701, NA, 1.3857699, -0.152939869, -0.050295909, 
                                                                              3.37659179), S4 = c(-1.41324408, 9.6038562, 1.71087962, 2.95921938, 
                                                                                                  4.82199712, 3.17140358, -1.15931318, NA, 1.58997338, -4.76858598
                                                                              ), S5 = c(-0.167945369, 1.41324408, 1.41324408, 0.741171721, 
                                                                                        2.494610191, -0.532343489, -0.358607189, -0.442774239, -3.589789, 
                                                                                        -2.13156301)), class = "data.frame", row.names = c(NA, -10L
                                                                                        ))
#>      Genes         S1         S2          S3        S4         S5
#> 1   Gene_1 -8.8364393  1.0057578  7.79808900 -1.413244 -0.1679454
#> 2   Gene_2 -2.4861438  1.0057578  9.20580610  9.603856  1.4132441
#> 3   Gene_3  5.1809149  4.5160155  5.54081690  1.710880  1.4132441
#> 4   Gene_4  5.3522124  3.0000000 -1.52159119  2.959219  0.7411717
#> 5   Gene_5  0.4154504 -7.7862041 -2.63042701  4.821997  2.4946102
#> 6   Gene_6 -0.9403238 -0.7066741          NA  3.171404 -0.5323435
#> 7   Gene_7 -0.7237966 -0.5726573  1.38576990 -1.159313 -0.3586072
#> 8   Gene_8 -0.8242903 -0.6860185 -0.15293987        NA -0.4427742
#> 9   Gene_9         NA -5.1471330 -0.05029591  1.589973 -3.5897890
#> 10 Gene_10 -8.0625515 -0.5323902  3.37659179 -4.768586 -2.1315630


Expected results: 
dput(df_filter)
structure(list(Genes = c("Gene_1", "Gene_2", "Gene_3", "Gene_4", 
                         "Gene_5", "Gene_6", "Gene_7", "Gene_8", "Gene_9", "Gene_10"), 
               S1 = c(-8.83643926, -2.48614376, 5.18091486, 5.35221236, 
                      NA, NA, NA, NA, NA, -8.06255146), S2 = c(1.005757776, 1.005757776, 
                                                               4.51601548, 3, -7.78620408, NA, NA, NA, -5.14713298, NA), 
               S3 = c(7.798089, 9.2058061, 5.5408169, -1.52159119, -2.63042701, 
                      NA, 1.3857699, NA, NA, 3.37659179), S4 = c(-1.41324408, 9.6038562, 
                                                                 1.71087962, 2.95921938, 4.82199712, 3.17140358, -1.15931318, 
                                                                 NA, 1.58997338, -4.76858598), S5 = c(NA, 1.41324408, 1.41324408, 
                                                                                                      NA, 2.494610191, NA, NA, NA, -3.589789, -2.13156301)), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                 -10L))
#>      Genes        S1        S2        S3        S4        S5
#> 1   Gene_1 -8.836439  1.005758  7.798089 -1.413244        NA
#> 2   Gene_2 -2.486144  1.005758  9.205806  9.603856  1.413244
#> 3   Gene_3  5.180915  4.516015  5.540817  1.710880  1.413244
#> 4   Gene_4  5.352212  3.000000 -1.521591  2.959219        NA
#> 5   Gene_5        NA -7.786204 -2.630427  4.821997  2.494610
#> 6   Gene_6        NA        NA        NA  3.171404        NA
#> 7   Gene_7        NA        NA  1.385770 -1.159313        NA
#> 8   Gene_8        NA        NA        NA        NA        NA
#> 9   Gene_9        NA -5.147133        NA  1.589973 -3.589789
#> 10 Gene_10 -8.062551        NA  3.376592 -4.768586 -2.131563

reprex package (v2.0.1)

创建于 2022-02-23

一种方法是删除第一列并使用 apply:

# Flag each row in which at least one sample column (everything except the
# first, identifier column) lies within [0.5, 1]; which() drops rows whose
# flag is NA, so rows with missing values are not selected by accident.
row_has_hit <- apply(
  Data[, -1],
  1,
  function(row_values) any(row_values >= 0.5 & row_values <= 1)
)
Data[which(row_has_hit), ]

   Genes        S1       S2       S3       S4         S5
1 Gene_1 0.8836439 1.005758 7.798089 1.413244 -0.1679454
3 Gene_3 0.5180915 4.516015 5.540817 1.710880  1.4132441
4 Gene_4 0.5352212 3.000000 1.521591 2.959219  0.7411717

编辑

# Blank out (set to NA) every value strictly inside (-1, 1) so that only
# values <= -1 or >= 1 remain. The boundary values -1 and +1 are both kept,
# matching the "at least +/- 1" requirement.
x <- Data[, -1]
# Cells that are already NA produce NA in the mask and are left untouched.
x[x > -1 & x < 1] <- NA
# Re-attach the identifier column; drop = FALSE keeps it as a one-column
# data frame so its column name is preserved in the result.
cbind(Data[, 1, drop = FALSE], x)

使用 tidyverse:

library(dplyr)
library(tidyr)

# Reshape to long form, blank out values strictly inside (-1, 1) so that only
# values <= -1 or >= 1 survive (both boundaries are kept), then reshape back
# to one row per gene.
Data %>%
  pivot_longer(cols = -Genes) %>%
  # Missing inputs stay missing: ifelse() returns NA when the test is NA.
  mutate(value = ifelse(value > -1 & value < 1, NA, value)) %>%
  pivot_wider(id_cols = Genes)

这是一个 dplyr 方法。

您可以在 filter 中使用 if_any 以应用于所有列。

library(dplyr)

# Keep every row in which at least one column other than `Genes` holds a
# value between 0.5 and 1.
Data %>%
  filter(
    if_any(!Genes, function(col) between(col, 0.5, 1))
  )

   Genes        S1       S2       S3       S4         S5
1 Gene_1 0.8836439 1.005758 7.798089 1.413244 -0.1679454
2 Gene_3 0.5180915 4.516015 5.540817 1.710880  1.4132441
3 Gene_4 0.5352212 3.000000 1.521591 2.959219  0.7411717