在 R 中的嵌套数据帧列表上进行过滤和重新分类

Filter and re-categorize on a nested list of dataframes in R

我有一个包含 n 个元素的列表,每个元素都包含一个数据框。我们以start_list为例:

# Example input: a named list holding one tibble-classed data frame per gene
# ID. Every column, including `expr` and the genotype columns, is stored as
# a character vector.
start_list <- list(
  ENSG0000014 = structure(
    list(
      name = c("E-1122O", "E-11EM3", "E-11EMC", "E-1442O", "E-1132O"),
      ENSG = rep("ENSG0000014", 5),
      expr = c(
        " 9.940670e-02", " 1.289670e-01", "-7.394904e-03",
        " 9.940670e-02", " 9.940670e-02"
      ),
      `1_43222779_A_G_b37` = c("1", "1", "2", "1", "0"),
      `1_43222856_A_G_b37` = c("0", "0", "0", "1", "1"),
      `1_43223126_C_T_b37` = c("0", "1", "0", "1", "2"),
      `1_43223317_T_C_b37` = c("1", "0", "0", "2", "1")
    ),
    row.names = c(NA, -5L),
    class = c("tbl_df", "tbl", "data.frame")
  ),
  ENSG0000015 = structure(
    list(
      name = c("E-1122O", "E-11EM3", "E-11EMC", "E-1442O", "E-1132O"),
      ENSG = rep("ENSG0000015", 5),
      expr = c(
        " 9.940670e-02", " 1.289670e-01", "-7.394904e-03",
        " 9.940670e-02", " 1.289670e-01"
      ),
      `1_43222779_A_G_b37` = c("0", "1", "0", "1", "2"),
      `1_43222856_A_G_b37` = c("1", "1", "2", "1", "0")
    ),
    row.names = c(NA, -5L),
    class = c("tbl_df", "tbl", "data.frame")
  )
)

此外,还有一个名为 set_id 的数据框,其中 IID 列包含 start_list 各数据框 name 列中的个体,set_1 至 set_5 五列用 TRUE/FALSE 标记每个个体在各分组中的取值:

# Lookup table: one row per individual (`IID`) and one logical column per set
# (`set_1` .. `set_5`).
set_id <- data.frame(
  IID = c("E-1122O", "E-11EM3", "E-11EMC", "E-1442O", "E-1132O"),
  set_1 = c(TRUE, FALSE, TRUE, TRUE, TRUE),
  set_2 = c(TRUE, TRUE, FALSE, FALSE, TRUE),
  set_3 = c(FALSE, TRUE, TRUE, FALSE, TRUE),
  set_4 = c(TRUE, FALSE, TRUE, TRUE, FALSE),
  set_5 = c(TRUE, FALSE, FALSE, TRUE, TRUE),
  stringsAsFactors = FALSE
)

我需要根据这些个体分组来过滤 start_list:对每个分组,只保留取值为 FALSE 的 IID 所对应的行,同时删除 start_list 中各数据框的第二列和第三列(ENSG 和 expr),并创建一个新列表 list_prime_out:

# Desired result, written out by hand: for each gene in `start_list`, a list of
# five tibble-classed data frames (set_1 .. set_5) holding `name` plus the
# genotype columns, with `ENSG` and `expr` removed.
# NOTE(review): this literal is illustrative only and is not consistent with
# `start_list`: it contains the ID "E-14420" (digit zero) where the input has
# "E-1442O", the set_3 entries mix length-1 and length-2 columns under
# row.names of length 2, and ENSG0000015 lists four genotype columns although
# the input defines only two for that gene.
list_prime_out <- list(ENSG0000014 = list(set_1 = structure(list(name = "E-11EM3", 
    `1_43222779_A_G_b37` = "1", `1_43222856_A_G_b37` = "0", `1_43223126_C_T_b37` = "0", 
    `1_43223317_T_C_b37` = "1"), row.names = c(NA, -1L), class = c("tbl_df", 
"tbl", "data.frame")), set_2 = structure(list(name = c("E-11EMC", 
"E-14420"), `1_43222779_A_G_b37` = c("1", "0"), `1_43222856_A_G_b37` = c("1", 
"1"), `1_43223126_C_T_b37` = c("2", "0"), `1_43223317_T_C_b37` = c("2", 
"0")), row.names = c(NA, -2L), class = c("tbl_df", "tbl", "data.frame"
)), set_3 = structure(list(name = c("E-1122O", "E-1442O"), `1_43222779_A_G_b37` = "1", 
    `1_43222856_A_G_b37` = "0", `1_43223126_C_T_b37` = c("1", 
    "1"), `1_43223317_T_C_b37` = c("1", "2")), row.names = c(NA, 
-2L), class = c("tbl_df", "tbl", "data.frame")), set_4 = structure(list(
    name = c("E-11EM3", "E-1132O"), `1_43222779_A_G_b37` = c("1", 
    "0"), `1_43222856_A_G_b37` = c("1", "1"), `1_43223126_C_T_b37` = c("0", 
    "0"), `1_43223317_T_C_b37` = c("0", "0")), row.names = c(NA, 
-2L), class = c("tbl_df", "tbl", "data.frame")), set_5 = structure(list(
    name = c("E-11EM3", "E-11EMC"), `1_43222779_A_G_b37` = c("1", 
    "0"), `1_43222856_A_G_b37` = c("1", "1"), `1_43223126_C_T_b37` = c("2", 
    "0"), `1_43223317_T_C_b37` = c("1", "2")), row.names = c(NA, 
-2L), class = c("tbl_df", "tbl", "data.frame"))), ENSG0000015 = list(
    set_1 = structure(list(name = "E-11EM3", `1_43222779_A_G_b37` = "1", 
        `1_43222856_A_G_b37` = "0", `1_43223126_C_T_b37` = "0", 
        `1_43223317_T_C_b37` = "1"), row.names = c(NA, -1L), class = c("tbl_df", 
    "tbl", "data.frame")), set_2 = structure(list(name = c("E-11EMC", 
    "E-14420"), `1_43222779_A_G_b37` = c("1", "0"), `1_43222856_A_G_b37` = c("1", 
    "1"), `1_43223126_C_T_b37` = c("2", "0"), `1_43223317_T_C_b37` = c("2", 
    "0")), row.names = c(NA, -2L), class = c("tbl_df", "tbl", 
    "data.frame")), set_3 = structure(list(name = c("E-1122O", 
    "E-1442O"), `1_43222779_A_G_b37` = "1", `1_43222856_A_G_b37` = "0", 
        `1_43223126_C_T_b37` = c("1", "1"), `1_43223317_T_C_b37` = c("1", 
        "2")), row.names = c(NA, -2L), class = c("tbl_df", "tbl", 
    "data.frame")), set_4 = structure(list(name = c("E-11EM3", 
    "E-1132O"), `1_43222779_A_G_b37` = c("1", "0"), `1_43222856_A_G_b37` = c("1", 
    "1"), `1_43223126_C_T_b37` = c("0", "0"), `1_43223317_T_C_b37` = c("0", 
    "0")), row.names = c(NA, -2L), class = c("tbl_df", "tbl", 
    "data.frame")), set_5 = structure(list(name = c("E-11EM3", 
    "E-11EMC"), `1_43222779_A_G_b37` = c("1", "0"), `1_43222856_A_G_b37` = c("1", 
    "1"), `1_43223126_C_T_b37` = c("2", "0"), `1_43223317_T_C_b37` = c("1", 
    "2")), row.names = c(NA, -2L), class = c("tbl_df", "tbl", 
    "data.frame"))))

str(list_prime_out)
List of 2
 $ ENSG0000014:List of 5
  ..$ set_1: tibble [1 × 5] (S3: tbl_df/tbl/data.frame)
  .. ..$ name              : chr "E-11EM3"
  .. ..$ 1_43222779_A_G_b37: chr "1"
  .. ..$ 1_43222856_A_G_b37: chr "0"
  .. ..$ 1_43223126_C_T_b37: chr "0"
  .. ..$ 1_43223317_T_C_b37: chr "1"
  ..$ set_2: tibble [2 × 5] (S3: tbl_df/tbl/data.frame)
  .. ..$ name              : chr [1:2] "E-11EMC" "E-14420"
  .. ..$ 1_43222779_A_G_b37: chr [1:2] "1" "0"
  .. ..$ 1_43222856_A_G_b37: chr [1:2] "1" "1"
  .. ..$ 1_43223126_C_T_b37: chr [1:2] "2" "0"
  .. ..$ 1_43223317_T_C_b37: chr [1:2] "2" "0"
  ..$ set_3: tibble [2 × 5] (S3: tbl_df/tbl/data.frame)
  .. ..$ name              : chr [1:2] "E-1122O" "E-1442O"
  .. ..$ 1_43222779_A_G_b37: chr "1"
  .. ..$ 1_43222856_A_G_b37: chr "0"
  .. ..$ 1_43223126_C_T_b37: chr [1:2] "1" "1"
  .. ..$ 1_43223317_T_C_b37: chr [1:2] "1" "2"
  ..$ set_4: tibble [2 × 5] (S3: tbl_df/tbl/data.frame)
  .. ..$ name              : chr [1:2] "E-11EM3" "E-1132O"
  .. ..$ 1_43222779_A_G_b37: chr [1:2] "1" "0"
  .. ..$ 1_43222856_A_G_b37: chr [1:2] "1" "1"
  .. ..$ 1_43223126_C_T_b37: chr [1:2] "0" "0"
  .. ..$ 1_43223317_T_C_b37: chr [1:2] "0" "0"
  ..$ set_5: tibble [2 × 5] (S3: tbl_df/tbl/data.frame)
  .. ..$ name              : chr [1:2] "E-11EM3" "E-11EMC"
  .. ..$ 1_43222779_A_G_b37: chr [1:2] "1" "0"
  .. ..$ 1_43222856_A_G_b37: chr [1:2] "1" "1"
  .. ..$ 1_43223126_C_T_b37: chr [1:2] "2" "0"
  .. ..$ 1_43223317_T_C_b37: chr [1:2] "1" "2"
 $ ENSG0000015:List of 5
  ..$ set_1: tibble [1 × 5] (S3: tbl_df/tbl/data.frame)
  .. ..$ name              : chr "E-11EM3"
  .. ..$ 1_43222779_A_G_b37: chr "1"
  .. ..$ 1_43222856_A_G_b37: chr "0"
  .. ..$ 1_43223126_C_T_b37: chr "0"
  .. ..$ 1_43223317_T_C_b37: chr "1"
  ..$ set_2: tibble [2 × 5] (S3: tbl_df/tbl/data.frame)
  .. ..$ name              : chr [1:2] "E-11EMC" "E-14420"
  .. ..$ 1_43222779_A_G_b37: chr [1:2] "1" "0"
  .. ..$ 1_43222856_A_G_b37: chr [1:2] "1" "1"
  .. ..$ 1_43223126_C_T_b37: chr [1:2] "2" "0"
  .. ..$ 1_43223317_T_C_b37: chr [1:2] "2" "0"
  ..$ set_3: tibble [2 × 5] (S3: tbl_df/tbl/data.frame)
  .. ..$ name              : chr [1:2] "E-1122O" "E-1442O"
  .. ..$ 1_43222779_A_G_b37: chr "1"
  .. ..$ 1_43222856_A_G_b37: chr "0"
  .. ..$ 1_43223126_C_T_b37: chr [1:2] "1" "1"
  .. ..$ 1_43223317_T_C_b37: chr [1:2] "1" "2"
  ..$ set_4: tibble [2 × 5] (S3: tbl_df/tbl/data.frame)
  .. ..$ name              : chr [1:2] "E-11EM3" "E-1132O"
  .. ..$ 1_43222779_A_G_b37: chr [1:2] "1" "0"
  .. ..$ 1_43222856_A_G_b37: chr [1:2] "1" "1"
  .. ..$ 1_43223126_C_T_b37: chr [1:2] "0" "0"
  .. ..$ 1_43223317_T_C_b37: chr [1:2] "0" "0"
  ..$ set_5: tibble [2 × 5] (S3: tbl_df/tbl/data.frame)
  .. ..$ name              : chr [1:2] "E-11EM3" "E-11EMC"
  .. ..$ 1_43222779_A_G_b37: chr [1:2] "1" "0"
  .. ..$ 1_43222856_A_G_b37: chr [1:2] "1" "1"
  .. ..$ 1_43223126_C_T_b37: chr [1:2] "2" "0"
  .. ..$ 1_43223317_T_C_b37: chr [1:2] "1" "2"

非常感谢你的帮助。

这是使用 {dplyr} 和 {purrr} 的解决方案:

library(dplyr)
library(purrr)

# Per set column, collect the `IID`s whose flag is FALSE: rows flagged TRUE are
# blanked to NA, then the NAs are dropped. The result is a list named
# set_1 .. set_5, each element a character vector of IDs.
set_id_list <- map(
  transmute(set_id, across(set_1:set_5, ~ if_else(.x, NA_character_, IID))),
  function(ids) ids[!is.na(ids)]
)

# Two passes over `start_list`:
#   1. strip the ENSG..expr column range from every data frame;
#   2. for each stripped data frame, build one filtered copy per entry of
#      `set_id_list`, keeping only rows whose `name` is among that set's IDs.
# The result is a list keyed by gene, each element a list keyed by set.
list_prime_out <- start_list %>%
  map(~ select(.x, !ENSG:expr)) %>%
  map(function(genotypes) {
    map(set_id_list, function(set_iids) filter(genotypes, name %in% set_iids))
  })

输出:

# > list_prime_out
$ENSG0000014
$ENSG0000014$set_1
# A tibble: 1 x 5
  name    `1_43222779_A_~` `1_43222856_A_~` `1_43223126_C_~` `1_43223317_T_~`
  <chr>   <chr>            <chr>            <chr>            <chr>           
1 E-11EM3 1                0                1                0               

$ENSG0000014$set_2
# A tibble: 2 x 5
  name    `1_43222779_A_~` `1_43222856_A_~` `1_43223126_C_~` `1_43223317_T_~`
  <chr>   <chr>            <chr>            <chr>            <chr>           
1 E-11EMC 2                0                0                0               
2 E-1442O 1                1                1                2               

$ENSG0000014$set_3
# A tibble: 2 x 5
  name    `1_43222779_A_~` `1_43222856_A_~` `1_43223126_C_~` `1_43223317_T_~`
  <chr>   <chr>            <chr>            <chr>            <chr>           
1 E-1122O 1                0                0                1               
2 E-1442O 1                1                1                2               

$ENSG0000014$set_4
# A tibble: 2 x 5
  name    `1_43222779_A_~` `1_43222856_A_~` `1_43223126_C_~` `1_43223317_T_~`
  <chr>   <chr>            <chr>            <chr>            <chr>           
1 E-11EM3 1                0                1                0               
2 E-1132O 0                1                2                1               

$ENSG0000014$set_5
# A tibble: 2 x 5
  name    `1_43222779_A_~` `1_43222856_A_~` `1_43223126_C_~` `1_43223317_T_~`
  <chr>   <chr>            <chr>            <chr>            <chr>           
1 E-11EM3 1                0                1                0               
2 E-11EMC 2                0                0                0               


$ENSG0000015
$ENSG0000015$set_1
# A tibble: 1 x 3
  name    `1_43222779_A_G_b37` `1_43222856_A_G_b37`
  <chr>   <chr>                <chr>               
1 E-11EM3 1                    1                   

$ENSG0000015$set_2
# A tibble: 2 x 3
  name    `1_43222779_A_G_b37` `1_43222856_A_G_b37`
  <chr>   <chr>                <chr>               
1 E-11EMC 0                    2                   
2 E-1442O 1                    1                   

$ENSG0000015$set_3
# A tibble: 2 x 3
  name    `1_43222779_A_G_b37` `1_43222856_A_G_b37`
  <chr>   <chr>                <chr>               
1 E-1122O 0                    1                   
2 E-1442O 1                    1                   

$ENSG0000015$set_4
# A tibble: 2 x 3
  name    `1_43222779_A_G_b37` `1_43222856_A_G_b37`
  <chr>   <chr>                <chr>               
1 E-11EM3 1                    1                   
2 E-1132O 2                    0                   

$ENSG0000015$set_5
# A tibble: 2 x 3
  name    `1_43222779_A_G_b37` `1_43222856_A_G_b37`
  <chr>   <chr>                <chr>               
1 E-11EM3 1                    1                   
2 E-11EMC 0                    2                   

由 reprex 包 (v2.0.1) 创建于 2022-03-02