比较不同数据框中的两列,并在 R 中创建另一列

Compare two columns of distinct data frames and create another column in R

我有以下数据框:

df1

> head(df1)
  data.source                  data.term_name data.term_id data.adjusted_p_value data.negative_log10_of_adjusted_p_value
1       GO:MF              catalytic activity   GO:0003824         4.940656e-324                                323.3062
2       GO:MF              molecular_function   GO:0003674         4.940656e-324                                323.3062
3       GO:MF                         binding   GO:0005488         4.940656e-324                                323.3062
4       GO:MF                     ion binding   GO:0043167         4.940656e-324                                323.3062
5       GO:MF organic cyclic compound binding   GO:0097159         9.573609e-319                                318.0189
6       GO:MF   heterocyclic compound binding   GO:1901363         4.684222e-318                                317.3294

df2

> head(filter_tig2)
# A tibble: 6 × 5
# Groups:   g0.seq_id, g0.product [6]
  g0.seq_id g0.product                                     g0.ontology_term                                    name_count process
  <chr>     <chr>                                          <chr>                                                    <int> <lgl>  
1 000000F   3-hydroxy-3-methylglutaryl coenzyme A synthase GO:0003824,GO:0008152,GO:0004421,GO:0003824                  1 NA     
2 000000F   3-isopropylmalate dehydratase                  GO:0055114,GO:0009098,GO:0016616,GO:0003862,GO:000…          1 NA     
3 000000F   40S ribosomal protein S17                      GO:0006412,GO:0003735,GO:0005622,GO:0005840                  1 NA     
4 000000F   40S ribosomal protein S20                      GO:0006412,GO:0003735,GO:0003723,GO:0005840,GO:001…          1 NA     
5 000000F   40S ribosomal protein S26                      GO:0006412,GO:0003735,GO:0005622,GO:0005840                  1 NA     
6 000000F   40S ribosomal protein S29                      GO:0003735,GO:0006412,GO:0005840,GO:0005622                  1 NA  

我需要在 df2 中创建一个新列,其值为 df1 中与 data.term_id 相对应的 data.source 值。

df1

> dput(head(df1))
structure(list(data.source = c("GO:MF", "GO:MF", "GO:MF", "GO:MF", 
"GO:MF", "GO:MF"), data.term_name = c("catalytic activity", "molecular_function", 
"binding", "ion binding", "organic cyclic compound binding", 
"heterocyclic compound binding"), data.term_id = c("GO:0003824", 
"GO:0003674", "GO:0005488", "GO:0043167", "GO:0097159", "GO:1901363"
), data.adjusted_p_value = c(4.94065645841247e-324, 4.94065645841247e-324, 
4.94065645841247e-324, 4.94065645841247e-324, 9.573608832595e-319, 
4.68422156625148e-318), data.negative_log10_of_adjusted_p_value = c(323.306215343116, 
323.306215343116, 323.306215343116, 323.306215343116, 318.018924321299, 
317.329362570746), data.term_size = c(3786L, 6157L, 3549L, 2192L, 
2116L, 2112L), data.query_size = c(7587L, 7587L, 7587L, 7587L, 
7587L, 7587L), data.intersection_size = c(3786L, 6153L, 3549L, 
2192L, 2116L, 2112L), data.effective_domain_size = c(10286L, 
10286L, 10286L, 10286L, 10286L, 10286L)), row.names = c(NA, 6L
), class = "data.frame")

df2

> dput(head(df2))
structure(list(g0.seq_id = c("000000F", "000000F", "000000F", 
"000000F", "000000F", "000000F"), g0.product = c("3-hydroxy-3-methylglutaryl coenzyme A synthase", 
"3-isopropylmalate dehydratase", "40S ribosomal protein S17", 
"40S ribosomal protein S20", "40S ribosomal protein S26", "40S ribosomal protein S29"
), g0.ontology_term = c("GO:0008299,GO:0008152,GO:0004421,GO:0003824", 
"GO:0055114,GO:0009098,GO:0016616,GO:0003862,GO:0000287,GO:0051287,GO:0005737", 
"GO:0006412,GO:0003735,GO:0005622,GO:0005840", "GO:0006412,GO:0003735,GO:0003723,GO:0005840,GO:0015935,GO:0005622", 
"GO:0006412,GO:0003735,GO:0005622,GO:0005840", "GO:0003735,GO:0006412,GO:0005840,GO:0005622"
), name_count = c(1L, 1L, 1L, 1L, 1L, 1L), process = c(NA, NA, 
NA, NA, NA, NA)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), groups = structure(list(g0.seq_id = c("000000F", 
"000000F", "000000F", "000000F", "000000F", "000000F"), g0.product = c("3-hydroxy-3-methylglutaryl coenzyme A synthase", 
"3-isopropylmalate dehydratase", "40S ribosomal protein S17", 
"40S ribosomal protein S20", "40S ribosomal protein S26", "40S ribosomal protein S29"
), .rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), .drop = TRUE))

所以我需要检查 df2$g0.ontology_term 中的值是否出现在 df1$data.term_id 中;如果是,就把对应的 data.source 放入新列中。

例如:

我在 df2$g0.ontology_term 中有 GO:0003824 ,所以我的输出将是:

 data.source                  data.term_name data.term_id data.adjusted_p_value new_column
1       GO:MF              catalytic activity   GO:0003824         4.940656e-324  GO:MF

我试过使用 dplyr,但我做不到。

有人能帮帮我吗?

我想这就是您想要做的:为 df2 中匹配的那一行添加 data.source?(您的示例输出是在扩充 df1,所以我是按照您的文字描述来做的。)

# Goal: for every row of df2, check each GO id in the comma-separated
# g0.ontology_term string against df1$data.term_id and attach the
# data.source of the matching df1 row.
library(tidyverse)

# Lookup table: one row per GO term. The adjusted p-values are kept numeric
# (not character) so they stay usable for sorting or filtering later.
df1 <- tribble(
  ~data.source, ~data.term_name, ~data.term_id, ~data.adjusted_p_value,
  "GO:MF", "catalytic activity", "GO:0003824", 4.940656e-324,
  "GO:MF", "molecular_function", "GO:0003674", 4.940656e-324,
  "GO:MF", "binding", "GO:0005488", 4.940656e-324,
  "GO:MF", "ion binding", "GO:0043167", 4.940656e-324,
  "GO:MF", "organic cyclic compound binding", "GO:0097159", 9.573609e-319,
  "GO:MF", "heterocyclic compound binding", "GO:1901363", 4.684222e-318
)

# Annotation table. g0.ontology_term holds the full, untruncated,
# comma-separated GO id lists exactly as given by dput(head(df2)) in the
# question; the ids are NOT space-separated and a match may sit anywhere
# in the list (here GO:0003824 is the last id of the first row).
df2 <- tribble(
  ~g0.seq_id, ~g0.product, ~g0.ontology_term, ~name_count, ~process,
  "000000F", "3-hydroxy-3-methylglutaryl coenzyme A synthase", "GO:0008299,GO:0008152,GO:0004421,GO:0003824", 1L, NA,
  "000000F", "3-isopropylmalate dehydratase", "GO:0055114,GO:0009098,GO:0016616,GO:0003862,GO:0000287,GO:0051287,GO:0005737", 1L, NA,
  "000000F", "40S ribosomal protein S17", "GO:0006412,GO:0003735,GO:0005622,GO:0005840", 1L, NA,
  "000000F", "40S ribosomal protein S20", "GO:0006412,GO:0003735,GO:0003723,GO:0005840,GO:0015935,GO:0005622", 1L, NA,
  "000000F", "40S ribosomal protein S26", "GO:0006412,GO:0003735,GO:0005622,GO:0005840", 1L, NA,
  "000000F", "40S ribosomal protein S29", "GO:0003735,GO:0006412,GO:0005840,GO:0005622", 1L, NA
)

df3 <- df2 |>
  # The real df2 in the question is a grouped_df; drop the grouping so the
  # reshaping below is not affected by it (no-op on an ungrouped tibble).
  ungroup() |>
  # Split the id list into one list element per GO id, keeping the original
  # g0.ontology_term column untouched.
  mutate(data.term_id = str_split(g0.ontology_term, ",")) |>
  # One row per (df2 row, GO id) pair.
  unnest(data.term_id) |>
  # Tolerate stray whitespace around ids, e.g. "GO:1, GO:2".
  mutate(data.term_id = str_trim(data.term_id)) |>
  # An id may be listed twice in the same row; keep one copy.
  distinct() |>
  # Keep only the pairs whose id exists in df1 and pull in data.source.
  # Use left_join() instead to keep non-matching ids with data.source = NA.
  inner_join(
    df1 |> select(data.source, data.term_id),
    by = "data.term_id"
  )

df3
# Expected result (not re-run here): a 1 x 7 tibble holding the
# "3-hydroxy-3-methylglutaryl coenzyme A synthase" row, with
# data.term_id = "GO:0003824" and data.source = "GO:MF"; no other GO id in
# df2 occurs in df1$data.term_id.

由 reprex 包 (v2.0.1) 于 2022-05-18 创建