从具有特定条件的两个不同的 DataFrame 创建 DataFrame

Create DataFrame from two different DataFrame with specific condition

我想从两种不同类型的 DataFrame 创建一个具有条件的 DataFrame 并保留额外的列。我的第一个 DataFrame 是:

    sample_id      motif    chromosome position   
        1         CT-G.A      chr1        7300        
        1         TA-C.C      chr1        1000        
        1         TC-G.C      chr2        1200        
        1         TC-G.C      chr2        3000        
        2         CG-A.T      chr2        12898       
        2         CA-G.T      chr2        234235      

第二个 DataFrame 是:

geneID    chromosome   start     end       
  E1          chr1      100      10300            
  E2          chr1      1100     20122                   
  E3          chr2      1200     2000                         
  E4          chr2      400      234236              
  E5          chr2      12000    20000        

然后我想创建一个具有以下条件的 DataFrame:

if (first$chromosome == second$chromosome & second$start <= first$position & first$position <= second$end)  

满足该条件就说明这个基因中含有该基序。因此,我想创建如下的 DataFrame:

sample_id     E1,CT-G.A   E1,TA-C.C   E1,TC-G.C   E1,TC-G.C   E1,CG-A.T   E1,CA-G.T     E2,CT-G.A   E2,TA-C.C   E2,TC-G.C   E2,CG-A.T   E2,CA-G.T     E3,CT-G.A   E3,TA-C.C   E3,TC-G.C   E3,CG-A.T   E3,CA-G.T     E4,CT-G.A   E4,TA-C.C   E4,TC-G.C   E4,CG-A.T   E4,CA-G.T     E5,CT-G.A   E5,TA-C.C   E5,TC-G.C   E5,CG-A.T   E5,CA-G.T     E6,CT-G.A   E6,TA-C.C   E6,TC-G.C   E6,CG-A.T   E6,CA-G.T
    1            1         1            0             0           0           0          1            0            0            0            0            0            0            1            0            0            0            0      2            0            0            0            0            0            0            0            0          0            0          0      0
    2            0         0            0             0           0           0          0            0            0            0            0            0            0            0            0            0            0            0      0            1            1            0            0            0            1            0            0          0            0          0      0

这样可以实现。不过,如果您采用这种做法,可能需要斟酌一下列标题(column headers)的命名。

library(dplyr)
library(tidyr)

# Pair every motif with the genes on its chromosome, score each pair as
# 1 (position inside [start, end]) or 0, total the scores per sample and
# label, then spread the totals into one column per "geneID,motif" label.
inner_join(df1, df2, by = "chromosome") %>%
  transmute(
    sample_id,
    geneID_motif = paste(geneID, motif, sep = ","),
    n = if_else(start <= position & end >= position, 1, 0)
  ) %>%
  group_by(sample_id, geneID_motif) %>%
  summarise(n = sum(n)) %>%
  # Labels that a sample never produced are filled with 0.
  spread(key = geneID_motif, value = n, fill = 0)

# A tibble: 2 x 14
# Groups:   sample_id [2]
  sample_id `E1,CT-G.A` `E1,TA-C.C` `E2,CT-G.A` `E2,TA-C.C` `E3,CA-G.T` `E3,CG-A.T` `E3,TC-G.C` `E4,CA-G.T` `E4,CG-A.T` `E4,TC-G.C`
      <int>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
1         1        1.00        1.00        1.00           0           0           0        1.00        0           0           2.00
2         2        0           0           0              0           0           0        0           1.00        1.00        0   
# ... with 3 more variables: `E5,CA-G.T` <dbl>, `E5,CG-A.T` <dbl>, `E5,TC-G.C` <dbl>

数据:

  # Motif observations: one row per motif occurrence in a sample.
  df1 <- data.frame(
    sample_id = c(1L, 1L, 1L, 1L, 2L, 2L),
    motif = c("CT-G.A", "TA-C.C", "TC-G.C", "TC-G.C", "CG-A.T", "CA-G.T"),
    chromosome = c("chr1", "chr1", "chr2", "chr2", "chr2", "chr2"),
    position = c(7300L, 1000L, 1200L, 3000L, 12898L, 234235L),
    stringsAsFactors = FALSE
  )

# Gene intervals: one row per gene with its chromosome and start/end.
df2 <- data.frame(
  geneID = c("E1", "E2", "E3", "E4", "E5"),
  chromosome = c("chr1", "chr1", "chr2", "chr2", "chr2"),
  start = c(100L, 1100L, 1200L, 400L, 12000L),
  end = c(10300L, 20122L, 2000L, 234236L, 20000L),
  stringsAsFactors = FALSE
)
library(dplyr)
library(tidyr)

# Build a sample-by-"geneID,motif" count table: each cell is the number of
# motif occurrences of that sample that fall inside that gene.
df1 %>%
  # Pair every motif row with every gene row. merge(by = NULL) is a plain
  # Cartesian product; unlike tidyr::crossing() (tidyr >= 1.0) it does not
  # de-duplicate its inputs, so repeated motif rows are all counted.
  merge(df2, by = NULL) %>%
  mutate(
    geneID_motif = paste(geneID, motif, sep = ","),
    # 1L when the motif is on the gene's chromosome and within [start, end].
    flag = as.integer(
      chromosome1 == chromosome2 & start <= position & position <= end
    )
  ) %>%
  select(sample_id, geneID_motif, flag) %>%
  group_by(sample_id, geneID_motif) %>%
  summarise(flag = sum(flag)) %>%
  ungroup() %>%
  # One column per label; labels a sample never produced are filled with 0.
  spread(geneID_motif, flag, fill = 0L) %>%
  # Plain data frame, keeping the "geneID,motif" column names untouched.
  as.data.frame()

输出为:

  sample_id E1,CA-G.T E1,CG-A.T E1,CT-G.A E1,TA-C.C E1,TC-G.C E2,CA-G.T E2,CG-A.T E2,CT-G.A E2,TA-C.C E2,TC-G.C
1         1         0         0         1         1         0         0         0         1         0         0
2         2         0         0         0         0         0         0         0         0         0         0
  E3,CA-G.T E3,CG-A.T E3,CT-G.A E3,TA-C.C E3,TC-G.C E4,CA-G.T E4,CG-A.T E4,CT-G.A E4,TA-C.C E4,TC-G.C E5,CA-G.T
1         0         0         0         0         1         0         0         0         0         2         0
2         0         0         0         0         0         1         1         0         0         0         0
  E5,CG-A.T E5,CT-G.A E5,TA-C.C E5,TC-G.C
1         0         0         0         0
2         1         0         0         0

示例数据:

# Motif observations; the chromosome column is named `chromosome1` so it
# does not clash with the gene table when the two are combined.
df1 <- data.frame(
  sample_id = c(1L, 1L, 1L, 1L, 2L, 2L),
  motif = c("CT-G.A", "TA-C.C", "TC-G.C", "TC-G.C", "CG-A.T", "CA-G.T"),
  chromosome1 = c("chr1", "chr1", "chr2", "chr2", "chr2", "chr2"),
  position = c(7300L, 1000L, 1200L, 3000L, 12898L, 234235L),
  stringsAsFactors = FALSE
)

# Gene intervals; the chromosome column is named `chromosome2`.
df2 <- data.frame(
  geneID = c("E1", "E2", "E3", "E4", "E5"),
  chromosome2 = c("chr1", "chr1", "chr2", "chr2", "chr2"),
  start = c(100L, 1100L, 1200L, 400L, 12000L),
  end = c(10300L, 20122L, 2000L, 234236L, 20000L),
  stringsAsFactors = FALSE
)