通过比较两个数据框来过滤共有的字符串

Filter common strings by comparing two data frames

我有两个数据集 df1 和 df2。如何从 df1 的 Genes 列中删除在 df2 中出现的基因名称?

df1<-

chr   start   end     CNA       Genes                  No.of.Gene
   1    13991   1401    gain    Cfh,Gm26048,Bhis,Sclm       2
   1    14011   1490    gain    Zfp788,Rik                  2

df2 <-

       Genes
      Gm26048
        Gif
        Tl2
        Rik

预期输出

           chr   start   end     CNA    Genes                No.of.Gene
           1    13991   1401    gain     Cfh,Bhis,Sclm              2
           1    14011   1490    gain    Zfp788                      2

可以使用 strsplit 和 setdiff:

# Remove from each comma-separated gene list in df1 every gene listed in df2.
# The surviving genes are pasted back into one string per row, so Genes stays
# a plain character column even when rows keep different numbers of genes
# (sapply() would return a list, or a matrix, in those cases).
df1$Genes <- vapply(
  strsplit(as.character(df1$Genes), ","),
  function(genes) paste(setdiff(genes, df2$Genes), collapse = ","),
  character(1)
)

df1
#  chr start  end  CNA  Genes No.of.Gene
#1   1 13991 1401 gain    Cfh          2
#2   1 14011 1490 gain Zfp788          2

编辑

在你修改了 df1 之后,可以这样得到预期的结果:

# For every row of df1: split the gene list on commas, drop the genes that
# appear in df2, and glue the survivors back into one comma-separated string.
# A single vapply() is used instead of nested sapply() calls because the inner
# sapply() simplifies to a matrix whenever all rows keep the same number of
# genes, and the outer call would then return one element per gene instead of
# one string per row.
vapply(
  strsplit(as.character(df1$Genes), ","),
  function(genes) paste(setdiff(genes, df2$Genes), collapse = ","),
  character(1)
)
#[1] "Cfh,Bhis,Sclm" "Zfp788"

另一种选择是使用 gsub:

# Build one regex that matches any df2 gene as a whole comma-delimited token:
#   (^|,)    the token starts at the beginning of the string or after a comma
#   (a|b|c)  one of the gene names from df2
#   (?=,|$)  and is followed by a comma or the end of the string (not consumed)
# Anchoring on token boundaries keeps a name such as "Rik" from deleting part
# of a longer gene name. Gene names are assumed to contain no regex
# metacharacters.
gene_pattern <- paste0("(^|,)(", paste(df2$Genes, collapse = "|"), ")(?=,|$)")

# Deleting a token also deletes the comma in front of it, so the only possible
# leftover is a leading comma when the first gene of a row was removed.
df1$Genes <- sub("^,", "", gsub(gene_pattern, "", df1$Genes, perl = TRUE))
df1$Genes
#[1] "Cfh,Bhis,Sclm" "Zfp788"  

我们可以把 Genes 列拆分成多行(每个基因一行),然后用 filter 进行筛选:

# Example data.
# df1: one row per region; the Genes column holds a comma-separated list of
# gene names. header = TRUE takes the column names from the first non-blank
# line of the text.
df1 <- read.table(text = "
chr   start   end     CNA       Genes                  No.of.Gene
1    13991   1401    gain    Cfh,Gm26048,Bhis,Sclm       2
1    14011   1490    gain    Zfp788,Rik                  2", header = TRUE)
# df2: a single Genes column with one gene name per row; these are the names
# that df1's gene lists are compared against.
df2 <- read.table(text = "
Genes
Gm26048
Gif
Tl2
Rik", header = TRUE)

library(dplyr)
library(tidyr)

# Intersection: expand every comma-separated Genes entry to one row per gene
# (new column Gene), then keep only the rows whose gene is listed in df2.
df1 %>%
  mutate(Gene = as.character(Genes) %>% strsplit(",")) %>%
  unnest(Gene) %>%
  filter(is.element(Gene, df2$Genes))

#     chr start   end    CNA                 Genes No.of.Gene    Gene
#   (int) (int) (int) (fctr)                (fctr)      (int)   (chr)
# 1     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2 Gm26048
# 2     1 14011  1490   gain            Zfp788,Rik          2     Rik

# Set difference: expand every comma-separated Genes entry to one row per gene
# (new column Gene), then keep only the rows whose gene is NOT listed in df2.
df1 %>%
  mutate(Gene = as.character(Genes) %>% strsplit(",")) %>%
  unnest(Gene) %>%
  filter(!is.element(Gene, df2$Genes))

#     chr start   end    CNA                 Genes No.of.Gene   Gene
#    (int) (int) (int) (fctr)                (fctr)      (int)  (chr)
# 1     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2    Cfh
# 2     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2   Bhis
# 3     1 13991  1401   gain Cfh,Gm26048,Bhis,Sclm          2   Sclm
# 4     1 14011  1490   gain            Zfp788,Rik          2 Zfp788

下面的方法使用 stringr 包中的 str_replace_all(需要先加载 stringr):模式 1 (pattn1) 负责删除 df2 中列出的基因,而 pattn2 负责删除残留在开头或结尾的逗号:

# pattn1 matches any df2 gene as a whole comma-delimited token:
#   (^|,)    start of the string or the comma in front of the gene
#   (a|b|c)  one of the gene names from df2
#   (?=,|$)  followed by a comma or the end of the string (not consumed)
# Every gene, including the last one in df2, is removed together with its
# leading comma, so no ",," is left behind and a short name cannot delete part
# of a longer one. Gene names are assumed to contain no regex metacharacters.
pattn1 <- paste0("(^|,)(", paste(df2$Genes, collapse = "|"), ")(?=,|$)")
# stringr:: is spelled out because the file never attaches stringr.
df1$Genes <- stringr::str_replace_all(as.character(df1$Genes), pattn1, "")
# pattn2 strips a stray comma at either end of the string (for example when
# the first gene of a row was removed).
pattn2 <- "^,|,$"
df1$Genes <- stringr::str_replace_all(df1$Genes, pattn2, "")

## Results ##

  chr start  end  CNA         Genes No.of.Gene
1   1 13991 1401 gain Cfh,Bhis,Sclm          2
2   1 14011 1490 gain        Zfp788          2