映射两个不同数据集中的通用名称
mapping common names in two different datasets
我有两个数据框。我想通过与 data_frame_2 进行比较,
找出 data_frame_1 中每个基因的备选基因名称。
data_frame_1
chr start end CNA Genes No.of.Gene
1 13991 1401 gain Cfh,Gm26048 2
1 14011 1490 gain Zfp788,Rik 2
data_frame_2
Associated_Gene_Name Chromosome_Name Gene_Start Gene_End Associated_Gene_Name_1 Chromosome_Name_1 Gene_Start_1 Gene_End_1
Cfh 1 13900 14100 CFH 3 43900 54100
Gm26048 1 13998 14010 TFE 1 76710 76790
Zfp788 2 43970 44180 ELF 4 131950 133100
Rik 3 202100 202600 RIK 5 881100 1036800
data_frame_result
chr start end CNA Genes No.of.Gene Associated.Gene.name_1
1 13991 1401 gain Cfh,Gm26048 2 CFH,TFE
1 14011 1490 gain Zfp788,Rik 2 ELF,RIK
把多个值用逗号拼在同一个单元格里确实会让事情变得麻烦。下面这条管道链会先把数据“规范化”(normalize)为每行一个值,这样就可以进行标准的合并(merge)。
我使用 magrittr 库来串联这些命令。
# Example inputs.
# data_frame_1: one column of comma-separated gene lists.
data_frame_1 <- data.frame(
  Genes = c(
    "Cfh,Gm26048",
    "Gm5852,Gm5773",
    "Elf",
    "Ttn"
  )
)

# data_frame_2: one row per gene, pairing it with its alternate name.
# Built from a named vector so each gene sits next to its alias; local()
# keeps the helper vector out of the global environment.
data_frame_2 <- local({
  alias_by_gene <- c(
    Cfh     = "CFH",
    Gm26048 = "FGFR",
    Gm5852  = "NAA",
    Gm5773  = "TFE",
    Elf     = "ELF",
    Ttn     = "TTN"
  )
  data.frame(
    Genes_1 = names(alias_by_gene),
    Alternate_Gene_name = unname(alias_by_gene)
  )
})
library(magrittr)
#' Stack a list of vectors into a long two-column data frame
#'
#' Each element of `x` contributes one row per value; the `idx` column records
#' which element the value came from.
#'
#' @param x A list of vectors (for example the result of `strsplit()`).
#' @param idx One label per element of `x`. Defaults to `names(x)` when `x`
#'   is named, otherwise to the element positions `seq_along(x)`.
#' @return A data frame with columns `idx` and `val` and default row names.
#'   Zero-length elements of `x` contribute no rows; an empty `x` yields a
#'   zero-row data frame.
idxstack <- function(x,
                     idx = if (!is.null(names(x))) {
                       names(x)
                     } else {
                       seq_along(x)
                     }) {
  if (length(idx) != length(x)) {
    stop("`idx` must have one entry per element of `x`", call. = FALSE)
  }
  vals <- unlist(x, use.names = FALSE)
  # unlist() returns NULL for an empty list, and data.frame() silently drops
  # NULL columns; substitute a zero-length vector so `val` always exists.
  if (is.null(vals)) {
    vals <- character(0)
  }
  # Repeat each label once per value instead of rbind-ing one small data
  # frame per element: this is vectorised and tolerates zero-length elements.
  data.frame(idx = rep(idx, lengths(x)), val = vals)
}
# Look up the alternate name of every gene listed in data_frame_1$Genes:
# split each comma-separated entry into one gene per row, join the genes
# against data_frame_2, then collapse the hits back into one comma-separated
# string per original entry.
#
# NOTE: merge() sorts its result by the join key, so the names inside one row
# follow the sorted gene order rather than the order written in Genes
# (e.g. "Gm5852,Gm5773" yields "TFE,NAA").
as.character(data_frame_1$Genes) %>%
  # Name each split result by its original string so idxstack() uses that
  # string as the key for the final join.
  {setNames(strsplit(., split = ",", fixed = TRUE), .)} %>%
  idxstack() %>%
  merge(data_frame_2, by.x = "val", by.y = "Genes_1", all.x = TRUE) %>%
  # The formula method of aggregate() drops rows whose name is NA, so genes
  # without a match are left out of the collapsed string.
  aggregate(Alternate_Gene_name ~ idx, ., paste0, collapse = ",") %>%
  # all.x = TRUE keeps entries of data_frame_1 with no matching gene at all
  # (their Alternate_Gene_name is NA) instead of silently dropping them.
  merge(data_frame_1, ., by.x = "Genes", by.y = "idx", all.x = TRUE)
返回结果如下:
Genes Alternate_Gene_name
1 Cfh,Gm26048 CFH,FGFR
2 Elf ELF
3 Gm5852,Gm5773 TFE,NAA
4 Ttn TTN
我有两个数据框。我想通过与 data_frame_2 进行比较,
找出 data_frame_1 中每个基因的备选基因名称。
data_frame_1
chr start end CNA Genes No.of.Gene
1 13991 1401 gain Cfh,Gm26048 2
1 14011 1490 gain Zfp788,Rik 2
data_frame_2
Associated_Gene_Name Chromosome_Name Gene_Start Gene_End Associated_Gene_Name_1 Chromosome_Name_1 Gene_Start_1 Gene_End_1
Cfh 1 13900 14100 CFH 3 43900 54100
Gm26048 1 13998 14010 TFE 1 76710 76790
Zfp788 2 43970 44180 ELF 4 131950 133100
Rik 3 202100 202600 RIK 5 881100 1036800
data_frame_result
chr start end CNA Genes No.of.Gene Associated.Gene.name_1
1 13991 1401 gain Cfh,Gm26048 2 CFH,TFE
1 14011 1490 gain Zfp788,Rik 2 ELF,RIK
把多个值用逗号拼在同一个单元格里确实会让事情变得麻烦。下面这条管道链会先把数据“规范化”(normalize)为每行一个值,这样就可以进行标准的合并(merge)。
我使用 magrittr 库来串联这些命令。
# Example inputs.
# data_frame_1: one column of comma-separated gene lists.
data_frame_1 <- data.frame(
  Genes = c(
    "Cfh,Gm26048",
    "Gm5852,Gm5773",
    "Elf",
    "Ttn"
  )
)

# data_frame_2: one row per gene, pairing it with its alternate name.
# Built from a named vector so each gene sits next to its alias; local()
# keeps the helper vector out of the global environment.
data_frame_2 <- local({
  alias_by_gene <- c(
    Cfh     = "CFH",
    Gm26048 = "FGFR",
    Gm5852  = "NAA",
    Gm5773  = "TFE",
    Elf     = "ELF",
    Ttn     = "TTN"
  )
  data.frame(
    Genes_1 = names(alias_by_gene),
    Alternate_Gene_name = unname(alias_by_gene)
  )
})
library(magrittr)
#' Stack a list of vectors into a long two-column data frame
#'
#' Each element of `x` contributes one row per value; the `idx` column records
#' which element the value came from.
#'
#' @param x A list of vectors (for example the result of `strsplit()`).
#' @param idx One label per element of `x`. Defaults to `names(x)` when `x`
#'   is named, otherwise to the element positions `seq_along(x)`.
#' @return A data frame with columns `idx` and `val` and default row names.
#'   Zero-length elements of `x` contribute no rows; an empty `x` yields a
#'   zero-row data frame.
idxstack <- function(x,
                     idx = if (!is.null(names(x))) {
                       names(x)
                     } else {
                       seq_along(x)
                     }) {
  if (length(idx) != length(x)) {
    stop("`idx` must have one entry per element of `x`", call. = FALSE)
  }
  vals <- unlist(x, use.names = FALSE)
  # unlist() returns NULL for an empty list, and data.frame() silently drops
  # NULL columns; substitute a zero-length vector so `val` always exists.
  if (is.null(vals)) {
    vals <- character(0)
  }
  # Repeat each label once per value instead of rbind-ing one small data
  # frame per element: this is vectorised and tolerates zero-length elements.
  data.frame(idx = rep(idx, lengths(x)), val = vals)
}
# Look up the alternate name of every gene listed in data_frame_1$Genes:
# split each comma-separated entry into one gene per row, join the genes
# against data_frame_2, then collapse the hits back into one comma-separated
# string per original entry.
#
# NOTE: merge() sorts its result by the join key, so the names inside one row
# follow the sorted gene order rather than the order written in Genes
# (e.g. "Gm5852,Gm5773" yields "TFE,NAA").
as.character(data_frame_1$Genes) %>%
  # Name each split result by its original string so idxstack() uses that
  # string as the key for the final join.
  {setNames(strsplit(., split = ",", fixed = TRUE), .)} %>%
  idxstack() %>%
  merge(data_frame_2, by.x = "val", by.y = "Genes_1", all.x = TRUE) %>%
  # The formula method of aggregate() drops rows whose name is NA, so genes
  # without a match are left out of the collapsed string.
  aggregate(Alternate_Gene_name ~ idx, ., paste0, collapse = ",") %>%
  # all.x = TRUE keeps entries of data_frame_1 with no matching gene at all
  # (their Alternate_Gene_name is NA) instead of silently dropping them.
  merge(data_frame_1, ., by.x = "Genes", by.y = "idx", all.x = TRUE)
返回结果如下:
Genes Alternate_Gene_name
1 Cfh,Gm26048 CFH,FGFR
2 Elf ELF
3 Gm5852,Gm5773 TFE,NAA
4 Ttn TTN