匹配具有同一基因多次出现的两个数据集
Match two datasets in which the same gene appears multiple times
我有两个数据集
set1 <- structure(list(gene = c("ENSG00000003096", "ENSG00000011677", "ENSG00000019169",
"ENSG00000022556", "ENSG00000029534"), bM = c(5069.84617263404,
1339.17994216287, 38.6160408376658, 589.853084670642, 2805.5926601769
), fold = c(2.85993763274585, 4.90104282563152, 5.16621421186109,
11.1359115874284, 2.95328623121562), value = c(7.37447235027197e-07,
2.76009547949742e-10, 1.56142531487242e-10, 1.97711024002255e-21,
8.60063581030308e-12), symbol = c("KLHL13", "GABPA", "MARCO", "NLRP2",
"ANK1")), class = "data.frame", row.names = c(NA, -5L))
set2 <- structure(list(symbol = c("KLHL13", "KLHL13", "KLHL13", "GABPA",
"GABPA", "GABPA", "MARCO", "MARCO"), geneID = c(90293L, 90293L, 90293L,
2551L, 2551L, 2551L, 8685L, 8685L), pathway = c("Class I MHC mediated antigen processing & presentation",
"Immune System", "Ubiquitin mediated proteolysis", "Mitochondrial biogenesis",
"Organelle biogenesis and maintenance", "Transcriptional activation of mitochondrial biogenesis",
"Binding and Uptake of Ligands by Scavenger Receptors", "Phagosome"
), pathwayID = c("REACT:R-HSA-983169", "REACT:R-HSA-168256", "KEGG:hsa04120",
"REACT:R-HSA-1592230", "REACT:R-HSA-1852241", "REACT:R-HSA-2151201",
"REACT:R-HSA-2173782", "KEGG:hsa04145")), class = "data.frame", row.names = c(NA,
-8L))
我想要的输出是:
ENSG00000003096 5069.84617263404 2.85993763274585 7.37447235027197e-07 KLHL13 KLHL13 90293 Class I MHC mediated antigen processing & presentation REACT:R-HSA-983169
ENSG00000003096 5069.84617263404 2.85993763274585 7.37447235027197e-07 KLHL13 KLHL13 90293 Immune System REACT:R-HSA-168256
ENSG00000003096 5069.84617263404 2.85993763274585 7.37447235027197e-07 KLHL13 KLHL13 90293 Ubiquitin mediated proteolysis KEGG:hsa04120
ENSG00000011677 1339.17994216287 4.90104282563152 2.76009547949742e-10 GABPA GABPA 2551 Mitochondrial biogenesis REACT:R-HSA-1592230
ENSG00000011677 1339.17994216287 4.90104282563152 2.76009547949742e-10 GABPA GABPA 2551 Organelle biogenesis and maintenance REACT:R-HSA-1852241
ENSG00000011677 1339.17994216287 4.90104282563152 2.76009547949742e-10 GABPA GABPA 2551 Transcriptional activation of mitochondrial biogenesis REACT:R-HSA-2151201
ENSG00000019169 38.6160408376658 5.16621421186109 1.56142531487242e-10 MARCO MARCO 8685 Binding and Uptake of Ligands by Scavenger Receptors REACT:R-HSA-2173782
ENSG00000019169 38.6160408376658 5.16621421186109 1.56142531487242e-10 MARCO MARCO 8685 Phagosome KEGG:hsa04145
ENSG00000022556 589.853084670642 11.1359115874284 1.97711024002255e-21 NLRP2 NA NA NA NA
ENSG00000029534 2805.5926601769 2.95328623121562 8.60063581030308e-12 ANK1 NA NA NA NA
如果我使用 merge,我会丢失不匹配的基因;如果我使用 match,每个符号只能得到一个匹配。如何得到我想要的输出?
1. merge(set1, set2, by=c("symbol"))
2. set1[, (ncol(set1)+1):((ncol(set1))+ncol(set2))]<- set2[match(set1$symbol, set2$symbol), ]
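dplyr::left_join 或 dplyr::full_join 都可以完成这项工作(merge 默认执行内连接,因此会丢弃没有匹配的行;在 base R 中也可以使用 merge(set1, set2, by = "symbol", all.x = TRUE) 保留 set1 的所有行):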
> full_join(set1,set2,by="symbol")
gene bM fold value symbol geneID pathway
1 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Class I MHC mediated antigen processing & presentation
2 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Immune System
3 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Ubiquitin mediated proteolysis
4 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Mitochondrial biogenesis
5 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Organelle biogenesis and maintenance
6 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Transcriptional activation of mitochondrial biogenesis
7 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Binding and Uptake of Ligands by Scavenger Receptors
8 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Phagosome
9 ENSG00000022556 589.85308 11.135912 1.977110e-21 NLRP2 NA <NA>
10 ENSG00000029534 2805.59266 2.953286 8.600636e-12 ANK1 NA <NA>
pathwayID
1 REACT:R-HSA-983169
2 REACT:R-HSA-168256
3 KEGG:hsa04120
4 REACT:R-HSA-1592230
5 REACT:R-HSA-1852241
6 REACT:R-HSA-2151201
7 REACT:R-HSA-2173782
8 KEGG:hsa04145
9 <NA>
10 <NA>
> left_join(set1,set2,by="symbol")
gene bM fold value symbol geneID pathway
1 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Class I MHC mediated antigen processing & presentation
2 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Immune System
3 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Ubiquitin mediated proteolysis
4 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Mitochondrial biogenesis
5 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Organelle biogenesis and maintenance
6 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Transcriptional activation of mitochondrial biogenesis
7 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Binding and Uptake of Ligands by Scavenger Receptors
8 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Phagosome
9 ENSG00000022556 589.85308 11.135912 1.977110e-21 NLRP2 NA <NA>
10 ENSG00000029534 2805.59266 2.953286 8.600636e-12 ANK1 NA <NA>
pathwayID
1 REACT:R-HSA-983169
2 REACT:R-HSA-168256
3 KEGG:hsa04120
4 REACT:R-HSA-1592230
5 REACT:R-HSA-1852241
6 REACT:R-HSA-2151201
7 REACT:R-HSA-2173782
8 KEGG:hsa04145
9 <NA>
10 <NA>
您也可以使用 plyr::join_all(默认 type = "left",即左连接):
> join_all(list(set1,set2),by="symbol")
gene bM fold value symbol geneID pathway
1 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Class I MHC mediated antigen processing & presentation
2 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Immune System
3 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Ubiquitin mediated proteolysis
4 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Mitochondrial biogenesis
5 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Organelle biogenesis and maintenance
6 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Transcriptional activation of mitochondrial biogenesis
7 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Binding and Uptake of Ligands by Scavenger Receptors
8 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Phagosome
9 ENSG00000022556 589.85308 11.135912 1.977110e-21 NLRP2 NA <NA>
10 ENSG00000029534 2805.59266 2.953286 8.600636e-12 ANK1 NA <NA>
pathwayID
1 REACT:R-HSA-983169
2 REACT:R-HSA-168256
3 KEGG:hsa04120
4 REACT:R-HSA-1592230
5 REACT:R-HSA-1852241
6 REACT:R-HSA-2151201
7 REACT:R-HSA-2173782
8 KEGG:hsa04145
9 <NA>
10 <NA>
我有两个数据集
set1 <- structure(list(gene = c("ENSG00000003096", "ENSG00000011677", "ENSG00000019169",
"ENSG00000022556", "ENSG00000029534"), bM = c(5069.84617263404,
1339.17994216287, 38.6160408376658, 589.853084670642, 2805.5926601769
), fold = c(2.85993763274585, 4.90104282563152, 5.16621421186109,
11.1359115874284, 2.95328623121562), value = c(7.37447235027197e-07,
2.76009547949742e-10, 1.56142531487242e-10, 1.97711024002255e-21,
8.60063581030308e-12), symbol = c("KLHL13", "GABPA", "MARCO", "NLRP2",
"ANK1")), class = "data.frame", row.names = c(NA, -5L))
set2 <- structure(list(symbol = c("KLHL13", "KLHL13", "KLHL13", "GABPA",
"GABPA", "GABPA", "MARCO", "MARCO"), geneID = c(90293L, 90293L, 90293L,
2551L, 2551L, 2551L, 8685L, 8685L), pathway = c("Class I MHC mediated antigen processing & presentation",
"Immune System", "Ubiquitin mediated proteolysis", "Mitochondrial biogenesis",
"Organelle biogenesis and maintenance", "Transcriptional activation of mitochondrial biogenesis",
"Binding and Uptake of Ligands by Scavenger Receptors", "Phagosome"
), pathwayID = c("REACT:R-HSA-983169", "REACT:R-HSA-168256", "KEGG:hsa04120",
"REACT:R-HSA-1592230", "REACT:R-HSA-1852241", "REACT:R-HSA-2151201",
"REACT:R-HSA-2173782", "KEGG:hsa04145")), class = "data.frame", row.names = c(NA,
-8L))
我想要的输出是:
ENSG00000003096 5069.84617263404 2.85993763274585 7.37447235027197e-07 KLHL13 KLHL13 90293 Class I MHC mediated antigen processing & presentation REACT:R-HSA-983169
ENSG00000003096 5069.84617263404 2.85993763274585 7.37447235027197e-07 KLHL13 KLHL13 90293 Immune System REACT:R-HSA-168256
ENSG00000003096 5069.84617263404 2.85993763274585 7.37447235027197e-07 KLHL13 KLHL13 90293 Ubiquitin mediated proteolysis KEGG:hsa04120
ENSG00000011677 1339.17994216287 4.90104282563152 2.76009547949742e-10 GABPA GABPA 2551 Mitochondrial biogenesis REACT:R-HSA-1592230
ENSG00000011677 1339.17994216287 4.90104282563152 2.76009547949742e-10 GABPA GABPA 2551 Organelle biogenesis and maintenance REACT:R-HSA-1852241
ENSG00000011677 1339.17994216287 4.90104282563152 2.76009547949742e-10 GABPA GABPA 2551 Transcriptional activation of mitochondrial biogenesis REACT:R-HSA-2151201
ENSG00000019169 38.6160408376658 5.16621421186109 1.56142531487242e-10 MARCO MARCO 8685 Binding and Uptake of Ligands by Scavenger Receptors REACT:R-HSA-2173782
ENSG00000019169 38.6160408376658 5.16621421186109 1.56142531487242e-10 MARCO MARCO 8685 Phagosome KEGG:hsa04145
ENSG00000022556 589.853084670642 11.1359115874284 1.97711024002255e-21 NLRP2 NA NA NA NA
ENSG00000029534 2805.5926601769 2.95328623121562 8.60063581030308e-12 ANK1 NA NA NA NA
如果我使用 merge,我会丢失不匹配的基因;如果我使用 match,每个符号只能得到一个匹配。如何得到我想要的输出?
1. merge(set1, set2, by=c("symbol"))
2. set1[, (ncol(set1)+1):((ncol(set1))+ncol(set2))]<- set2[match(set1$symbol, set2$symbol), ]
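dplyr::left_join 或 dplyr::full_join 都可以完成这项工作(merge 默认执行内连接,因此会丢弃没有匹配的行;在 base R 中也可以使用 merge(set1, set2, by = "symbol", all.x = TRUE) 保留 set1 的所有行):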
> full_join(set1,set2,by="symbol")
gene bM fold value symbol geneID pathway
1 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Class I MHC mediated antigen processing & presentation
2 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Immune System
3 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Ubiquitin mediated proteolysis
4 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Mitochondrial biogenesis
5 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Organelle biogenesis and maintenance
6 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Transcriptional activation of mitochondrial biogenesis
7 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Binding and Uptake of Ligands by Scavenger Receptors
8 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Phagosome
9 ENSG00000022556 589.85308 11.135912 1.977110e-21 NLRP2 NA <NA>
10 ENSG00000029534 2805.59266 2.953286 8.600636e-12 ANK1 NA <NA>
pathwayID
1 REACT:R-HSA-983169
2 REACT:R-HSA-168256
3 KEGG:hsa04120
4 REACT:R-HSA-1592230
5 REACT:R-HSA-1852241
6 REACT:R-HSA-2151201
7 REACT:R-HSA-2173782
8 KEGG:hsa04145
9 <NA>
10 <NA>
> left_join(set1,set2,by="symbol")
gene bM fold value symbol geneID pathway
1 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Class I MHC mediated antigen processing & presentation
2 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Immune System
3 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Ubiquitin mediated proteolysis
4 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Mitochondrial biogenesis
5 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Organelle biogenesis and maintenance
6 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Transcriptional activation of mitochondrial biogenesis
7 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Binding and Uptake of Ligands by Scavenger Receptors
8 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Phagosome
9 ENSG00000022556 589.85308 11.135912 1.977110e-21 NLRP2 NA <NA>
10 ENSG00000029534 2805.59266 2.953286 8.600636e-12 ANK1 NA <NA>
pathwayID
1 REACT:R-HSA-983169
2 REACT:R-HSA-168256
3 KEGG:hsa04120
4 REACT:R-HSA-1592230
5 REACT:R-HSA-1852241
6 REACT:R-HSA-2151201
7 REACT:R-HSA-2173782
8 KEGG:hsa04145
9 <NA>
10 <NA>
您也可以使用 plyr::join_all(默认 type = "left",即左连接):
> join_all(list(set1,set2),by="symbol")
gene bM fold value symbol geneID pathway
1 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Class I MHC mediated antigen processing & presentation
2 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Immune System
3 ENSG00000003096 5069.84617 2.859938 7.374472e-07 KLHL13 90293 Ubiquitin mediated proteolysis
4 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Mitochondrial biogenesis
5 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Organelle biogenesis and maintenance
6 ENSG00000011677 1339.17994 4.901043 2.760095e-10 GABPA 2551 Transcriptional activation of mitochondrial biogenesis
7 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Binding and Uptake of Ligands by Scavenger Receptors
8 ENSG00000019169 38.61604 5.166214 1.561425e-10 MARCO 8685 Phagosome
9 ENSG00000022556 589.85308 11.135912 1.977110e-21 NLRP2 NA <NA>
10 ENSG00000029534 2805.59266 2.953286 8.600636e-12 ANK1 NA <NA>
pathwayID
1 REACT:R-HSA-983169
2 REACT:R-HSA-168256
3 KEGG:hsa04120
4 REACT:R-HSA-1592230
5 REACT:R-HSA-1852241
6 REACT:R-HSA-2151201
7 REACT:R-HSA-2173782
8 KEGG:hsa04145
9 <NA>
10 <NA>