从两个数据框中获取公共信息

Question

我有一个很大的文件，比如

    > dput(head(big))
structure(list(Tumor_Sample_Barcode = c("LP6005334_DNA_H01", 
"LP6005334_DNA_H01", "LP6005334_DNA_H01", "LP6005334_DNA_H01", 
"LP6005334_DNA_H01", "LP6005334_DNA_H01"), Chromosome = c("chr1", 
"chr1", "chr1", "chr1", "chr1", "chr1"), Start_Position = c(782112, 
1026918, 1133283, 1431511, 1742395, 1864994), Reference_Allele = c("G", 
"C", "C", "G", "C", "G"), Tumor_Seq_Allele2 = c("A", "T", "T", 
"A", "T", "A")), row.names = c("1", "2", "3", "4", "5", "6"), class = "data.frame")
> 
> dim(big)
[1] 555437      5
>

这包括突变的染色体位置

对于相同的样本，我有一个较小的数据框，如

> dput(head(small))
structure(list(Tumor_Sample_Barcode = c("LP6005334_DNA_H01", 
"LP6005334_DNA_H01", "LP6005334_DNA_H01"), Chromosome = c("chr1", 
"chr6", "chr16"), Start_Position = c("89616151", "51909815", 
"20556547"), End_Position = c("89616151", "51909815", "20556547"
), Reference_Allele = c("-", "A", "-"), Tumor_Seq_Allele2 = c("T", 
"-", "G"), Hugo_Symbol = c("GBP7", "PKHD1", "ACSM2B"), Variant_Classification = c("Frame_Shift_Ins", 
"Frame_Shift_Del", "Frame_Shift_Ins"), tx = c("NM_207398", "NM_138694", 
"NM_001105069"), exon = c("exon6", "exon25", "exon10"), txChange = c("c.732dupA", 
"c.2664delT", "c.1212dupC"), aaChange = c("p.L245fs", "p.F888fs", 
"p.G405fs"), Variant_Type = c("INS", "DEL", "INS"), Func.refGene = c("exonic", 
"exonic", "exonic"), Gene.refGene = c("GBP7", "PKHD1", "ACSM2B"
), GeneDetail.refGene = c(".", ".", "."), ExonicFunc.refGene = c("frameshift insertion", 
"frameshift deletion", "frameshift insertion"), AAChange.refGene = c("GBP7:NM_207398:exon6:c.732dupA:p.L245fs", 
"PKHD1:NM_138694:exon25:c.2664delT:p.F888fs,PKHD1:NM_170724:exon25:c.2664delT:p.F888fs", 
"ACSM2B:NM_001105069:exon10:c.1212dupC:p.G405fs,ACSM2B:NM_182617:exon11:c.1212dupC:p.G405fs"
), Func.knownGene = c("exonic", "exonic", "exonic"), Gene.knownGene = c("GBP7", 
"PKHD1", "ACSM2B"), GeneDetail.knownGene = c(".", ".", "."), 
    ExonicFunc.knownGene = c("frameshift insertion", "frameshift deletion", 
    "frameshift insertion"), AAChange.knownGene = c("GBP7:uc001dna.2:exon6:c.732dupA:p.L245fs", 
    "PKHD1:uc003pah.1:exon25:c.2664delT:p.F888fs,PKHD1:uc003pai.3:exon25:c.2664delT:p.F888fs", 
    "ACSM2B:uc002dhk.4:exon10:c.1212dupC:p.G405fs,ACSM2B:uc010bwf.1:exon10:c.1212dupC:p.G405fs,ACSM2B:uc002dhj.4:exon11:c.1212dupC:p.G405fs"
    ), avsnp147 = c("rs200922375", ".", "rs750935206"), `1000g2015aug_all` = c("0.00738818", 
    ".", "."), `1000g2015aug_amr` = c("0.0014", ".", "."), `1000g2015aug_sas` = c(".", 
    ".", "."), `1000g2015aug_eur` = c(".", ".", "."), `1000g2015aug_eas` = c(".", 
    ".", "."), esp6500siv2_all = c("0.0082", ".", "."), esp6500siv2_ea = c(".", 
    ".", "."), esp6500siv2_aa = c("0.0242", ".", "."), ExAC_ALL = c("0.0021", 
    ".", "8.254e-06"), ExAC_AFR = c("0.0239", ".", "0"), ExAC_AMR = c("0.0004", 
    ".", "0"), ExAC_EAS = c("0.0001", ".", "0"), ExAC_FIN = c("0", 
    ".", "0"), ExAC_NFE = c("1.499e-05", ".", "1.502e-05"), ExAC_OTH = c("0", 
    ".", "0"), ExAC_SAS = c("0", ".", "0"), cosmic70 = c(".", 
    ".", "."), SIFT_score = c(".", ".", "."), SIFT_converted_rankscore = c(".", 
    ".", "."), SIFT_pred = c(".", ".", "."), Polyphen2_HDIV_score = c(".", 
    ".", "."), Polyphen2_HDIV_rankscore = c(".", ".", "."), Polyphen2_HDIV_pred = c(".", 
    ".", "."), Polyphen2_HVAR_score = c(".", ".", "."), Polyphen2_HVAR_rankscore = c(".", 
    ".", "."), Polyphen2_HVAR_pred = c(".", ".", "."), LRT_score = c(".", 
    ".", "."), LRT_converted_rankscore = c(".", ".", "."), LRT_pred = c(".", 
    ".", "."), MutationTaster_score = c(".", ".", "."), MutationTaster_converted_rankscore = c(".", 
    ".", "."), MutationTaster_pred = c(".", ".", "."), MutationAssessor_score = c(".", 
    ".", "."), MutationAssessor_score_rankscore = c(".", ".", 
    "."), MutationAssessor_pred = c(".", ".", "."), FATHMM_score = c(".", 
    ".", "."), FATHMM_converted_rankscore = c(".", ".", "."), 
    FATHMM_pred = c(".", ".", "."), PROVEAN_score = c(".", ".", 
    "."), PROVEAN_converted_rankscore = c(".", ".", "."), PROVEAN_pred = c(".", 
    ".", "."), VEST3_score = c(".", ".", "."), VEST3_rankscore = c(".", 
    ".", "."), MetaSVM_score = c(".", ".", "."), MetaSVM_rankscore = c(".", 
    ".", "."), MetaSVM_pred = c(".", ".", "."), MetaLR_score = c(".", 
    ".", "."), MetaLR_rankscore = c(".", ".", "."), MetaLR_pred = c(".", 
    ".", "."), `M-CAP_score` = c(".", ".", "."), `M-CAP_rankscore` = c(".", 
    ".", "."), `M-CAP_pred` = c(".", ".", "."), CADD_raw = c(".", 
    ".", "."), CADD_raw_rankscore = c(".", ".", "."), CADD_phred = c(".", 
    ".", "."), DANN_score = c(".", ".", "."), DANN_rankscore = c(".", 
    ".", "."), `fathmm-MKL_coding_score` = c(".", ".", "."), 
    `fathmm-MKL_coding_rankscore` = c(".", ".", "."), `fathmm-MKL_coding_pred` = c(".", 
    ".", "."), Eigen_coding_or_noncoding = c(".", ".", "."), 
    `Eigen-raw` = c(".", ".", "."), `Eigen-PC-raw` = c(".", ".", 
    "."), GenoCanyon_score = c(".", ".", "."), GenoCanyon_score_rankscore = c(".", 
    ".", "."), integrated_fitCons_score = c(".", ".", "."), integrated_fitCons_score_rankscore = c(".", 
    ".", "."), integrated_confidence_value = c(".", ".", "."), 
    `GERP++_RS` = c(".", ".", "."), `GERP++_RS_rankscore` = c(".", 
    ".", "."), phyloP100way_vertebrate = c(".", ".", "."), phyloP100way_vertebrate_rankscore = c(".", 
    ".", "."), phyloP20way_mammalian = c(".", ".", "."), phyloP20way_mammalian_rankscore = c(".", 
    ".", "."), phastCons100way_vertebrate = c(".", ".", "."), 
    phastCons100way_vertebrate_rankscore = c(".", ".", "."), 
    phastCons20way_mammalian = c(".", ".", "."), phastCons20way_mammalian_rankscore = c(".", 
    ".", "."), SiPhy_29way_logOdds = c(".", ".", "."), SiPhy_29way_logOdds_rankscore = c(".", 
    ".", "."), Interpro_domain = c(".", ".", "."), GTEx_V6_gene = c(".", 
    ".", "."), GTEx_V6_tissue = c(".", ".", "."), Interpro_domain = c(".", 
    ".", "."), dbscSNV_ADA_SCORE = c(".", ".", "."), dbscSNV_RF_SCORE = c(".", 
    ".", "."), CLINSIG = c(".", ".", "."), CLNDBN = c(".", ".", 
    "."), CLNACC = c(".", ".", "."), CLNDSDB = c(".", ".", "."
    ), CLNDSDBID = c(".", ".", "."), HRC_AF = c(".", ".", "."
    ), HRC_AC = c(".", ".", "."), HRC_AN = c(".", ".", "."), 
    HRC_non1000G_AF = c(".", ".", "."), HRC_non1000G_AC = c(".", 
    ".", "."), HRC_non1000G_AN = c(".", ".", "."), Kaviar_AF = c("0.0020051", 
    ".", "1.29e-05"), Kaviar_AC = c("310", ".", "2"), Kaviar_AN = c("154602", 
    ".", "154602"), nci60 = c(".", ".", "."), TumorVAF = c("0.5375", 
    "0.234043", "0.413043"), NormalVAF = c("0", "0", "0"), cytoBand = c(NA_character_, 
    NA_character_, NA_character_), phyloP7way_vertebrate = c(NA_character_, 
    NA_character_, NA_character_), phastCons7way_vertebrate = c(NA_character_, 
    NA_character_, NA_character_), T_VAF = c(NA_character_, NA_character_, 
    NA_character_), N_VAF = c(NA_character_, NA_character_, NA_character_
    ), COSMIC_OESO = c("no", "no", "no"), COSMIC_FULL = c("no", 
    "no", "no"), RESIST = c("no", "no", "no"), FRANKEL = c("no", 
    "no", "no"), CANCER_GENES_DAMAGING_ALTERATION = c("no", "no", 
    "no"), CANCER_HELPER = c("no", "no", "no"), N_vaf = c(NA_character_, 
    NA_character_, NA_character_)), row.names = c(NA, -3L), class = c("data.table", 
"data.frame"), .internal.selfref = <pointer: 0x7fd1600094e0>)
>
> dim(small)
[1] 584702    139
>

我想从小数据框中提取与大数据框共有的突变（行），这样就能得到这些共有突变在小数据框中的全部信息

我试过了

merge(small,big)
Error in merge.data.table(small,big) :

x 有一些重复的列名称：Interpro_domain。请删除或重命名重复项，然后重试。

merge(small, big)

Error in merge.data.table(small, big) : 
x has some duplicated column name(s): Interpro_domain. Please remove or rename the duplicate(s) and try again.

我也试过了

> subset(small, big)
Error in subset.data.table(small, big) : 'subset' must evaluate to logical
> intersect(small, big)
data frame with 0 columns and 0 rows

拜托，你可以帮帮我

Answer 1

尝试merge(big, small, by="Tumor_Sample_Barcode")。另外我不明白你到底想得到什么。

Answer 2

small 数据框中存在重复的列名，需要先让它们变得唯一。

library(data.table)

# Merge on the tumor sample barcode only
merge(small, big, by = "Tumor_Sample_Barcode")

# Error in merge.data.table(small, big, by = "Tumor_Sample_Barcode") : 
#   x has some duplicated column name(s): Interpro_domain. Please remove or
#   rename the duplicate(s) and try again.

正如预期的那样，我们得到了错误：列名 "Interpro_domain" 重复了。下面找出重复的列名并使它们唯一：

# Names that occur more than once, and the positions of every column carrying
# one of those names (kept so the affected columns can be inspected).
dupeCols <- colnames(small)[ duplicated(colnames(small)) ]
dupeColsIx <- which(colnames(small) %in% dupeCols)
# Run make.unique() over the full name vector rather than only over the
# duplicated names: the result is then guaranteed to be unique as a whole, so
# a generated suffix such as "Interpro_domain.1" cannot clash with a column
# that already carries that name.
colnames(small) <- make.unique(colnames(small))

让我们再次合并：

merge(small, big, by = "Tumor_Sample_Barcode")

Error in vecseq(f__, len__, if (allow.cartesian || notjoin || !anyDuplicated(f__,  : 
  Join results in 18 rows; more than 9 = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice.

此错误表明键列 "Tumor_Sample_Barcode" 的值在 big 和 small 中都有重复，因此合并会产生笛卡尔连接（多对多匹配）。如果确定这正是我们想要的，可以设置参数 allow.cartesian = TRUE：

merge(small, big, by = "Tumor_Sample_Barcode", allow.cartesian = TRUE)

这样可以正常运行，但它由 3 行和 6 行的数据生成了 18 行，这可能不是您想要的结果。可以改为按多个列合并，或者先对其中一个数据集去重。

Answer 3

library("dplyr")

# dplyr joins reject inputs with repeated column names, and `small` carries
# "Interpro_domain" twice, so make the names unique before joining.
names(small) <- make.unique(names(small))

# `small` stores Start_Position as character while `big` stores it as numeric;
# join keys must share a type. Going through as.character() first keeps the
# conversion correct even if the column were a factor.
small$Start_Position <- as.numeric(as.character(small$Start_Position))

# Name the join keys explicitly instead of relying on by = NULL (natural
# join): these are the five columns the two tables share, so the matched rows
# are the same, but the keys no longer change silently if a column is added.
key_cols <- c(
  "Tumor_Sample_Barcode", "Chromosome", "Start_Position",
  "Reference_Allele", "Tumor_Seq_Allele2"
)
all_inner <- inner_join(small, big, by = key_cols)

从两个数据框中获取公共信息

Getting common information from two data frames

intersection

r

subset

data.table