从两个数据框中获取公共信息
Getting common information from two data frames
我有一个很大的文件,比如
> dput(head(big))
structure(list(Tumor_Sample_Barcode = c("LP6005334_DNA_H01",
"LP6005334_DNA_H01", "LP6005334_DNA_H01", "LP6005334_DNA_H01",
"LP6005334_DNA_H01", "LP6005334_DNA_H01"), Chromosome = c("chr1",
"chr1", "chr1", "chr1", "chr1", "chr1"), Start_Position = c(782112,
1026918, 1133283, 1431511, 1742395, 1864994), Reference_Allele = c("G",
"C", "C", "G", "C", "G"), Tumor_Seq_Allele2 = c("A", "T", "T",
"A", "T", "A")), row.names = c("1", "2", "3", "4", "5", "6"), class = "data.frame")
>
> dim(big)
[1] 555437 5
>
这包括突变的染色体位置
对于相同的样本,我有一个较小的数据框,如
> dput(head(small))
structure(list(Tumor_Sample_Barcode = c("LP6005334_DNA_H01",
"LP6005334_DNA_H01", "LP6005334_DNA_H01"), Chromosome = c("chr1",
"chr6", "chr16"), Start_Position = c("89616151", "51909815",
"20556547"), End_Position = c("89616151", "51909815", "20556547"
), Reference_Allele = c("-", "A", "-"), Tumor_Seq_Allele2 = c("T",
"-", "G"), Hugo_Symbol = c("GBP7", "PKHD1", "ACSM2B"), Variant_Classification = c("Frame_Shift_Ins",
"Frame_Shift_Del", "Frame_Shift_Ins"), tx = c("NM_207398", "NM_138694",
"NM_001105069"), exon = c("exon6", "exon25", "exon10"), txChange = c("c.732dupA",
"c.2664delT", "c.1212dupC"), aaChange = c("p.L245fs", "p.F888fs",
"p.G405fs"), Variant_Type = c("INS", "DEL", "INS"), Func.refGene = c("exonic",
"exonic", "exonic"), Gene.refGene = c("GBP7", "PKHD1", "ACSM2B"
), GeneDetail.refGene = c(".", ".", "."), ExonicFunc.refGene = c("frameshift insertion",
"frameshift deletion", "frameshift insertion"), AAChange.refGene = c("GBP7:NM_207398:exon6:c.732dupA:p.L245fs",
"PKHD1:NM_138694:exon25:c.2664delT:p.F888fs,PKHD1:NM_170724:exon25:c.2664delT:p.F888fs",
"ACSM2B:NM_001105069:exon10:c.1212dupC:p.G405fs,ACSM2B:NM_182617:exon11:c.1212dupC:p.G405fs"
), Func.knownGene = c("exonic", "exonic", "exonic"), Gene.knownGene = c("GBP7",
"PKHD1", "ACSM2B"), GeneDetail.knownGene = c(".", ".", "."),
ExonicFunc.knownGene = c("frameshift insertion", "frameshift deletion",
"frameshift insertion"), AAChange.knownGene = c("GBP7:uc001dna.2:exon6:c.732dupA:p.L245fs",
"PKHD1:uc003pah.1:exon25:c.2664delT:p.F888fs,PKHD1:uc003pai.3:exon25:c.2664delT:p.F888fs",
"ACSM2B:uc002dhk.4:exon10:c.1212dupC:p.G405fs,ACSM2B:uc010bwf.1:exon10:c.1212dupC:p.G405fs,ACSM2B:uc002dhj.4:exon11:c.1212dupC:p.G405fs"
), avsnp147 = c("rs200922375", ".", "rs750935206"), `1000g2015aug_all` = c("0.00738818",
".", "."), `1000g2015aug_amr` = c("0.0014", ".", "."), `1000g2015aug_sas` = c(".",
".", "."), `1000g2015aug_eur` = c(".", ".", "."), `1000g2015aug_eas` = c(".",
".", "."), esp6500siv2_all = c("0.0082", ".", "."), esp6500siv2_ea = c(".",
".", "."), esp6500siv2_aa = c("0.0242", ".", "."), ExAC_ALL = c("0.0021",
".", "8.254e-06"), ExAC_AFR = c("0.0239", ".", "0"), ExAC_AMR = c("0.0004",
".", "0"), ExAC_EAS = c("0.0001", ".", "0"), ExAC_FIN = c("0",
".", "0"), ExAC_NFE = c("1.499e-05", ".", "1.502e-05"), ExAC_OTH = c("0",
".", "0"), ExAC_SAS = c("0", ".", "0"), cosmic70 = c(".",
".", "."), SIFT_score = c(".", ".", "."), SIFT_converted_rankscore = c(".",
".", "."), SIFT_pred = c(".", ".", "."), Polyphen2_HDIV_score = c(".",
".", "."), Polyphen2_HDIV_rankscore = c(".", ".", "."), Polyphen2_HDIV_pred = c(".",
".", "."), Polyphen2_HVAR_score = c(".", ".", "."), Polyphen2_HVAR_rankscore = c(".",
".", "."), Polyphen2_HVAR_pred = c(".", ".", "."), LRT_score = c(".",
".", "."), LRT_converted_rankscore = c(".", ".", "."), LRT_pred = c(".",
".", "."), MutationTaster_score = c(".", ".", "."), MutationTaster_converted_rankscore = c(".",
".", "."), MutationTaster_pred = c(".", ".", "."), MutationAssessor_score = c(".",
".", "."), MutationAssessor_score_rankscore = c(".", ".",
"."), MutationAssessor_pred = c(".", ".", "."), FATHMM_score = c(".",
".", "."), FATHMM_converted_rankscore = c(".", ".", "."),
FATHMM_pred = c(".", ".", "."), PROVEAN_score = c(".", ".",
"."), PROVEAN_converted_rankscore = c(".", ".", "."), PROVEAN_pred = c(".",
".", "."), VEST3_score = c(".", ".", "."), VEST3_rankscore = c(".",
".", "."), MetaSVM_score = c(".", ".", "."), MetaSVM_rankscore = c(".",
".", "."), MetaSVM_pred = c(".", ".", "."), MetaLR_score = c(".",
".", "."), MetaLR_rankscore = c(".", ".", "."), MetaLR_pred = c(".",
".", "."), `M-CAP_score` = c(".", ".", "."), `M-CAP_rankscore` = c(".",
".", "."), `M-CAP_pred` = c(".", ".", "."), CADD_raw = c(".",
".", "."), CADD_raw_rankscore = c(".", ".", "."), CADD_phred = c(".",
".", "."), DANN_score = c(".", ".", "."), DANN_rankscore = c(".",
".", "."), `fathmm-MKL_coding_score` = c(".", ".", "."),
`fathmm-MKL_coding_rankscore` = c(".", ".", "."), `fathmm-MKL_coding_pred` = c(".",
".", "."), Eigen_coding_or_noncoding = c(".", ".", "."),
`Eigen-raw` = c(".", ".", "."), `Eigen-PC-raw` = c(".", ".",
"."), GenoCanyon_score = c(".", ".", "."), GenoCanyon_score_rankscore = c(".",
".", "."), integrated_fitCons_score = c(".", ".", "."), integrated_fitCons_score_rankscore = c(".",
".", "."), integrated_confidence_value = c(".", ".", "."),
`GERP++_RS` = c(".", ".", "."), `GERP++_RS_rankscore` = c(".",
".", "."), phyloP100way_vertebrate = c(".", ".", "."), phyloP100way_vertebrate_rankscore = c(".",
".", "."), phyloP20way_mammalian = c(".", ".", "."), phyloP20way_mammalian_rankscore = c(".",
".", "."), phastCons100way_vertebrate = c(".", ".", "."),
phastCons100way_vertebrate_rankscore = c(".", ".", "."),
phastCons20way_mammalian = c(".", ".", "."), phastCons20way_mammalian_rankscore = c(".",
".", "."), SiPhy_29way_logOdds = c(".", ".", "."), SiPhy_29way_logOdds_rankscore = c(".",
".", "."), Interpro_domain = c(".", ".", "."), GTEx_V6_gene = c(".",
".", "."), GTEx_V6_tissue = c(".", ".", "."), Interpro_domain = c(".",
".", "."), dbscSNV_ADA_SCORE = c(".", ".", "."), dbscSNV_RF_SCORE = c(".",
".", "."), CLINSIG = c(".", ".", "."), CLNDBN = c(".", ".",
"."), CLNACC = c(".", ".", "."), CLNDSDB = c(".", ".", "."
), CLNDSDBID = c(".", ".", "."), HRC_AF = c(".", ".", "."
), HRC_AC = c(".", ".", "."), HRC_AN = c(".", ".", "."),
HRC_non1000G_AF = c(".", ".", "."), HRC_non1000G_AC = c(".",
".", "."), HRC_non1000G_AN = c(".", ".", "."), Kaviar_AF = c("0.0020051",
".", "1.29e-05"), Kaviar_AC = c("310", ".", "2"), Kaviar_AN = c("154602",
".", "154602"), nci60 = c(".", ".", "."), TumorVAF = c("0.5375",
"0.234043", "0.413043"), NormalVAF = c("0", "0", "0"), cytoBand = c(NA_character_,
NA_character_, NA_character_), phyloP7way_vertebrate = c(NA_character_,
NA_character_, NA_character_), phastCons7way_vertebrate = c(NA_character_,
NA_character_, NA_character_), T_VAF = c(NA_character_, NA_character_,
NA_character_), N_VAF = c(NA_character_, NA_character_, NA_character_
), COSMIC_OESO = c("no", "no", "no"), COSMIC_FULL = c("no",
"no", "no"), RESIST = c("no", "no", "no"), FRANKEL = c("no",
"no", "no"), CANCER_GENES_DAMAGING_ALTERATION = c("no", "no",
"no"), CANCER_HELPER = c("no", "no", "no"), N_vaf = c(NA_character_,
NA_character_, NA_character_)), row.names = c(NA, -3L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7fd1600094e0>)
>
> dim(small)
[1] 584702 139
>
我想从 small 数据框中提取与 big 数据框共有的突变(行),并保留 small 中这些行的全部列信息。
我试过了
merge(small,big)
Error in merge.data.table(small,big) :
x 有一些重复的列名称:Interpro_domain。请删除或重命名重复项,然后重试。
merge(small, big)
Error in merge.data.table(small, big) :
x has some duplicated column name(s): Interpro_domain. Please remove or rename the duplicate(s) and try again.
我也试过了
> subset(small, big)
Error in subset.data.table(small, big) : 'subset' must evaluate to logical
> intersect(small, big)
data frame with 0 columns and 0 rows
拜托,你可以帮帮我
尝试 merge(big, small, by = "Tumor_Sample_Barcode")。另外,我不太明白你到底想得到什么结果。
small 数据框中存在重复的列名,需要先把它们改成唯一的列名。
library(data.table)
# merge by tumor by barcode
merge(small, big, by = "Tumor_Sample_Barcode")
# Error in merge.data.table(small, big, by = "Tumor_Sample_Barcode") :
# x has some duplicated column name(s): Interpro_domain. Please remove or
# rename the duplicate(s) and try again.
正如预期,我们得到了错误:列名 "Interpro_domain" 是重复的。下面找出重复的列名,并使它们唯一:
# Make every column name in `small` unique so that merge() accepts the table.
# make.unique() is applied to the full name vector (not only to the duplicated
# subset) so a generated suffix such as "Interpro_domain.1" cannot collide with
# a column that already carries that name. Names that occur once are unchanged.
smallColNames <- colnames(small)
dupeCols <- smallColNames[duplicated(smallColNames)]  # names occurring more than once
dupeColsIx <- which(smallColNames %in% dupeCols)      # positions of every affected column
colnames(small) <- make.unique(smallColNames)         # e.g. "Interpro_domain", "Interpro_domain.1"
让我们再次合并:
merge(small, big, by = "Tumor_Sample_Barcode")
Error in vecseq(f__, len__, if (allow.cartesian || notjoin || !anyDuplicated(f__, :
Join results in 18 rows; more than 9 = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice.
此错误表明键列 "Tumor_Sample_Barcode" 的值在 big 和 small 中都有重复,因此合并会产生笛卡尔连接(每个匹配的行两两组合)。如果确定需要这样的结果,可以设置参数 allow.cartesian = TRUE:
merge(small, big, by = "Tumor_Sample_Barcode", allow.cartesian = TRUE)
这样可以运行,但它把 3 行和 6 行的数据合并成了 18 行(3 × 6),这可能不是您想要的结果。建议按多个列进行合并,或者先对其中一个数据集去重。
library("dplyr")
# `big` stores Start_Position as numeric while `small` stores it as character;
# convert so the join key has the same type on both sides. Going through
# as.character() keeps the conversion correct if the column were a factor.
small$Start_Position <- as.numeric(as.character(small$Start_Position))
# dplyr joins require unique column names in each input, and `small` contains
# "Interpro_domain" twice. make.unique() leaves already-unique names untouched.
names(small) <- make.unique(names(small))
# Join on every column the two tables share, spelled out explicitly instead of
# relying on the implicit natural join selected by `by = NULL`.
join_cols <- intersect(names(small), names(big))
all_inner <- inner_join(small, big, by = join_cols)
我有一个很大的文件,比如
> dput(head(big))
structure(list(Tumor_Sample_Barcode = c("LP6005334_DNA_H01",
"LP6005334_DNA_H01", "LP6005334_DNA_H01", "LP6005334_DNA_H01",
"LP6005334_DNA_H01", "LP6005334_DNA_H01"), Chromosome = c("chr1",
"chr1", "chr1", "chr1", "chr1", "chr1"), Start_Position = c(782112,
1026918, 1133283, 1431511, 1742395, 1864994), Reference_Allele = c("G",
"C", "C", "G", "C", "G"), Tumor_Seq_Allele2 = c("A", "T", "T",
"A", "T", "A")), row.names = c("1", "2", "3", "4", "5", "6"), class = "data.frame")
>
> dim(big)
[1] 555437 5
>
这包括突变的染色体位置
对于相同的样本,我有一个较小的数据框,如
> dput(head(small))
structure(list(Tumor_Sample_Barcode = c("LP6005334_DNA_H01",
"LP6005334_DNA_H01", "LP6005334_DNA_H01"), Chromosome = c("chr1",
"chr6", "chr16"), Start_Position = c("89616151", "51909815",
"20556547"), End_Position = c("89616151", "51909815", "20556547"
), Reference_Allele = c("-", "A", "-"), Tumor_Seq_Allele2 = c("T",
"-", "G"), Hugo_Symbol = c("GBP7", "PKHD1", "ACSM2B"), Variant_Classification = c("Frame_Shift_Ins",
"Frame_Shift_Del", "Frame_Shift_Ins"), tx = c("NM_207398", "NM_138694",
"NM_001105069"), exon = c("exon6", "exon25", "exon10"), txChange = c("c.732dupA",
"c.2664delT", "c.1212dupC"), aaChange = c("p.L245fs", "p.F888fs",
"p.G405fs"), Variant_Type = c("INS", "DEL", "INS"), Func.refGene = c("exonic",
"exonic", "exonic"), Gene.refGene = c("GBP7", "PKHD1", "ACSM2B"
), GeneDetail.refGene = c(".", ".", "."), ExonicFunc.refGene = c("frameshift insertion",
"frameshift deletion", "frameshift insertion"), AAChange.refGene = c("GBP7:NM_207398:exon6:c.732dupA:p.L245fs",
"PKHD1:NM_138694:exon25:c.2664delT:p.F888fs,PKHD1:NM_170724:exon25:c.2664delT:p.F888fs",
"ACSM2B:NM_001105069:exon10:c.1212dupC:p.G405fs,ACSM2B:NM_182617:exon11:c.1212dupC:p.G405fs"
), Func.knownGene = c("exonic", "exonic", "exonic"), Gene.knownGene = c("GBP7",
"PKHD1", "ACSM2B"), GeneDetail.knownGene = c(".", ".", "."),
ExonicFunc.knownGene = c("frameshift insertion", "frameshift deletion",
"frameshift insertion"), AAChange.knownGene = c("GBP7:uc001dna.2:exon6:c.732dupA:p.L245fs",
"PKHD1:uc003pah.1:exon25:c.2664delT:p.F888fs,PKHD1:uc003pai.3:exon25:c.2664delT:p.F888fs",
"ACSM2B:uc002dhk.4:exon10:c.1212dupC:p.G405fs,ACSM2B:uc010bwf.1:exon10:c.1212dupC:p.G405fs,ACSM2B:uc002dhj.4:exon11:c.1212dupC:p.G405fs"
), avsnp147 = c("rs200922375", ".", "rs750935206"), `1000g2015aug_all` = c("0.00738818",
".", "."), `1000g2015aug_amr` = c("0.0014", ".", "."), `1000g2015aug_sas` = c(".",
".", "."), `1000g2015aug_eur` = c(".", ".", "."), `1000g2015aug_eas` = c(".",
".", "."), esp6500siv2_all = c("0.0082", ".", "."), esp6500siv2_ea = c(".",
".", "."), esp6500siv2_aa = c("0.0242", ".", "."), ExAC_ALL = c("0.0021",
".", "8.254e-06"), ExAC_AFR = c("0.0239", ".", "0"), ExAC_AMR = c("0.0004",
".", "0"), ExAC_EAS = c("0.0001", ".", "0"), ExAC_FIN = c("0",
".", "0"), ExAC_NFE = c("1.499e-05", ".", "1.502e-05"), ExAC_OTH = c("0",
".", "0"), ExAC_SAS = c("0", ".", "0"), cosmic70 = c(".",
".", "."), SIFT_score = c(".", ".", "."), SIFT_converted_rankscore = c(".",
".", "."), SIFT_pred = c(".", ".", "."), Polyphen2_HDIV_score = c(".",
".", "."), Polyphen2_HDIV_rankscore = c(".", ".", "."), Polyphen2_HDIV_pred = c(".",
".", "."), Polyphen2_HVAR_score = c(".", ".", "."), Polyphen2_HVAR_rankscore = c(".",
".", "."), Polyphen2_HVAR_pred = c(".", ".", "."), LRT_score = c(".",
".", "."), LRT_converted_rankscore = c(".", ".", "."), LRT_pred = c(".",
".", "."), MutationTaster_score = c(".", ".", "."), MutationTaster_converted_rankscore = c(".",
".", "."), MutationTaster_pred = c(".", ".", "."), MutationAssessor_score = c(".",
".", "."), MutationAssessor_score_rankscore = c(".", ".",
"."), MutationAssessor_pred = c(".", ".", "."), FATHMM_score = c(".",
".", "."), FATHMM_converted_rankscore = c(".", ".", "."),
FATHMM_pred = c(".", ".", "."), PROVEAN_score = c(".", ".",
"."), PROVEAN_converted_rankscore = c(".", ".", "."), PROVEAN_pred = c(".",
".", "."), VEST3_score = c(".", ".", "."), VEST3_rankscore = c(".",
".", "."), MetaSVM_score = c(".", ".", "."), MetaSVM_rankscore = c(".",
".", "."), MetaSVM_pred = c(".", ".", "."), MetaLR_score = c(".",
".", "."), MetaLR_rankscore = c(".", ".", "."), MetaLR_pred = c(".",
".", "."), `M-CAP_score` = c(".", ".", "."), `M-CAP_rankscore` = c(".",
".", "."), `M-CAP_pred` = c(".", ".", "."), CADD_raw = c(".",
".", "."), CADD_raw_rankscore = c(".", ".", "."), CADD_phred = c(".",
".", "."), DANN_score = c(".", ".", "."), DANN_rankscore = c(".",
".", "."), `fathmm-MKL_coding_score` = c(".", ".", "."),
`fathmm-MKL_coding_rankscore` = c(".", ".", "."), `fathmm-MKL_coding_pred` = c(".",
".", "."), Eigen_coding_or_noncoding = c(".", ".", "."),
`Eigen-raw` = c(".", ".", "."), `Eigen-PC-raw` = c(".", ".",
"."), GenoCanyon_score = c(".", ".", "."), GenoCanyon_score_rankscore = c(".",
".", "."), integrated_fitCons_score = c(".", ".", "."), integrated_fitCons_score_rankscore = c(".",
".", "."), integrated_confidence_value = c(".", ".", "."),
`GERP++_RS` = c(".", ".", "."), `GERP++_RS_rankscore` = c(".",
".", "."), phyloP100way_vertebrate = c(".", ".", "."), phyloP100way_vertebrate_rankscore = c(".",
".", "."), phyloP20way_mammalian = c(".", ".", "."), phyloP20way_mammalian_rankscore = c(".",
".", "."), phastCons100way_vertebrate = c(".", ".", "."),
phastCons100way_vertebrate_rankscore = c(".", ".", "."),
phastCons20way_mammalian = c(".", ".", "."), phastCons20way_mammalian_rankscore = c(".",
".", "."), SiPhy_29way_logOdds = c(".", ".", "."), SiPhy_29way_logOdds_rankscore = c(".",
".", "."), Interpro_domain = c(".", ".", "."), GTEx_V6_gene = c(".",
".", "."), GTEx_V6_tissue = c(".", ".", "."), Interpro_domain = c(".",
".", "."), dbscSNV_ADA_SCORE = c(".", ".", "."), dbscSNV_RF_SCORE = c(".",
".", "."), CLINSIG = c(".", ".", "."), CLNDBN = c(".", ".",
"."), CLNACC = c(".", ".", "."), CLNDSDB = c(".", ".", "."
), CLNDSDBID = c(".", ".", "."), HRC_AF = c(".", ".", "."
), HRC_AC = c(".", ".", "."), HRC_AN = c(".", ".", "."),
HRC_non1000G_AF = c(".", ".", "."), HRC_non1000G_AC = c(".",
".", "."), HRC_non1000G_AN = c(".", ".", "."), Kaviar_AF = c("0.0020051",
".", "1.29e-05"), Kaviar_AC = c("310", ".", "2"), Kaviar_AN = c("154602",
".", "154602"), nci60 = c(".", ".", "."), TumorVAF = c("0.5375",
"0.234043", "0.413043"), NormalVAF = c("0", "0", "0"), cytoBand = c(NA_character_,
NA_character_, NA_character_), phyloP7way_vertebrate = c(NA_character_,
NA_character_, NA_character_), phastCons7way_vertebrate = c(NA_character_,
NA_character_, NA_character_), T_VAF = c(NA_character_, NA_character_,
NA_character_), N_VAF = c(NA_character_, NA_character_, NA_character_
), COSMIC_OESO = c("no", "no", "no"), COSMIC_FULL = c("no",
"no", "no"), RESIST = c("no", "no", "no"), FRANKEL = c("no",
"no", "no"), CANCER_GENES_DAMAGING_ALTERATION = c("no", "no",
"no"), CANCER_HELPER = c("no", "no", "no"), N_vaf = c(NA_character_,
NA_character_, NA_character_)), row.names = c(NA, -3L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x7fd1600094e0>)
>
> dim(small)
[1] 584702 139
>
我想从 small 数据框中提取与 big 数据框共有的突变(行),并保留 small 中这些行的全部列信息。
我试过了
merge(small,big)
Error in merge.data.table(small,big) :
x 有一些重复的列名称:Interpro_domain。请删除或重命名重复项,然后重试。
merge(small, big)
Error in merge.data.table(small, big) :
x has some duplicated column name(s): Interpro_domain. Please remove or rename the duplicate(s) and try again.
我也试过了
> subset(small, big)
Error in subset.data.table(small, big) : 'subset' must evaluate to logical
> intersect(small, big)
data frame with 0 columns and 0 rows
拜托,你可以帮帮我
尝试 merge(big, small, by = "Tumor_Sample_Barcode")。另外,我不太明白你到底想得到什么结果。
small 数据框中存在重复的列名,需要先把它们改成唯一的列名。
library(data.table)
# merge by tumor by barcode
merge(small, big, by = "Tumor_Sample_Barcode")
# Error in merge.data.table(small, big, by = "Tumor_Sample_Barcode") :
# x has some duplicated column name(s): Interpro_domain. Please remove or
# rename the duplicate(s) and try again.
正如预期,我们得到了错误:列名 "Interpro_domain" 是重复的。下面找出重复的列名,并使它们唯一:
# Make every column name in `small` unique so that merge() accepts the table.
# make.unique() is applied to the full name vector (not only to the duplicated
# subset) so a generated suffix such as "Interpro_domain.1" cannot collide with
# a column that already carries that name. Names that occur once are unchanged.
smallColNames <- colnames(small)
dupeCols <- smallColNames[duplicated(smallColNames)]  # names occurring more than once
dupeColsIx <- which(smallColNames %in% dupeCols)      # positions of every affected column
colnames(small) <- make.unique(smallColNames)         # e.g. "Interpro_domain", "Interpro_domain.1"
让我们再次合并:
merge(small, big, by = "Tumor_Sample_Barcode")
Error in vecseq(f__, len__, if (allow.cartesian || notjoin || !anyDuplicated(f__, :
Join results in 18 rows; more than 9 = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice.
此错误表明键列 "Tumor_Sample_Barcode" 的值在 big 和 small 中都有重复,因此合并会产生笛卡尔连接(每个匹配的行两两组合)。如果确定需要这样的结果,可以设置参数 allow.cartesian = TRUE:
merge(small, big, by = "Tumor_Sample_Barcode", allow.cartesian = TRUE)
这样可以运行,但它把 3 行和 6 行的数据合并成了 18 行(3 × 6),这可能不是您想要的结果。建议按多个列进行合并,或者先对其中一个数据集去重。
library("dplyr")
# `big` stores Start_Position as numeric while `small` stores it as character;
# convert so the join key has the same type on both sides. Going through
# as.character() keeps the conversion correct if the column were a factor.
small$Start_Position <- as.numeric(as.character(small$Start_Position))
# dplyr joins require unique column names in each input, and `small` contains
# "Interpro_domain" twice. make.unique() leaves already-unique names untouched.
names(small) <- make.unique(names(small))
# Join on every column the two tables share, spelled out explicitly instead of
# relying on the implicit natural join selected by `by = NULL`.
join_cols <- intersect(names(small), names(big))
all_inner <- inner_join(small, big, by = join_cols)