HGNC 基因名称的基因组坐标
Genomic coordinates of HGNC gene names
我想使用来自 Bioconductor 的 R 包 GenomicFeatures 和 TxDb.Hsapiens.UCSC.hg19.knownGene,获取我的列表(由 hgnc 基因 id 组成)中人类基因的坐标。
# Load the UCSC hg19 knownGene annotation and keep a short handle to it.
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

# Gene list, given as HGNC symbols.
my_genes <- c("INO80", "NASP", "INO80D", "SMARCA1")

# Request transcript coordinates, passing the symbols as GENEID keys.
select(
  txdb,
  keys = my_genes,
  keytype = "GENEID",
  columns = c("TXCHROM", "TXSTART", "TXEND", "TXSTRAND")
)
但是,它不起作用,因为 txdb 不采用 hgnc 标识符;如何解决?我找不到任何支持 hgnc 的合适键类型,也不确定如何匹配我拥有的 hgnc id 和来自 txdb 的 GENEID。
我不熟悉 TxDb,也不清楚它接受/包含哪些种类的属性。
不过,我可以为您提供一种使用 biomaRt 包的替代方法,它也接受 hgnc。
library(biomaRt)

# HGNC symbols to look up.
my_genes <- c("INO80", "NASP", "INO80D", "SMARCA1")

# Connect to the GRCh37 (hg19) Ensembl archive so the coordinates are on the
# same assembly as TxDb.Hsapiens.UCSC.hg19.knownGene; the default Ensembl mart
# serves the current assembly (GRCh38) instead.
m <- useEnsembl(
  biomart = "ensembl",
  dataset = "hsapiens_gene_ensembl",
  GRCh = 37
)

# `df` is a data.frame holding the requested attributes for every record whose
# hgnc_symbol matches one of `my_genes`.
df <- getBM(
  mart = m,
  attributes = c(
    "hgnc_symbol", "description", "chromosome_name",
    "start_position", "end_position", "strand",
    "ensembl_gene_id"
  ),
  filters = "hgnc_symbol",
  values = my_genes
)
它有大量的属性可供选择,您可以通过简单的操作找到它们:
listAttributes(m) # list the attributes available for the dataset selected in `m`
更多信息请查看??biomaRt
希望对您有所帮助。
因为 txdb 是基于转录本的数据库,其中没有(hgnc)geneSymbol,但是有 EntrezID。
首先,我们需要将geneSymbol映射到EntrezID。
library(org.Hs.eg.db)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

# Query org.Hs.eg.db with the gene symbols as keys and ask for the matching
# Entrez IDs alongside them.
myGeneSymbols <- select(
  org.Hs.eg.db,
  keys = c("INO80", "NASP", "INO80D", "SMARCA1"),
  keytype = "SYMBOL",
  columns = c("SYMBOL", "ENTREZID")
)
# Output shown in the original answer:
#    SYMBOL ENTREZID
# 1   INO80    54617
# 2    NASP     4678
# 3  INO80D    54891
# 4 SMARCA1     6594
然后我们可以对 txdb 取子集:
# Fetch the transcripts recorded for each Entrez ID, using the IDs obtained
# above as GENEID keys into the TxDb object.
myGeneSymbolsTx <- select(
  TxDb.Hsapiens.UCSC.hg19.knownGene,
  keys = myGeneSymbols[["ENTREZID"]],
  keytype = "GENEID",
  columns = c("GENEID", "TXID", "TXCHROM", "TXSTART", "TXEND")
)
# Output shown in the original answer:
#    GENEID  TXID TXCHROM   TXSTART     TXEND
# 1   54617 55599   chr15  41267988  41280172
# 2   54617 55600   chr15  41271079  41408340
# 3   54617 55601   chr15  41271079  41408340
# 4    4678  1229    chr1  46049660  46079853
# 5    4678  1230    chr1  46049660  46081143
# 6    4678  1231    chr1  46049660  46084578
# 7    4678  1232    chr1  46049660  46084578
# 8    4678  1233    chr1  46049660  46084578
# 9    4678  1234    chr1  46067733  46075197
# 10   4678  1235    chr1  46077135  46084578
# 11  54891 12593    chr2 206858445 206950906
# 12   6594 77970    chrX 128580478 128657460
# 13   6594 77971    chrX 128580478 128657460
# 14   6594 77972    chrX 128580740 128657460
# 15   6594 77973    chrX 128580740 128657460
如果需要,我们可以使用 merge 将 geneSymbol 添加到表中:
# Join the symbol table to the transcript table; ENTREZID on the left
# corresponds to GENEID on the right.
res <- merge(
  x = myGeneSymbols,
  y = myGeneSymbolsTx,
  by.x = "ENTREZID",
  by.y = "GENEID"
)
# Output shown in the original answer:
#    ENTREZID  SYMBOL  TXID TXCHROM   TXSTART     TXEND
# 1      4678    NASP  1229    chr1  46049660  46079853
# 2      4678    NASP  1230    chr1  46049660  46081143
# 3      4678    NASP  1231    chr1  46049660  46084578
# 4      4678    NASP  1232    chr1  46049660  46084578
# 5      4678    NASP  1233    chr1  46049660  46084578
# 6      4678    NASP  1234    chr1  46067733  46075197
# 7      4678    NASP  1235    chr1  46077135  46084578
# 8     54617   INO80 55599   chr15  41267988  41280172
# 9     54617   INO80 55600   chr15  41271079  41408340
# 10    54617   INO80 55601   chr15  41271079  41408340
# 11    54891  INO80D 12593    chr2 206858445 206950906
# 12     6594 SMARCA1 77970    chrX 128580478 128657460
# 13     6594 SMARCA1 77971    chrX 128580478 128657460
# 14     6594 SMARCA1 77972    chrX 128580740 128657460
# 15     6594 SMARCA1 77973    chrX 128580740 128657460
我想使用来自 Bioconductor 的 R 包 GenomicFeatures 和 TxDb.Hsapiens.UCSC.hg19.knownGene,获取我的列表(由 hgnc 基因 id 组成)中人类基因的坐标。
# Attach the UCSC hg19 knownGene TxDb package and alias the database object.
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

# The genes of interest, written as HGNC symbols.
my_genes <- c("INO80", "NASP", "INO80D", "SMARCA1")

# Ask for chromosome, start, end and strand, with the symbols used as GENEID
# keys.
select(
  txdb,
  keys = my_genes,
  keytype = "GENEID",
  columns = c("TXCHROM", "TXSTART", "TXEND", "TXSTRAND")
)
但是,它不起作用,因为 txdb 不采用 hgnc 标识符;如何解决?我找不到任何支持 hgnc 的合适键类型,也不确定如何匹配我拥有的 hgnc id 和来自 txdb 的 GENEID。
我不熟悉 TxDb,也不清楚它接受/包含哪些种类的属性。
不过,我可以为您提供一种使用 biomaRt 包的替代方法,它也接受 hgnc。
library(biomaRt)

# HGNC symbols to look up.
my_genes <- c("INO80", "NASP", "INO80D", "SMARCA1")

# Use the GRCh37 (hg19) Ensembl archive: the question asks for hg19
# coordinates, whereas the default Ensembl mart reports positions on the
# current assembly (GRCh38).
m <- useEnsembl(
  biomart = "ensembl",
  dataset = "hsapiens_gene_ensembl",
  GRCh = 37
)

# `df` is a data.frame with the requested attributes for each record whose
# hgnc_symbol is in `my_genes`.
df <- getBM(
  mart = m,
  attributes = c(
    "hgnc_symbol", "description", "chromosome_name",
    "start_position", "end_position", "strand",
    "ensembl_gene_id"
  ),
  filters = "hgnc_symbol",
  values = my_genes
)
它有大量的属性可供选择,您可以通过简单的操作找到它们:
listAttributes(m) # list the attributes available for the dataset selected in `m`
更多信息请查看??biomaRt
希望对您有所帮助。
因为 txdb 是基于转录本的数据库,其中没有(hgnc)geneSymbol,但是有 EntrezID。
首先,我们需要将geneSymbol映射到EntrezID。
library(org.Hs.eg.db)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

# Look the gene symbols up in org.Hs.eg.db and return each symbol together
# with its Entrez ID.
myGeneSymbols <- select(
  org.Hs.eg.db,
  keys = c("INO80", "NASP", "INO80D", "SMARCA1"),
  keytype = "SYMBOL",
  columns = c("SYMBOL", "ENTREZID")
)
# Output shown in the original answer:
#    SYMBOL ENTREZID
# 1   INO80    54617
# 2    NASP     4678
# 3  INO80D    54891
# 4 SMARCA1     6594
然后我们可以对 txdb 取子集:
# Use the Entrez IDs from the previous step as GENEID keys and pull the
# transcript ID, chromosome, start and end for each one.
myGeneSymbolsTx <- select(
  TxDb.Hsapiens.UCSC.hg19.knownGene,
  keys = myGeneSymbols[["ENTREZID"]],
  keytype = "GENEID",
  columns = c("GENEID", "TXID", "TXCHROM", "TXSTART", "TXEND")
)
# Output shown in the original answer:
#    GENEID  TXID TXCHROM   TXSTART     TXEND
# 1   54617 55599   chr15  41267988  41280172
# 2   54617 55600   chr15  41271079  41408340
# 3   54617 55601   chr15  41271079  41408340
# 4    4678  1229    chr1  46049660  46079853
# 5    4678  1230    chr1  46049660  46081143
# 6    4678  1231    chr1  46049660  46084578
# 7    4678  1232    chr1  46049660  46084578
# 8    4678  1233    chr1  46049660  46084578
# 9    4678  1234    chr1  46067733  46075197
# 10   4678  1235    chr1  46077135  46084578
# 11  54891 12593    chr2 206858445 206950906
# 12   6594 77970    chrX 128580478 128657460
# 13   6594 77971    chrX 128580478 128657460
# 14   6594 77972    chrX 128580740 128657460
# 15   6594 77973    chrX 128580740 128657460
如果需要,我们可以使用 merge 将 geneSymbol 添加到表中:
res <- merge(myGeneSymbols, myGeneSymbolsTx, by.x = "ENTREZID", by.y = "GENEID")
# ENTREZID SYMBOL TXID TXCHROM TXSTART TXEND
# 1 4678 NASP 1229 chr1 46049660 46079853
# 2 4678 NASP 1230 chr1 46049660 46081143
# 3 4678 NASP 1231 chr1 46049660 46084578
# 4 4678 NASP 1232 chr1 46049660 46084578
# 5 4678 NASP 1233 chr1 46049660 46084578
# 6 4678 NASP 1234 chr1 46067733 46075197
# 7 4678 NASP 1235 chr1 46077135 46084578
# 8 54617 INO80 55599 chr15 41267988 41280172
# 9 54617 INO80 55600 chr15 41271079 41408340
# 10 54617 INO80 55601 chr15 41271079 41408340
# 11 54891 INO80D 12593 chr2 206858445 206950906
# 12 6594 SMARCA1 77970 chrX 128580478 128657460
# 13 6594 SMARCA1 77971 chrX 128580478 128657460
# 14 6594 SMARCA1 77972 chrX 128580740 128657460
# 15 6594 SMARCA1 77973 chrX 128580740 128657460