.attrs 和 R 列表中的重复项
.attrs and repetitive entries in an R list
我正在尝试使用此 R 脚本从 NCBI 获取一些信息:
require(rentrez)
require(magrittr)
# Single rs identifier used as the search term for the one-off lookup.
rs = "rs16891982"
# Vector of twelve rs identifiers.
rss = c(
  "rs16891982", "rs12203592", "rs1408799", "rs10756819",
  "rs35264875", "rs1393350", "rs12821256", "rs17128291",
  "rs1800407", "rs12913832", "rs1805008", "rs4911414"
)
# Given an rs number, search dbSNP for it and return its chromosome
# position, allele, allele frequency and gene annotation.
#
# Args:
#   rs: rs identifier used as the search term, e.g. "rs16891982".
#
# Returns:
#   A one-row data.frame with the columns snp, chrpos, EA, fEA and
#   genes, or invisible NULL (after a warning) when the summary that
#   comes back is empty.
annotateGeneName = function(rs) {
  anno = rentrez::entrez_search(db = "snp", term = rs) %>%
    "[["("ids") %>%
    rentrez::entrez_summary(db = "snp", id = .)
  if (length(anno) < 1) {
    warning(sprintf("%s not found in dbSNP!", rs))
    return(invisible(NULL))
  }
  # There might be multiple entries. If "snp_id" is not among the names,
  # multiple SNPs have been returned for this search and anno is a list
  # of records, so just take the first hit.
  if (!"snp_id" %in% names(anno)) {
    anno = anno[[1]]
  }
  chrpos = anno[["chrpos"]]
  # Drop everything from the first "(" onwards. The backslash has to be
  # doubled: "\(" is not a valid escape inside an R string literal and
  # stops the script from parsing.
  EA = anno$allele_origin %>% gsub("\\(.*", "", .)
  # Drop everything from the first "/" onwards, then everything up to
  # and including the last remaining "=".
  fEA = anno$global_maf %>% gsub("/.*", "", .) %>% gsub("^.*=", "", .)
  # First gene entry, or NA when there is none.
  genes = dplyr::first(anno$genes, default = NA)
  data.frame(snp = rs, chrpos = chrpos, EA = EA, fEA = fEA, genes = genes)
}
# Run annotateGeneName() on every element of `rss` and combine the
# per-element results with rbind().
annotateGeneNames = function(rss) {
  per_snp = lapply(rss, annotateGeneName)
  do.call(rbind, per_snp)
}
# Search dbSNP for `rs`, fetch the XML record of the first hit and
# print it as a nested R list.
ids = rentrez::entrez_search(db = "snp", term = rs) %>% "[["("ids")
x = rentrez::entrez_fetch(db = "snp", id = ids[1], rettype = "xml")
# xmlParse() and xmlToList() come from the XML package, which this
# script never attaches; qualify them so the calls resolve.
snp1xml = XML::xmlParse(x)
snp1list = XML::xmlToList(snp1xml)
print(snp1list)
当您打印出结果时,您可以看到如下内容:
...
$Rs$Sequence$.attrs
exemplarSs ancestralAllele
"285153617" "C,C,C,C,C,C"
$Rs$Ss$.attrs
ssId handle batchId locSnpId subSnpClass orient
"23456916" "PERLEGEN" "12309" "afd3693051" "snp" "forward"
strand molType buildId methodClass validated
"bottom" "genomic" "123" "hybridize" "by-cluster"
$Rs$Ss$.attrs
ssId handle
"28510204" "MGC_GENOME_DIFF"
batchId locSnpId
"12314" "BC064405x37550355-C16403799G"
subSnpClass orient
"snp" "forward"
strand molType
"bottom" "cDNA"
buildId methodClass
"126" "computed"
$Rs$Ss
$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "TTCCCTTTCATTTTCCAGAGAAACTTGATCAGGAACCCACTGATTCCAAGAGCAAAGTAATCAGTGAGGAAATGACACCTAGAATTCATGATGAAAAAAGGATGCTTTATATGGTCCTTTTTAAGGTGATAGTTTTTCCTGACGTCCATAGATTTATTAAGAATCTGGTATTTTAAACAGTAGGAAATACACATAGAAATATCAAATCCAAGTTGTGCTAGACCAGAAACTTTTAGAAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"
$Rs$Ss$.attrs
$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "AAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"
$Rs$Assembly$Component$MapLoc$FxnSet
geneId symbol mrnaAcc mrnaVer protAcc protVer
"51151" "SLC45A2" "NM_016180" "4" "NP_057264" "3"
fxnClass readingFrame allele residue aaPosition
"reference" "3" "C" "F" "373"
$Rs$Assembly$Component$MapLoc$FxnSet
geneId symbol mrnaAcc
"51151" "SLC45A2" "NM_016180"
mrnaVer protAcc protVer
"4" "NP_057264" "3"
fxnClass readingFrame allele
"missense" "3" "G"
residue aaPosition soTerm
"L" "373" "non_synonymous_codon"
此列表中有很多 .attrs 条目,而且它们经常重复。还有其他重复条目,例如:
$Rs$Ss$Sequence$Seq5
$Rs$Assembly$Component$MapLoc$FxnSet
等等
.attrs 是什么意思,我如何理解这些数据?
我不知道如何在一个列表中包含两个同名条目。
在 R 中,`attributes` 和 `attr` 是用于设置或提取对象属性的函数;但据我所知,这里的 `.attrs` 只是列表中某个元素的名称。它的含义取决于生成该列表的代码的作者:在这里,它是由解析 XML 并将其转换为 R 列表的那段代码(即 `xmlToList`)产生的,通常用来存放对应 XML 节点的属性。它不属于 R 语言本身的定义,因此请查阅 XML 包中 `xmlToList` 的文档。
我现在明白了,您是对列表中存在同名元素感到困惑。这在 R 中是允许的。按名称使用 “[” 或 “[[” 只会取回树中第一个与该名称匹配的元素;要访问其余的同名元素,需要使用数字索引,或者借助 lapply 或 sapply 逐个遍历树的上一层元素,从而避免歧义。
> mylist=vector("list", length=2)
> mylist
[[1]]
NULL
[[2]]
NULL
> names(mylist) <- c("a","a")
> mylist
$a
NULL
$a
NULL
> mylist[['a']]
NULL
> mylist['a']
$a
NULL
> lapply( mylist , "[[", "a")
$a
NULL
$a
NULL
(另外,我没有看到您定义的这些函数(annotateGeneName、annotateGeneNames)中有任何一个在提取和处理该数据的过程中被用到。)
我正在尝试使用此 R 脚本从 NCBI 获取一些信息:
require(rentrez)
require(magrittr)
# Single rs identifier used as the search term for the one-off lookup.
rs = "rs16891982"
# Vector of twelve rs identifiers.
rss = c(
  "rs16891982", "rs12203592", "rs1408799", "rs10756819",
  "rs35264875", "rs1393350", "rs12821256", "rs17128291",
  "rs1800407", "rs12913832", "rs1805008", "rs4911414"
)
# Given an rs number, search dbSNP for it and return its chromosome
# position, allele, allele frequency and gene annotation.
#
# Args:
#   rs: rs identifier used as the search term, e.g. "rs16891982".
#
# Returns:
#   A one-row data.frame with the columns snp, chrpos, EA, fEA and
#   genes, or invisible NULL (after a warning) when the summary that
#   comes back is empty.
annotateGeneName = function(rs) {
  anno = rentrez::entrez_search(db = "snp", term = rs) %>%
    "[["("ids") %>%
    rentrez::entrez_summary(db = "snp", id = .)
  if (length(anno) < 1) {
    warning(sprintf("%s not found in dbSNP!", rs))
    return(invisible(NULL))
  }
  # There might be multiple entries. If "snp_id" is not among the names,
  # multiple SNPs have been returned for this search and anno is a list
  # of records, so just take the first hit.
  if (!"snp_id" %in% names(anno)) {
    anno = anno[[1]]
  }
  chrpos = anno[["chrpos"]]
  # Drop everything from the first "(" onwards. The backslash has to be
  # doubled: "\(" is not a valid escape inside an R string literal and
  # stops the script from parsing.
  EA = anno$allele_origin %>% gsub("\\(.*", "", .)
  # Drop everything from the first "/" onwards, then everything up to
  # and including the last remaining "=".
  fEA = anno$global_maf %>% gsub("/.*", "", .) %>% gsub("^.*=", "", .)
  # First gene entry, or NA when there is none.
  genes = dplyr::first(anno$genes, default = NA)
  data.frame(snp = rs, chrpos = chrpos, EA = EA, fEA = fEA, genes = genes)
}
# Run annotateGeneName() on every element of `rss` and combine the
# per-element results with rbind().
annotateGeneNames = function(rss) {
  per_snp = lapply(rss, annotateGeneName)
  do.call(rbind, per_snp)
}
# Search dbSNP for `rs`, fetch the XML record of the first hit and
# print it as a nested R list.
ids = rentrez::entrez_search(db = "snp", term = rs) %>% "[["("ids")
x = rentrez::entrez_fetch(db = "snp", id = ids[1], rettype = "xml")
# xmlParse() and xmlToList() come from the XML package, which this
# script never attaches; qualify them so the calls resolve.
snp1xml = XML::xmlParse(x)
snp1list = XML::xmlToList(snp1xml)
print(snp1list)
当您打印出结果时,您可以看到如下内容:
...
$Rs$Sequence$.attrs
exemplarSs ancestralAllele
"285153617" "C,C,C,C,C,C"
$Rs$Ss$.attrs
ssId handle batchId locSnpId subSnpClass orient
"23456916" "PERLEGEN" "12309" "afd3693051" "snp" "forward"
strand molType buildId methodClass validated
"bottom" "genomic" "123" "hybridize" "by-cluster"
$Rs$Ss$.attrs
ssId handle
"28510204" "MGC_GENOME_DIFF"
batchId locSnpId
"12314" "BC064405x37550355-C16403799G"
subSnpClass orient
"snp" "forward"
strand molType
"bottom" "cDNA"
buildId methodClass
"126" "computed"
$Rs$Ss
$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "TTCCCTTTCATTTTCCAGAGAAACTTGATCAGGAACCCACTGATTCCAAGAGCAAAGTAATCAGTGAGGAAATGACACCTAGAATTCATGATGAAAAAAGGATGCTTTATATGGTCCTTTTTAAGGTGATAGTTTTTCCTGACGTCCATAGATTTATTAAGAATCTGGTATTTTAAACAGTAGGAAATACACATAGAAATATCAAATCCAAGTTGTGCTAGACCAGAAACTTTTAGAAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"
$Rs$Ss$.attrs
$Rs$Ss$Sequence
$Rs$Ss$Sequence$Seq5
[1] "AAGACATCCTTAGGAGAGAGAAAGACTTACAAGAATAAAGTGAGGAAAACACGGAGTTGATGCA"
$Rs$Assembly$Component$MapLoc$FxnSet
geneId symbol mrnaAcc mrnaVer protAcc protVer
"51151" "SLC45A2" "NM_016180" "4" "NP_057264" "3"
fxnClass readingFrame allele residue aaPosition
"reference" "3" "C" "F" "373"
$Rs$Assembly$Component$MapLoc$FxnSet
geneId symbol mrnaAcc
"51151" "SLC45A2" "NM_016180"
mrnaVer protAcc protVer
"4" "NP_057264" "3"
fxnClass readingFrame allele
"missense" "3" "G"
residue aaPosition soTerm
"L" "373" "non_synonymous_codon"
此列表中有很多 .attrs 条目,而且它们经常重复。还有其他重复条目,例如:
$Rs$Ss$Sequence$Seq5
$Rs$Assembly$Component$MapLoc$FxnSet
等等
.attrs 是什么意思,我如何理解这些数据? 我不知道如何在一个列表中包含两个同名条目。
在 R 中,`attributes` 和 `attr` 是用于设置或提取对象属性的函数;但据我所知,这里的 `.attrs` 只是列表中某个元素的名称。它的含义取决于生成该列表的代码的作者:在这里,它是由解析 XML 并将其转换为 R 列表的那段代码(即 `xmlToList`)产生的,通常用来存放对应 XML 节点的属性。它不属于 R 语言本身的定义,因此请查阅 XML 包中 `xmlToList` 的文档。
我现在明白了,您是对列表中存在同名元素感到困惑。这在 R 中是允许的。按名称使用 “[” 或 “[[” 只会取回树中第一个与该名称匹配的元素;要访问其余的同名元素,需要使用数字索引,或者借助 lapply 或 sapply 逐个遍历树的上一层元素,从而避免歧义。
> mylist=vector("list", length=2)
> mylist
[[1]]
NULL
[[2]]
NULL
> names(mylist) <- c("a","a")
> mylist
$a
NULL
$a
NULL
> mylist[['a']]
NULL
> mylist['a']
$a
NULL
> lapply( mylist , "[[", "a")
$a
NULL
$a
NULL
(另外,我没有看到您定义的这些函数(annotateGeneName、annotateGeneNames)中有任何一个在提取和处理该数据的过程中被用到。)