从 PubMed 抓取数据
Scrape data from PubMed
我编写了以下函数来使用 Entrez 从 PubMed 中提取数据:
def getFromPubMed(id):
    """Fetch one PubMed entry in MEDLINE text format and return its fields.

    Every field is read with a plain ``record[...]`` lookup, so a record
    that lacks any of the tags read below raises ``KeyError`` (only the
    ``AID`` lookup has a fallback, to the ``SO`` field).

    NOTE(review): the pasted source lost its indentation; the ``return`` is
    assumed to sit inside the loop, so only the first parsed record is
    used -- confirm against the original.

    Returns:
        A tuple ``(pmid, title, abstract, au, mesh, doi, pt, la, pmc)`` of
        strings, or ``None`` when no record is parsed.
    """
    def as_text(rec, tag):
        # Plain string form of a field.
        return str(rec[tag])

    def as_flat_text(rec, tag):
        # String form with quote and square-bracket characters stripped.
        text = as_text(rec, tag)
        for unwanted in ("'", "[", "]"):
            text = text.replace(unwanted, "")
        return text

    handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=str(id))
    for rec in Medline.parse(handle):
        # Lookup order is kept so the same missing field fails first.
        abstract = as_text(rec, "AB")
        mesh = as_flat_text(rec, "MH")
        pmid = as_text(rec, "PMID")
        title = as_flat_text(rec, "TI")
        pt = as_flat_text(rec, "PT")
        au = as_text(rec, "AU")
        dp = as_text(rec, "DP")  # read but not returned
        la = as_text(rec, "LA")
        pmc = as_text(rec, "PMC")
        si = as_text(rec, "SI")  # read but not returned
        try:
            doi = as_text(rec, "AID")
        except:
            # Fall back to whatever follows "doi:" in the SO field.
            doi = as_text(rec, "SO").split('doi:', 1)[1]
        return pmid, title, abstract, au, mesh, doi, pt, la, pmc
但是,这个函数并不总是有效,因为并非所有 MEDLINE 记录都包含全部字段。例如,有些 PMID 对应的记录不包含任何 MeSH 主题词。
我可以用 try-except 语句把每个字段的读取都包起来,例如 abstract:
try:
    abstract = str(record["AB"])
except KeyError:
    # The record has no "AB" field; fall back to an empty string.
    abstract = ""
但这似乎是一种笨拙的实现方式。什么是更优雅的解决方案?
您可以把提取字段的操作拆分到一个单独的函数中,例如:
def get_record_attributes(record, attr_details):
    """Extract a set of named attributes from a mapping-like record.

    Args:
        record: Mapping from field key to value.
        attr_details: Mapping from output attribute name to a spec dict.
            Each spec has a ``"key"`` entry naming the field to read and an
            optional ``"chars_to_remove"`` entry, an iterable of substrings
            to delete from the value.

    Returns:
        A dict with one entry per name in ``attr_details``. The value is
        ``""`` when the field is missing from ``record`` (or the spec has
        no ``"key"``). Values without a ``replace`` method (e.g. lists) are
        returned unchanged.
    """
    attributes = {}
    for attr_name, details in attr_details.items():
        try:
            value = record[details["key"]]
        except KeyError:
            # Missing field: keep the result shape stable with a default.
            attributes[attr_name] = ""
            continue
        try:
            # "chars_to_remove" is optional, so default to removing nothing.
            for char in details.get("chars_to_remove", ""):
                value = value.replace(char, "")
        except AttributeError:
            # Value is not string-like; leave it untouched.
            pass
        attributes[attr_name] = value
    return attributes
def getFromPubMed(id):
    """Fetch a PubMed entry as MEDLINE text and extract its fields.

    Each parsed record is handed to ``get_record_attributes`` together with
    a table that maps output attribute names to the record key to read and,
    optionally, the characters to strip from the value.

    This is an outline: the ``# ...`` markers show where further field
    specs and the handling of ``attributes`` belong.
    """
    # The spec table does not depend on the record, so build it once.
    attr_details = {
        "abstract": {"key": "AB"},
        "mesh": {"key": "MH", "chars_to_remove": "'[]"},
        # ...
        "aid": {"key": "AID"},
        "so": {"key": "SO"},
    }
    handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=str(id))
    records = Medline.parse(handle)
    for record in records:
        attributes = get_record_attributes(record, attr_details)
        # ...
怎么样:
# .get() avoids a KeyError when the record has no "MH" field; ``or ''``
# additionally maps an empty value to the empty string.
mesh = str(record.get("MH") or '')
因为空字典在布尔上下文中为 False(正如其他回答者所建议的那样)。
我编写了以下函数来使用 Entrez 从 PubMed 中提取数据:
def getFromPubMed(id):
    """Fetch one PubMed entry in MEDLINE text format and return its fields.

    Every field is read with a plain ``record[...]`` lookup, so a record
    that lacks any of the tags read below raises ``KeyError`` (only the
    ``AID`` lookup has a fallback, to the ``SO`` field).

    NOTE(review): the pasted source lost its indentation; the ``return`` is
    assumed to sit inside the loop, so only the first parsed record is
    used -- confirm against the original.

    Returns:
        A tuple ``(pmid, title, abstract, au, mesh, doi, pt, la, pmc)`` of
        strings, or ``None`` when no record is parsed.
    """
    def as_text(rec, tag):
        # Plain string form of a field.
        return str(rec[tag])

    def as_flat_text(rec, tag):
        # String form with quote and square-bracket characters stripped.
        text = as_text(rec, tag)
        for unwanted in ("'", "[", "]"):
            text = text.replace(unwanted, "")
        return text

    handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=str(id))
    for rec in Medline.parse(handle):
        # Lookup order is kept so the same missing field fails first.
        abstract = as_text(rec, "AB")
        mesh = as_flat_text(rec, "MH")
        pmid = as_text(rec, "PMID")
        title = as_flat_text(rec, "TI")
        pt = as_flat_text(rec, "PT")
        au = as_text(rec, "AU")
        dp = as_text(rec, "DP")  # read but not returned
        la = as_text(rec, "LA")
        pmc = as_text(rec, "PMC")
        si = as_text(rec, "SI")  # read but not returned
        try:
            doi = as_text(rec, "AID")
        except:
            # Fall back to whatever follows "doi:" in the SO field.
            doi = as_text(rec, "SO").split('doi:', 1)[1]
        return pmid, title, abstract, au, mesh, doi, pt, la, pmc
但是,这个函数并不总是有效,因为并非所有 MEDLINE 记录都包含全部字段。例如,有些 PMID 对应的记录不包含任何 MeSH 主题词。
我可以用 try-except 语句把每个字段的读取都包起来,例如 abstract:
try:
    abstract = str(record["AB"])
except KeyError:
    # The record has no "AB" field; fall back to an empty string.
    abstract = ""
但这似乎是一种笨拙的实现方式。什么是更优雅的解决方案?
您可以把提取字段的操作拆分到一个单独的函数中,例如:
def get_record_attributes(record, attr_details):
    """Extract a set of named attributes from a mapping-like record.

    Args:
        record: Mapping from field key to value.
        attr_details: Mapping from output attribute name to a spec dict.
            Each spec has a ``"key"`` entry naming the field to read and an
            optional ``"chars_to_remove"`` entry, an iterable of substrings
            to delete from the value.

    Returns:
        A dict with one entry per name in ``attr_details``. The value is
        ``""`` when the field is missing from ``record`` (or the spec has
        no ``"key"``). Values without a ``replace`` method (e.g. lists) are
        returned unchanged.
    """
    attributes = {}
    for attr_name, details in attr_details.items():
        try:
            value = record[details["key"]]
        except KeyError:
            # Missing field: keep the result shape stable with a default.
            attributes[attr_name] = ""
            continue
        try:
            # "chars_to_remove" is optional, so default to removing nothing.
            for char in details.get("chars_to_remove", ""):
                value = value.replace(char, "")
        except AttributeError:
            # Value is not string-like; leave it untouched.
            pass
        attributes[attr_name] = value
    return attributes
def getFromPubMed(id):
    """Fetch a PubMed entry as MEDLINE text and extract its fields.

    Each parsed record is handed to ``get_record_attributes`` together with
    a table that maps output attribute names to the record key to read and,
    optionally, the characters to strip from the value.

    This is an outline: the ``# ...`` markers show where further field
    specs and the handling of ``attributes`` belong.
    """
    # The spec table does not depend on the record, so build it once.
    attr_details = {
        "abstract": {"key": "AB"},
        "mesh": {"key": "MH", "chars_to_remove": "'[]"},
        # ...
        "aid": {"key": "AID"},
        "so": {"key": "SO"},
    }
    handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text", id=str(id))
    records = Medline.parse(handle)
    for record in records:
        attributes = get_record_attributes(record, attr_details)
        # ...
怎么样:
# .get() avoids a KeyError when the record has no "MH" field; ``or ''``
# additionally maps an empty value to the empty string.
mesh = str(record.get("MH") or '')
因为空字典在布尔上下文中为 False
,正如