如何从 efetch(Biopython、Entrez)中提取摘要?
How can I extract the abstract from efetch (Biopython, Entrez)?
我是 python 的新手,想使用 bio 包中的 entrez 系统从 pubmed 中提取摘要。
我通过 esearch 获得了我的 UID(存储在 my_list_ges 中),
并且可以使用 efetch 下载一个条目。
然而,现在结果是一个字典列表,条目看起来像字典,但我无法访问它们:
# Set the e-mail address attribute on the Entrez module.
Entrez.email= "my-email@provider.sth"
# Fetch the first UID in my_list_ges from the pubmed database as XML.
handle=Entrez.efetch(db="pubmed",id=my_list_ges[0],rettype="null",retmode="xml")
record = Entrez.read(handle)
# record is a list here, so indexing it with a string raises the
# TypeError quoted below.
abstract=record["Abstract"]
# Not reached when the line above raises.
handle.close()
结果是类型错误:
TypeError: list indices must be integers, not str
我在尝试从第一条记录中检索 'Abstract'
时得到 KeyError
:
>>> record[0]["Abstract"]
KeyError: 'Abstract'
这很奇怪,因为在 esearch 的结果中,我可以通过字典键轻松访问我的 UID。
record[0] 的结构是:
{u'MedlineCitation': DictElement({
u'OtherID': [],
u'OtherAbstract': [],
u'CitationSubset': ['IM'],
u'KeywordList': [],
u'DateCreated': {u'Month': '03', u'Day': '17', u'Year': '2016'},
u'SpaceFlightMission': [],
u'GeneralNote': [],
u'Article':
DictElement({
u'ArticleDate': [
DictElement({u'Month': '03', u'Day': '16', u'Year': '2016'}, attributes={u'DateType': u'Electronic'})],
u'Pagination': {u'MedlinePgn': 'e0151666'},
u'AuthorList': ListElement([
DictElement({
u'LastName': "O'Neill",
u'Initials': 'KE',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Kathy E'
}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Bredenkamp',
u'Initials': 'N', u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Nicholas'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Tischner',
u'Initials': 'C',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Christin'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Vaidya',
u'Initials': 'HJ',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Harsh J'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Stenhouse',
u'Initials': 'FH',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}], u'ForeName': 'Frances H'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Peddie',
u'Initials': 'CD',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'C Diana'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Nowell',
u'Initials': 'CS',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Craig S'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Gaskell',
u'Initials': 'T',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Terri'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Blackburn',
u'Initials': 'CC',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}], u'ForeName': 'C Clare'}, attributes={u'ValidYN': u'Y'})],
attributes={u'Type': u'authors', u'CompleteYN': u'Y'}),
u'Language': ['eng'],
u'PublicationTypeList': [StringElement('Journal Article', attributes={u'UI': u'D016428'})],
u'Journal': {
u'ISSN': StringElement('1932-6203', attributes={u'IssnType': u'Electronic'}),
u'ISOAbbreviation': 'PLoS ONE',
u'JournalIssue': DictElement({
u'Volume': '11',
u'Issue': '3',
u'PubDate': {u'Year': '2016'}}, attributes={u'CitedMedium': u'Internet'}),
u'Title': 'PloS one'},
u'ArticleTitle': 'Foxn1 Is Dynamically Regulated in Thymic Epithelial Cells during Embryogenesis and at the Onset of Thymic Involution.',
u'ELocationID': [StringElement('10.1371/journal.pone.0151666', attributes={u'ValidYN': u'Y', u'EIdType': u'doi'})],
u'Abstract': {u'AbstractText': ['--Unnecessarily long abstract removed --']}}, attributes={u'PubModel': u'Electronic-eCollection'}),
u'PMID': StringElement('26983083', attributes={u'Version': u'1'}),
u'MedlineJournalInfo': {
u'MedlineTA': 'PLoS One',
u'Country': 'United States',
u'NlmUniqueID': '101285081',
u'ISSNLinking': '1932-6203'}}, attributes={u'Owner': u'NLM', u'Status': u'In-Data-Review'}),
u'PubmedData': {
u'ArticleIdList': [
StringElement('10.1371/journal.pone.0151666', attributes={u'IdType': u'doi'}),
StringElement('PONE-D-15-47173', attributes={u'IdType': u'pii'}),
StringElement('26983083', attributes={u'IdType': u'pubmed'})],
u'PublicationStatus': 'epublish',
u'History': [
DictElement({u'Month': '', u'Day': '', u'Year': '2016'}, attributes={u'PubStatus': u'ecollection'}),
DictElement({u'Month': '10', u'Day': '28', u'Year': '2015'}, attributes={u'PubStatus': u'received'}),
DictElement({u'Month': '3', u'Day': '2', u'Year': '2016'}, attributes={u'PubStatus': u'accepted'}),
DictElement({u'Month': '3', u'Day': '16', u'Year': '2016'}, attributes={u'PubStatus': u'epublish'}),
DictElement({u'Minute': '0', u'Month': '3', u'Day': '17', u'Hour': '6', u'Year': '2016'}, attributes={u'PubStatus': u'entrez'}),
DictElement({u'Minute': '0', u'Month': '3', u'Day': '18', u'Hour': '6', u'Year': '2016'}, attributes={u'PubStatus': u'pubmed'}),
DictElement({u'Minute': '0', u'Month': '3', u'Day': '18', u'Hour': '6', u'Year': '2016'}, attributes={u'PubStatus': u'medline'})]}
}
我发现获取 Medline 格式的记录再对其进行解析要容易得多。下面给出针对相关查询 query = "Tischner[AU] Cortex-specific down-regulation" 的完整可运行代码。
下面代码的关键点是 fetch_rec()
函数使用 rettype='Medline', retmode='text',
然后使用 BioPython 的 Medline 模块解析返回的记录。
from StringIO import StringIO
from Bio import Entrez, Medline
def search_medline(query, email):
    """Run a PubMed ESearch for *query* and return the parsed result.

    Args:
        query: Search term passed to ``Entrez.esearch`` as ``term``.
        email: Address assigned to ``Entrez.email`` before the request.

    Returns:
        The object produced by ``Entrez.read`` for the search response.
        The search is issued with ``usehistory='y'``; the rest of this
        script reads its ``'IdList'``, ``'WebEnv'`` and ``'QueryKey'``
        entries.

    Raises:
        IOError: If reading/parsing the search response fails.
    """
    Entrez.email = email
    search = Entrez.esearch(db='pubmed', term=query, usehistory='y')
    try:
        # Parse inside the try block: this is the call that can fail, so
        # the IOError conversion and the close() below actually cover it.
        return Entrez.read(search)
    except Exception as e:
        raise IOError(str(e))
    finally:
        # Always release the response handle, on success or failure.
        search.close()
def fetch_rec(rec_id, entrez_handle):
    """Fetch one PubMed record as Medline-formatted text.

    Args:
        rec_id: Record identifier passed to ``Entrez.efetch`` as ``id``.
        entrez_handle: Parsed search result; its ``'WebEnv'`` and
            ``'QueryKey'`` entries are forwarded to ``Entrez.efetch``.

    Returns:
        Whatever ``read()`` on the efetch handle returns (the record
        requested with ``rettype='Medline', retmode='text'``).
    """
    fetch_handle = Entrez.efetch(db='pubmed', id=rec_id,
                                 rettype='Medline', retmode='text',
                                 webenv=entrez_handle['WebEnv'],
                                 query_key=entrez_handle['QueryKey'])
    try:
        return fetch_handle.read()
    finally:
        # The original never closed this handle; release it even if
        # read() raises.
        fetch_handle.close()
def main(query, email):
    """Print the abstract of every record matching *query*.

    Searches with ``search_medline``, fetches each id in the result's
    ``'IdList'`` with ``fetch_rec``, parses the text with
    ``Medline.read`` and prints the ``'AB'`` field when it is present.

    Args:
        query: Search term handed to ``search_medline``.
        email: E-mail address handed to ``search_medline``.
    """
    search_result = search_medline(query, email)
    for pmid in search_result['IdList']:
        medline_text = fetch_rec(pmid, search_result)
        # Medline.read is given a file-like object, so wrap the text.
        parsed = Medline.read(StringIO(medline_text))
        if 'AB' not in parsed:
            # No 'AB' field in this record; nothing to print.
            continue
        print(parsed['AB'])
if __name__ == '__main__':
    # Example run: print the abstracts found for one author/title query.
    query = "Tischner[AU] Cortex-specific down-regulation"
    email = "my-email@provider.sth"
    main(query=query, email=email)
它会打印出搜索到的摘要;只要更改 query
参数,此脚本即可用于任何搜索。对于大量记录有更高效的提取方法,但对于小型搜索来说,这样就够了。
我不太确定在这种情况下“正确”的做法是什么(我不熟悉 biopython),但是你得到 KeyError
的原因是 'Abstract'
键嵌套在 'MedlineCitation'
字典的 'Article' 条目之中:
record[0]['MedlineCitation']['Article']['Abstract']
应该给你这样的东西:
{'AbstractText': ['--Unnecessarily long abstract removed --']}
我是 python 的新手,想使用 bio 包中的 entrez 系统从 pubmed 中提取摘要。
我通过 esearch 获得了我的 UID(存储在 my_list_ges 中),
并且可以使用 efetch 下载一个条目。
然而,现在结果是一个字典列表,条目看起来像字典,但我无法访问它们:
# Set the e-mail address attribute on the Entrez module.
Entrez.email= "my-email@provider.sth"
# Fetch the first UID in my_list_ges from the pubmed database as XML.
handle=Entrez.efetch(db="pubmed",id=my_list_ges[0],rettype="null",retmode="xml")
record = Entrez.read(handle)
# record is a list here, so indexing it with a string raises the
# TypeError quoted below.
abstract=record["Abstract"]
# Not reached when the line above raises.
handle.close()
结果是类型错误:
TypeError: list indices must be integers, not str
我在尝试从第一条记录中检索 'Abstract'
时得到 KeyError
:
>>> record[0]["Abstract"]
KeyError: 'Abstract'
这很奇怪,因为在 esearch 的结果中,我可以通过字典键轻松访问我的 UID。
record[0] 的结构是:
{u'MedlineCitation': DictElement({
u'OtherID': [],
u'OtherAbstract': [],
u'CitationSubset': ['IM'],
u'KeywordList': [],
u'DateCreated': {u'Month': '03', u'Day': '17', u'Year': '2016'},
u'SpaceFlightMission': [],
u'GeneralNote': [],
u'Article':
DictElement({
u'ArticleDate': [
DictElement({u'Month': '03', u'Day': '16', u'Year': '2016'}, attributes={u'DateType': u'Electronic'})],
u'Pagination': {u'MedlinePgn': 'e0151666'},
u'AuthorList': ListElement([
DictElement({
u'LastName': "O'Neill",
u'Initials': 'KE',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Kathy E'
}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Bredenkamp',
u'Initials': 'N', u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Nicholas'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Tischner',
u'Initials': 'C',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Christin'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Vaidya',
u'Initials': 'HJ',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Harsh J'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Stenhouse',
u'Initials': 'FH',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}], u'ForeName': 'Frances H'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Peddie',
u'Initials': 'CD',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'C Diana'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Nowell',
u'Initials': 'CS',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Craig S'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Gaskell',
u'Initials': 'T',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}],
u'ForeName': 'Terri'}, attributes={u'ValidYN': u'Y'}),
DictElement({
u'LastName': 'Blackburn',
u'Initials': 'CC',
u'Identifier': [],
u'AffiliationInfo': [{
u'Affiliation': 'MRC Centre for Regenerative Medicine, Institute for Stem Cell Research, School of Biological Sciences, University of Edinburgh, SCRM Building, 5 Little France Drive, Edinburgh, EH16 4UU, UK.',
u'Identifier': []}], u'ForeName': 'C Clare'}, attributes={u'ValidYN': u'Y'})],
attributes={u'Type': u'authors', u'CompleteYN': u'Y'}),
u'Language': ['eng'],
u'PublicationTypeList': [StringElement('Journal Article', attributes={u'UI': u'D016428'})],
u'Journal': {
u'ISSN': StringElement('1932-6203', attributes={u'IssnType': u'Electronic'}),
u'ISOAbbreviation': 'PLoS ONE',
u'JournalIssue': DictElement({
u'Volume': '11',
u'Issue': '3',
u'PubDate': {u'Year': '2016'}}, attributes={u'CitedMedium': u'Internet'}),
u'Title': 'PloS one'},
u'ArticleTitle': 'Foxn1 Is Dynamically Regulated in Thymic Epithelial Cells during Embryogenesis and at the Onset of Thymic Involution.',
u'ELocationID': [StringElement('10.1371/journal.pone.0151666', attributes={u'ValidYN': u'Y', u'EIdType': u'doi'})],
u'Abstract': {u'AbstractText': ['--Unnecessarily long abstract removed --']}}, attributes={u'PubModel': u'Electronic-eCollection'}),
u'PMID': StringElement('26983083', attributes={u'Version': u'1'}),
u'MedlineJournalInfo': {
u'MedlineTA': 'PLoS One',
u'Country': 'United States',
u'NlmUniqueID': '101285081',
u'ISSNLinking': '1932-6203'}}, attributes={u'Owner': u'NLM', u'Status': u'In-Data-Review'}),
u'PubmedData': {
u'ArticleIdList': [
StringElement('10.1371/journal.pone.0151666', attributes={u'IdType': u'doi'}),
StringElement('PONE-D-15-47173', attributes={u'IdType': u'pii'}),
StringElement('26983083', attributes={u'IdType': u'pubmed'})],
u'PublicationStatus': 'epublish',
u'History': [
DictElement({u'Month': '', u'Day': '', u'Year': '2016'}, attributes={u'PubStatus': u'ecollection'}),
DictElement({u'Month': '10', u'Day': '28', u'Year': '2015'}, attributes={u'PubStatus': u'received'}),
DictElement({u'Month': '3', u'Day': '2', u'Year': '2016'}, attributes={u'PubStatus': u'accepted'}),
DictElement({u'Month': '3', u'Day': '16', u'Year': '2016'}, attributes={u'PubStatus': u'epublish'}),
DictElement({u'Minute': '0', u'Month': '3', u'Day': '17', u'Hour': '6', u'Year': '2016'}, attributes={u'PubStatus': u'entrez'}),
DictElement({u'Minute': '0', u'Month': '3', u'Day': '18', u'Hour': '6', u'Year': '2016'}, attributes={u'PubStatus': u'pubmed'}),
DictElement({u'Minute': '0', u'Month': '3', u'Day': '18', u'Hour': '6', u'Year': '2016'}, attributes={u'PubStatus': u'medline'})]}
}
我发现获取 Medline 格式的记录再对其进行解析要容易得多。下面给出针对相关查询 query = "Tischner[AU] Cortex-specific down-regulation" 的完整可运行代码。
下面代码的关键点是 fetch_rec()
函数使用 rettype='Medline', retmode='text',
然后使用 BioPython 的 Medline 模块解析返回的记录。
from StringIO import StringIO
from Bio import Entrez, Medline
def search_medline(query, email):
    """Run a PubMed ESearch for *query* and return the parsed result.

    Args:
        query: Search term passed to ``Entrez.esearch`` as ``term``.
        email: Address assigned to ``Entrez.email`` before the request.

    Returns:
        The object produced by ``Entrez.read`` for the search response.
        The search is issued with ``usehistory='y'``; the rest of this
        script reads its ``'IdList'``, ``'WebEnv'`` and ``'QueryKey'``
        entries.

    Raises:
        IOError: If reading/parsing the search response fails.
    """
    Entrez.email = email
    search = Entrez.esearch(db='pubmed', term=query, usehistory='y')
    try:
        # Parse inside the try block: this is the call that can fail, so
        # the IOError conversion and the close() below actually cover it.
        return Entrez.read(search)
    except Exception as e:
        raise IOError(str(e))
    finally:
        # Always release the response handle, on success or failure.
        search.close()
def fetch_rec(rec_id, entrez_handle):
    """Fetch one PubMed record as Medline-formatted text.

    Args:
        rec_id: Record identifier passed to ``Entrez.efetch`` as ``id``.
        entrez_handle: Parsed search result; its ``'WebEnv'`` and
            ``'QueryKey'`` entries are forwarded to ``Entrez.efetch``.

    Returns:
        Whatever ``read()`` on the efetch handle returns (the record
        requested with ``rettype='Medline', retmode='text'``).
    """
    fetch_handle = Entrez.efetch(db='pubmed', id=rec_id,
                                 rettype='Medline', retmode='text',
                                 webenv=entrez_handle['WebEnv'],
                                 query_key=entrez_handle['QueryKey'])
    try:
        return fetch_handle.read()
    finally:
        # The original never closed this handle; release it even if
        # read() raises.
        fetch_handle.close()
def main(query, email):
    """Print the abstract of every record matching *query*.

    Searches with ``search_medline``, fetches each id in the result's
    ``'IdList'`` with ``fetch_rec``, parses the text with
    ``Medline.read`` and prints the ``'AB'`` field when it is present.

    Args:
        query: Search term handed to ``search_medline``.
        email: E-mail address handed to ``search_medline``.
    """
    search_result = search_medline(query, email)
    for pmid in search_result['IdList']:
        medline_text = fetch_rec(pmid, search_result)
        # Medline.read is given a file-like object, so wrap the text.
        parsed = Medline.read(StringIO(medline_text))
        if 'AB' not in parsed:
            # No 'AB' field in this record; nothing to print.
            continue
        print(parsed['AB'])
if __name__ == '__main__':
    # Example run: print the abstracts found for one author/title query.
    query = "Tischner[AU] Cortex-specific down-regulation"
    email = "my-email@provider.sth"
    main(query=query, email=email)
它会打印出搜索到的摘要;只要更改 query
参数,此脚本即可用于任何搜索。对于大量记录有更高效的提取方法,但对于小型搜索来说,这样就够了。
我不太确定在这种情况下“正确”的做法是什么(我不熟悉 biopython),但是你得到 KeyError
的原因是 'Abstract'
键嵌套在 'MedlineCitation'
字典的 'Article' 条目之中:
record[0]['MedlineCitation']['Article']['Abstract']
应该给你这样的东西:
{'AbstractText': ['--Unnecessarily long abstract removed --']}