beautifulsoup 网页抓取搜索id列表
beautifulsoup web crawling search id list
我正在尝试抓取 NCBI E-utilities 的网页,想从返回的页面中提取 Id 列表,如下所示:
下面是我的代码:
import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to
    # the parser.
    response.raise_for_status()
    # Decode as UTF-8 explicitly rather than using the encoding requests
    # picks on its own.
    response.encoding = 'utf-8'
    return response.text
def get_pmid(html):
    """Print the text content of every ``body`` element found in *html*.

    Args:
        html: Markup to parse, as a string.
    """
    soup = BeautifulSoup(html, 'lxml')
    for body in soup.select('body'):
        # Call form of print: valid on Python 3, and with a single argument
        # it behaves the same on Python 2.
        print(body.get_text())
# esearch query against the pubmed database; the pieces below join into
# a single URL string.
url_ncbi = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    "?db=pubmed"
    "&term=%22D-PANTOTHENIC+ACID%22"
    "&retmax=2000"
    "&usehistory=y"
    "&field=Title/Abstracts"
)
# Download the page, then hand the markup to the parser.
html = get_html(url_ncbi)
get_pmid(html)
我想使用 select 函数获取文本,但不知道下面这一行应该怎么写:for texts in soup.select(' ')。
页面源码中多层嵌套的 class 和 id 让我感到困惑,如下所示:
要获取所有 Id 标签,可以使用 find_all() 函数:
import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to
    # the parser.
    response.raise_for_status()
    # Decode as UTF-8 explicitly rather than using the encoding requests
    # picks on its own.
    response.encoding = 'utf-8'
    return response.text
def get_pmid(html):
    """Collect the text of every ``id`` tag in *html*.

    Args:
        html: Markup to parse, as a string.

    Returns:
        A list holding the text of each ``id`` tag, in the order
        ``find_all`` returns them.
    """
    parsed = BeautifulSoup(html, 'lxml')
    return [tag.text for tag in parsed.find_all('id')]
# esearch query against the pubmed database; the pieces below join into
# a single URL string.
url_ncbi = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    "?db=pubmed"
    "&term=%22D-PANTOTHENIC+ACID%22"
    "&retmax=2000"
    "&usehistory=y"
    "&field=Title/Abstracts"
)
# Download the page, extract the ids, and show them.
html = get_html(url_ncbi)
all_ids = get_pmid(html)
print(all_ids)
输出:
['29737393', '29209902', '24632028', '23727638', '22536244', '22052867', '15371742', '12204559', '10885798', '16348362', '3096335', '3734807', '6247641', '6997858', '761345', '108510', '355840', '1003285', '4676550', '5804470', '6076800', '6076775', '6012920', '14091285']
我正在尝试抓取 NCBI E-utilities 的网页,想从返回的页面中提取 Id 列表,如下所示:
下面是我的代码:
import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to
    # the parser.
    response.raise_for_status()
    # Decode as UTF-8 explicitly rather than using the encoding requests
    # picks on its own.
    response.encoding = 'utf-8'
    return response.text
def get_pmid(html):
    """Print the text content of every ``body`` element found in *html*.

    Args:
        html: Markup to parse, as a string.
    """
    soup = BeautifulSoup(html, 'lxml')
    for body in soup.select('body'):
        # Call form of print: valid on Python 3, and with a single argument
        # it behaves the same on Python 2.
        print(body.get_text())
# esearch query against the pubmed database; the pieces below join into
# a single URL string.
url_ncbi = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    "?db=pubmed"
    "&term=%22D-PANTOTHENIC+ACID%22"
    "&retmax=2000"
    "&usehistory=y"
    "&field=Title/Abstracts"
)
# Download the page, then hand the markup to the parser.
html = get_html(url_ncbi)
get_pmid(html)
我想使用 select 函数获取文本,但不知道下面这一行应该怎么写:for texts in soup.select(' ')。
页面源码中多层嵌套的 class 和 id 让我感到困惑,如下所示:
要获取所有 Id 标签,可以使用 find_all() 函数:
import requests
from bs4 import BeautifulSoup
def get_html(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to
    # the parser.
    response.raise_for_status()
    # Decode as UTF-8 explicitly rather than using the encoding requests
    # picks on its own.
    response.encoding = 'utf-8'
    return response.text
def get_pmid(html):
    """Collect the text of every ``id`` tag in *html*.

    Args:
        html: Markup to parse, as a string.

    Returns:
        A list holding the text of each ``id`` tag, in the order
        ``find_all`` returns them.
    """
    parsed = BeautifulSoup(html, 'lxml')
    return [tag.text for tag in parsed.find_all('id')]
# esearch query against the pubmed database; the pieces below join into
# a single URL string.
url_ncbi = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    "?db=pubmed"
    "&term=%22D-PANTOTHENIC+ACID%22"
    "&retmax=2000"
    "&usehistory=y"
    "&field=Title/Abstracts"
)
# Download the page, extract the ids, and show them.
html = get_html(url_ncbi)
all_ids = get_pmid(html)
print(all_ids)
输出:
['29737393', '29209902', '24632028', '23727638', '22536244', '22052867', '15371742', '12204559', '10885798', '16348362', '3096335', '3734807', '6247641', '6997858', '761345', '108510', '355840', '1003285', '4676550', '5804470', '6076800', '6076775', '6012920', '14091285']