用 rvest 抓取一个文学图书馆
Scrape a library of literature with rvest
我正在学习 rvest。
我打算抓取我的搜索结果所在的网页。
我查看了 html_nodes() 的输出,
但其中没有我在网页上看到的内容。
我该怎么办?
下面是 'body' 节点的内容:
# Select the <body> node of the parsed page; at top level the result is printed (shown below).
webpage %>% html_node('body')
{html_node}
<body>
[1] <noscript>\n <div class="no-script-banner" id="no-script-banner">\n <div class="warning-message">\n <div class="warn ...
[2] <div class="no-session-banner" id="no-session-banner" hidden>\n <div class="warning-message">\n <div class="warning-messa ...
[3] <a class="usa-skipnav" href="#search-results">\n Skip to main page content\n </a>
[4] <div role="complementary" id="ncov-alert-from-server" class="ncbi-alerts nCoV_shutdown converted" style="display: block;" dat ...
[5] <div class="usa-overlay"></div>
[6] <header class="ncbi-header" role="banner" data-section="Header"><div class="usa-grid">\n\t\t<div class="usa-width-one-whole"> ...
[7] <div role="navigation" aria-label="access keys">\n<a id="nws_header_accesskey_0" href="https://www.ncbi.nlm.nih.gov/guide/bro ...
[8] <section data-section="Alerts"><div class="ncbi-alerts-placeholder"></div>\n</section>
[9] <a id="maincontent" aria-label="Main page content below" role="navigation"></a>
[10] <main class="search-page" id="search-page"><h1 class="usa-sr-only">Search Page</h1>\n \n \n\n\n\n<input type="hidden" n ...
[11] <div id="ncbi-footer">\n <div class="literature-footer" role="complementary" title="Links to NCBI Literature Resources"> ...
[12] <script src="https://cdn.ncbi.nlm.nih.gov/pubmed/0399d7a0-471a-4f7d-84af-66091af9d657/CACHE/js/output.293fbf76aa18.js"></script>
[13] <script src="https://cdn.ncbi.nlm.nih.gov/pubmed/0399d7a0-471a-4f7d-84af-66091af9d657/CACHE/js/output.29588445dbd9.js"></script>
[14] <script>\n ncbi.awesome.basePage.init({\n userInfo: {\n isLoggedIn: false,\n username: "",\n log ...
[15] <script type="text/javascript">\n jQuery.getScript("https://www.ncbi.nlm.nih.gov/core/alerts/alerts.js", function () {\n ...
[16] <script defer type="text/javascript" src="https://cdn.ncbi.nlm.nih.gov/core/pinger/pinger.js"> </script>
[17] <svg class="timeline-filter-gradient" xmlns="http://www.w3.org/2000/svg"><defs><lineargradient id="timeline-filter-selected-g ...
[18] <script src="https://cdn.ncbi.nlm.nih.gov/pubmed/0399d7a0-471a-4f7d-84af-66091af9d657/CACHE/js/output.714a700656e1.js"></script>
[19] <script>\n ncbi.awesome.searchPage.init({\n searchQuery: "eliminat matrix effect HPLC\u002Dms/ms",\n searchCons ...
Not
我们可以通过以下代码获取搜索结果的标题:
library(rvest)
library(dplyr)
library(stringr)
# Parse the search page, take the text of every `.docsum-title` node,
# and remove the embedded newline characters.
url %>%
  read_html() %>%
  html_nodes('.docsum-title') %>%
  html_text() %>%
  str_remove_all('\n')
[1] " HPLC-MS/MS analysis of peramivir in rat plasma: Elimination of matrix effect using the phospholipid-removal solid-phase extraction method. "
[2] " Development of matrix effect-free MISPE-UHPLC-MS/MS method for determination of lovastatin in Pu-erh tea, oyster mushroom, and red yeast rice.
以及文章的链接:
# Collect the `href` attribute of every `.docsum-title` node on the search page.
df <- url %>%
  read_html() %>%
  html_nodes('.docsum-title') %>%
  html_attr('href')

# Prepend the PubMed host to each href to build the full article URLs.
paste0('https://pubmed.ncbi.nlm.nih.gov', df)
[1] "https://pubmed.ncbi.nlm.nih.gov/28976569/" "https://pubmed.ncbi.nlm.nih.gov/28410522/" "https://pubmed.ncbi.nlm.nih.gov/27491846/"
[4] "https://pubmed.ncbi.nlm.nih.gov/31532096/" "https://pubmed.ncbi.nlm.nih.gov/31288535/" "https://pubmed.ncbi.nlm.nih.gov/29433096/"
你可以在页面中找到 .docsum-title 节点,
以及它对应的链接(link)。
我建议先确认您的搜索词拼写是否正确,以及各个词之间应该使用 AND 还是 OR,以便正确构造请求。确定这些之后,您可以使用官方提供的公共 API(public APIs)来执行查询,提取 PubMed ID,然后请求相关文档。
API 指南:https://www.ncbi.nlm.nih.gov/home/develop/api/
library(jsonlite)
library(rvest)
library(tidyverse)
#' Scrape the key fields of a single PubMed article page.
#'
#' @param link URL of a PubMed article page; passed to `read_html()`.
#' @return A data frame with the columns `link`, `id`, `title`, `authors`
#'   (the text of every `.full-name` node joined with ", ") and `abstract`.
get_data <- function(link) {
  page <- read_html(link)

  # `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned,
  # which would silently disable the trimming.
  data.frame(
    link = link,
    id = page %>%
      html_element('[title="PubMed ID"]') %>%
      html_text(trim = TRUE),
    title = page %>%
      html_element(".heading-title") %>%
      html_text(trim = TRUE),
    # Several author nodes may match, so collapse them into one string.
    authors = page %>%
      html_elements(".full-name") %>%
      html_text(trim = TRUE) %>%
      paste(collapse = ", "),
    abstract = page %>%
      html_element("#enc-abstract") %>%
      html_text2()
  )
}
# Query the NCBI E-utilities esearch endpoint (JSON response) for PubMed IDs.
r <- jsonlite::read_json(
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=200&retmode=json&term=eliminate+AND+matrix+AND+effect+AND+hplc+ms/ms&mindate=2013&maxdate=2022"
)
ids <- r$esearchresult$idlist

# Scrape the article pages only when the search returned at least one ID;
# `results` is not created otherwise.
if (length(ids) > 0) {
  links <- sprintf("https://pubmed.ncbi.nlm.nih.gov/%s", ids)
  results <- links %>% map_dfr(get_data)
}
我正在学习 rvest。
我打算抓取我的搜索结果所在的网页。
我查看了 html_nodes() 的输出,
但其中没有我在网页上看到的内容。
我该怎么办?
下面是 'body' 节点的内容:
# Select the <body> node of the parsed page; at top level the result is printed (shown below).
webpage %>% html_node('body')
{html_node}
<body>
[1] <noscript>\n <div class="no-script-banner" id="no-script-banner">\n <div class="warning-message">\n <div class="warn ...
[2] <div class="no-session-banner" id="no-session-banner" hidden>\n <div class="warning-message">\n <div class="warning-messa ...
[3] <a class="usa-skipnav" href="#search-results">\n Skip to main page content\n </a>
[4] <div role="complementary" id="ncov-alert-from-server" class="ncbi-alerts nCoV_shutdown converted" style="display: block;" dat ...
[5] <div class="usa-overlay"></div>
[6] <header class="ncbi-header" role="banner" data-section="Header"><div class="usa-grid">\n\t\t<div class="usa-width-one-whole"> ...
[7] <div role="navigation" aria-label="access keys">\n<a id="nws_header_accesskey_0" href="https://www.ncbi.nlm.nih.gov/guide/bro ...
[8] <section data-section="Alerts"><div class="ncbi-alerts-placeholder"></div>\n</section>
[9] <a id="maincontent" aria-label="Main page content below" role="navigation"></a>
[10] <main class="search-page" id="search-page"><h1 class="usa-sr-only">Search Page</h1>\n \n \n\n\n\n<input type="hidden" n ...
[11] <div id="ncbi-footer">\n <div class="literature-footer" role="complementary" title="Links to NCBI Literature Resources"> ...
[12] <script src="https://cdn.ncbi.nlm.nih.gov/pubmed/0399d7a0-471a-4f7d-84af-66091af9d657/CACHE/js/output.293fbf76aa18.js"></script>
[13] <script src="https://cdn.ncbi.nlm.nih.gov/pubmed/0399d7a0-471a-4f7d-84af-66091af9d657/CACHE/js/output.29588445dbd9.js"></script>
[14] <script>\n ncbi.awesome.basePage.init({\n userInfo: {\n isLoggedIn: false,\n username: "",\n log ...
[15] <script type="text/javascript">\n jQuery.getScript("https://www.ncbi.nlm.nih.gov/core/alerts/alerts.js", function () {\n ...
[16] <script defer type="text/javascript" src="https://cdn.ncbi.nlm.nih.gov/core/pinger/pinger.js"> </script>
[17] <svg class="timeline-filter-gradient" xmlns="http://www.w3.org/2000/svg"><defs><lineargradient id="timeline-filter-selected-g ...
[18] <script src="https://cdn.ncbi.nlm.nih.gov/pubmed/0399d7a0-471a-4f7d-84af-66091af9d657/CACHE/js/output.714a700656e1.js"></script>
[19] <script>\n ncbi.awesome.searchPage.init({\n searchQuery: "eliminat matrix effect HPLC\u002Dms/ms",\n searchCons ...
Not
我们可以通过以下代码获取搜索结果的标题:
library(rvest)
library(dplyr)
library(stringr)
# Parse the search page, take the text of every `.docsum-title` node,
# and remove the embedded newline characters.
url %>%
  read_html() %>%
  html_nodes('.docsum-title') %>%
  html_text() %>%
  str_remove_all('\n')
[1] " HPLC-MS/MS analysis of peramivir in rat plasma: Elimination of matrix effect using the phospholipid-removal solid-phase extraction method. "
[2] " Development of matrix effect-free MISPE-UHPLC-MS/MS method for determination of lovastatin in Pu-erh tea, oyster mushroom, and red yeast rice.
以及文章的链接:
df = url %>% read_html() %>% html_nodes('.docsum-title') %>% html_attr('href')
# Prepend the PubMed host to each value in `df` to build the full article URLs.
paste0('https://pubmed.ncbi.nlm.nih.gov', df)
[1] "https://pubmed.ncbi.nlm.nih.gov/28976569/" "https://pubmed.ncbi.nlm.nih.gov/28410522/" "https://pubmed.ncbi.nlm.nih.gov/27491846/"
[4] "https://pubmed.ncbi.nlm.nih.gov/31532096/" "https://pubmed.ncbi.nlm.nih.gov/31288535/" "https://pubmed.ncbi.nlm.nih.gov/29433096/"
你可以在页面中找到 .docsum-title 节点,
以及它对应的链接(link)。
我建议先确认您的搜索词拼写是否正确,以及各个词之间应该使用 AND 还是 OR,以便正确构造请求。确定这些之后,您可以使用官方提供的公共 API(public APIs)来执行查询,提取 PubMed ID,然后请求相关文档。
API 指南:https://www.ncbi.nlm.nih.gov/home/develop/api/
library(jsonlite)
library(rvest)
library(tidyverse)
#' Scrape the key fields of a single PubMed article page.
#'
#' @param link URL of a PubMed article page; passed to `read_html()`.
#' @return A data frame with the columns `link`, `id`, `title`, `authors`
#'   (the text of every `.full-name` node joined with ", ") and `abstract`.
get_data <- function(link) {
  page <- read_html(link)

  # `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned,
  # which would silently disable the trimming.
  data.frame(
    link = link,
    id = page %>%
      html_element('[title="PubMed ID"]') %>%
      html_text(trim = TRUE),
    title = page %>%
      html_element(".heading-title") %>%
      html_text(trim = TRUE),
    # Several author nodes may match, so collapse them into one string.
    authors = page %>%
      html_elements(".full-name") %>%
      html_text(trim = TRUE) %>%
      paste(collapse = ", "),
    abstract = page %>%
      html_element("#enc-abstract") %>%
      html_text2()
  )
}
# Query the NCBI E-utilities esearch endpoint (JSON response) for PubMed IDs.
r <- jsonlite::read_json(
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmax=200&retmode=json&term=eliminate+AND+matrix+AND+effect+AND+hplc+ms/ms&mindate=2013&maxdate=2022"
)
ids <- r$esearchresult$idlist

# Scrape the article pages only when the search returned at least one ID;
# `results` is not created otherwise.
if (length(ids) > 0) {
  links <- sprintf("https://pubmed.ncbi.nlm.nih.gov/%s", ids)
  results <- links %>% map_dfr(get_data)
}