Scrape multiple linked HTML tables in R and rvest
This article http://www.ajnr.org/content/30/7/1402.full contains four links to HTML tables that I would like to scrape with rvest.
With the CSS selector:
"#T1 a"
the first table can be reached like this:
library("rvest")
html_session("http://www.ajnr.org/content/30/7/1402.full") %>%
follow_link(css="#T1 a") %>%
html_table() %>%
View()
The CSS selector:
".table-inline li:nth-child(1) a"
selects all four HTML nodes containing the links to the four tables:
library("rvest")
html("http://www.ajnr.org/content/30/7/1402.full") %>%
html_nodes(css=".table-inline li:nth-child(1) a")
How can I loop over this list and retrieve all four tables in one go? What is the best way to do this?
You might want to do it like this:
library(XML)  # readHTMLTable() comes from the XML package

main_url <- "http://www.ajnr.org/content/30/7/1402/"
urls <- paste(main_url, c("T1.expansion", "T2.expansion", "T3.expansion", "T4.expansion"), ".html", sep = "")

tables <- list()
for (i in seq_along(urls)) {
  # readHTMLTable() returns every table on the page; keep the largest one
  total <- readHTMLTable(urls[i])
  n.rows <- unlist(lapply(total, function(t) dim(t)[1]))
  tables[[i]] <- as.data.frame(total[[which.max(n.rows)]])
}
tables
#[[1]]
# Glioma Grade Sensitivity Specificity PPV NPV
#1 II vs III 50.0% 92.9% 80.0% 76.5%
#2 II vs IV 100.0% 100.0% 100.0% 100.0%
#3 III vs IV 78.9% 87.5% 93.8% 63.6%
#[[2]]
# Glioma Grade Sensitivity Specificity PPV NPV
#1 II vs III 87.5% 71.4% 63.6% 90.9%
#2 II vs IV 100.0% 85.7% 90.5% 100.0%
#3 III vs IV 89.5% 75.0% 89.5% 75.0%
#[[3]]
# Criterion Sensitivity Specificity PPV NPV
#1 ≥1* 85.2% 92.9% 95.8% 76.5%
#2 ≥2 81.5% 100.0% 100.0% 73.7%
#[[4]]
# Criterion Sensitivity Specificity PPV NPV
#1 <1.92 96.3% 71.4% 86.7% 90.9%
#2 <2.02 92.6% 71.4% 86.2% 83.3%
#3 <2.12* 92.6% 85.7% 92.6% 85.7%
Here's one way to do it:
library(rvest)

url <- "http://www.ajnr.org/content/30/7/1402.full"
page <- read_html(url)

# First find all the urls
table_urls <- page %>%
  html_nodes(".table-inline li:nth-child(1) a") %>%
  html_attr("href") %>%
  xml2::url_absolute(url)

# Then loop over the urls, downloading & extracting the table
lapply(table_urls, . %>% read_html() %>% html_table())
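Note that html_table() called on a whole document returns a list of tables for that page, so the result above is a list of lists. If each expansion page contains exactly one table (which appears to be the case here, but is an assumption worth checking), you can pull out the single data frame directly:
# keep only the first (and presumably only) table from each page
tables <- lapply(table_urls, function(u) read_html(u) %>% html_table() %>% .[[1]])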