Adding values from a website to a table when scraping HTML

Here is the data I need:

https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0

I have already imported the table into R:

library(tidyverse)
library(rvest)

webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")

tbls <- html_nodes(webpage, "table")

# The results table is the fifth <table> on the page.
tbls_ls <- webpage %>%
  html_nodes("table") %>%
  .[5] %>%
  html_table(fill = TRUE)

data <- as_tibble(tbls_ls[[1]])

However, there is one more thing I need to add to the table. For some of the meteorites, oxygen isotope values are available. This can be seen when clicking on the name of a meteorite, under the "Plots" section. When clicking on the plot, we get redirected to a page with the three isotope values. What I would like to do is add three columns to my table containing the corresponding isotope values for each meteorite. I tried writing code for each "Plots" section separately, but I feel there might be a more elegant solution.

You can grab the table without the isotopes, and then, if you decide you want the isotopes, mimic the POST request the page makes for that view; then left-join the two on the Name column. You will get more rows than in the left (no-isotopes) table because there are multiple Change values, but this matches what you see with the method of viewing isotopes you describe, where the isotope values in the plot come as comma-separated lists rather than being split out by row.
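To see why the row count grows, here is a minimal self-contained sketch of how left_join repeats a left-hand row once per match on the right (toy data; d18O is a made-up column name):

library(dplyr)

left  <- tibble(Name = c("Aa", "Bb"), Mass = c(10, 20))
right <- tibble(Name = c("Aa", "Aa"), d18O = c(4.1, 4.3))

left_join(left, right, by = "Name") # 3 rows: "Aa" appears twice, once per match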

I opted for a more specific CSS selector to target the particular table of interest directly, rather than indexing into a list of tables.

I use write_excel_csv to preserve the character encoding of the headers when writing out (an idea I got from @stefan).

You can remove unwanted columns from the joint_table output before writing it out (subset/select etc.), as sketched below.
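For instance, a minimal sketch (the column pattern is hypothetical; substitute whatever you want to drop):

joint_table %>% dplyr::select(-dplyr::starts_with("Change"))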



library(dplyr)
library(httr)
library(rvest)
library(readr)
library(magrittr)
library(stringr)

webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")

# Results table from the default "Normal table" view (no isotopes).
no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)

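# The form fields mirror the query-string parameters of the original URL,
# with pnt switched from "Normal table" to "Oxygen isotopes" so the server
# returns the isotope view of the same result set.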
data <- list(
  'sfor' = "names",
  'stype' = "contains",
  'country' = "All",
  'categ' = "Ungrouped achondrites",
  'page' = "0",
  'map' = "ge",
  'srt' = "name",
  'lrec' = "200",
  'pnt' = "Oxygen isotopes",
  'mblist' = "All",
  'snew' = "0",
  'sea' = "*"
)


r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)

# Parse the isotopes table out of the POST response.
isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)


joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)

write_excel_csv(x = joint_table, file = "joint.csv", col_names = TRUE, na = "")
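If you would rather end up with one row per meteorite than one per Change value, you can collapse the duplicates after the join. A minimal sketch, with the caveat that it turns every non-Name column into a comma-separated character string:

one_per_name <- joint_table %>%
  group_by(Name) %>%
  summarise(across(everything(), ~ paste(unique(na.omit(.x)), collapse = ", ")),
            .groups = "drop")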

Sample output: (screenshot of the joined table omitted)

Edit:

Adding in the additional information from the other URLs, as requested in the comments. I had to determine dynamically which table number to pick up on each record page, and handle the case where a page has no table.

library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#> 
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#> 
#>     pluck
#> The following object is masked from 'package:readr':
#> 
#>     guess_encoding
library(readr)
library(furrr)

get_table <- function(url) {
  page <- read_html(url)
  # Left-hand label column of the record page.
  test_list <- page %>%
    html_nodes("#maintable tr > .inside:nth-child(odd)") %>%
    html_text()
  # The data table sits in the row after the "Data from:" label.
  index <- match(TRUE, stringr::str_detect(test_list, "Data from:")) + 1
  table <- page %>%
    html_node(paste0("#maintable tr:nth-of-type(", index, ") table")) %>%
    html_table() %>%
    as_tibble()
  # Transpose so the field labels become column names:
  # https://www.nesono.com/node/456 ; https://stackoverflow.com/a/7970267/6241235
  temp <- set_names(data.frame(t(table[, -1]), row.names = NULL), t(table[, 1]))
  return(temp)
}
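A quick spot check of get_table on a single record page (the code= value below is a placeholder; any Link from the lookups tibble built further down will do):

# Placeholder URL; substitute a real Link from `lookups`.
one_record <- get_table("https://www.lpi.usra.edu/meteor/metbull.php?code=12345")
str(one_record)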


start_url <- "https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0"
base <- "https://www.lpi.usra.edu"
webpage <- read_html(start_url)

no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)

data <- list(
  "sfor" = "names",
  "stype" = "contains",
  "country" = "All",
  "categ" = "Ungrouped achondrites",
  "page" = "0",
  "map" = "ge",
  "srt" = "name",
  "lrec" = "200",
  "pnt" = "Oxygen isotopes",
  "mblist" = "All",
  "snew" = "0",
  "sea" = "*"
)

r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)

isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)

joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)

lookups <- webpage %>%
  html_node("#maintable") %>%
  html_nodes("td:nth-of-type(1) a") %>%
  map_df(~ c(html_text(.), html_attr(., "href")) %>%
    set_names("Name", "Link")) %>%
  mutate(Link = paste0(base, gsub("\\s+", "%20", Link)))

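# Fallback row returned by the tryCatch below when a record page has no
# data table, so future_map_dfr() still yields one row per link.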
error_df <- tibble(
  `State/Prov/County:` = NA_character_,
  `Origin or pseudonym:` = NA_character_,
  `Date:` = NA_character_,
  `Latitude:` = NA_character_,
  `Longitude:` = NA_character_,
  `Mass (g):` = NA_character_,
  `Pieces:` = NA_character_,
  `Class:` = NA_character_,
  `Shock stage:` = NA_character_,
  `Fayalite (mol%):` = NA_character_,
  `Ferrosilite (mol%):` = NA_character_,
  `Wollastonite (mol%):` = NA_character_,
  `Magnetic suscept.:` = NA_character_,
  `Classifier:` = NA_character_,
  `Type spec mass (g):` = NA_character_,
  `Type spec location:` = NA_character_,
  `Main mass:` = NA_character_,
  `Finder:` = NA_character_,
  `Comments:` = NA_character_
)

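# Scrape the individual record pages in parallel, leaving one core free.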
no_cores <- future::availableCores() - 1

future::plan(future::multisession, workers = no_cores)

df <- furrr::future_map_dfr(lookups$Link, ~ tryCatch(get_table(.x), error = function(e) error_df))

colnames(df) <- sub(":", "", colnames(df))

df2 <- df %>%
  mutate(
    `Mass (g)` = gsub(",", "", `Mass (g)`),
    across(c(`Mass (g)`, `Magnetic suscept.`), as.numeric)
  )

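# Only attach the per-record info if every link yielded exactly one row,
# so that cbind() keeps lookups and df2 aligned.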
if (nrow(df2) == nrow(no_isotopes)) {
  additional_info <- cbind(lookups, df2)
  joint_table$Name <- gsub(" \\*\\*", "", joint_table$Name)
  final_table <- dplyr::left_join(joint_table, additional_info, by = "Name", copy = FALSE)
  write_excel_csv(x = final_table, file = "joint.csv", col_names = TRUE, na = "")
}

Created on 2021-02-27 by the reprex package (v0.3.0)

N.B.

The OP had problems with the lookups variable for some reason, so here is an alternative I wrote that worked for them:

lookups <- map_df(
  webpage %>% html_node("#maintable") %>% html_nodes("td:nth-of-type(1) a"),
  ~ data.frame(
    Name = .x %>% html_text(),
    Link = paste0(base, gsub("\\s+", "%20", .x %>% html_attr("href")))
  )
) %>% as_tibble()