使用 "rvest" 包进行网络抓取,如何遍历一堆县级 FIPS 代码?

Web-scraping using "rvest" package, how do I loop over a bunch of county FIPS codes?

I'm trying to get farm subsidy data from a website. I've figured out how to scrape what I'm looking for, and now I'm trying to loop over all the counties in one state (CO) to get the subsidy data by county and year. I'd be fine with either (a) having a separate .csv for each county after the loop runs, or (b) compiling them all into one data frame and then saving that as a .csv.

Below is an example of the scrape done for just one county. Now I'd like to write a loop that, for the moment, iterates over FIPS codes 08003, 08005, 08007, 08009, and 08011 (I can extend it to the rest of CO's counties later).

# Starting with Adams County
library(rvest)
library(dplyr)
library(tidyr)


link = "https://farm.ewg.org/regionsummary.php?fips=08001"
page = read_html(link)

year = page %>% html_nodes("tr~ tr+ tr td:nth-child(1)") %>% html_text()
year

subs = page %>% html_nodes("td:nth-child(3)") %>% html_text()
subs

# split the "$"-delimited subsidy strings into separate columns
subsidy_data <- data.frame(subs)

subs = data.frame(do.call("rbind", strsplit(as.character(subsidy_data$subs), "$", fixed = TRUE)))

sub_data <- cbind(year, subs)
sub_data <- sub_data[-c(28),]

cons_sub_rec = page %>% html_nodes("td~ td+ td small:nth-child(1) em") %>% html_text()
cons_sub_rec <- cons_sub_rec[-c(28)]

dis_sub_rec = page %>% html_nodes("small:nth-child(3) em") %>% html_text()

comm_sub_rec = page %>% html_nodes("small:nth-child(5) em") %>% html_text()

ins_sub_rec = page %>% html_nodes("small:nth-child(7) em") %>% html_text()

sub_data <- cbind(year, subs, cons_sub_rec, dis_sub_rec, comm_sub_rec, ins_sub_rec)

sub_data$fips = 8001

write.csv(sub_data, "filepath/ewg_sub_8001.csv", row.names = TRUE)

Any suggestions are welcome!

The code you show doesn't work as posted, so below are the quickest corrections that made sense to me.

The idea here is to wrap the steps you want to repeat in a custom function whose argument is whatever you want to loop over. Then map that function over the FIPS codes with purrr::map().

library(rvest)

fips <- c("08001", "08003", "08005", "08007", "08009", "08011") # keep as character so the leading zeros survive in the url

get_fips_data <- function(x) {
  # build the county url and read the page (paste0 avoids the space that paste() would insert)
  url <- paste0("https://farm.ewg.org/regionsummary.php?fips=", x)
  site <- read_html(url)
  
  year = site %>% html_nodes("tr~ tr+ tr td:nth-child(1)") %>% html_text()

  subs = site %>% html_nodes("td:nth-child(3)") %>% html_text()

  subsidy_data <- data.frame(subs)

  subs = data.frame(do.call("rbind", strsplit(as.character(subsidy_data$subs), "$", fixed = TRUE)))

  sub_data <- cbind(year, subs)
  sub_data <- sub_data[-28,]

  cons_sub_rec = site %>% html_nodes("small:nth-child(1) em") %>% html_text()
  cons_sub_rec <- cons_sub_rec[-28]

  dis_sub_rec = site %>% html_nodes("small:nth-child(3) em") %>% html_text()

  comm_sub_rec = site %>% html_nodes("small:nth-child(5) em") %>% html_text()

  ins_sub_rec = site %>% html_nodes("small:nth-child(7) em") %>% html_text()

  cbind(year, subs, cons_sub_rec, dis_sub_rec, comm_sub_rec, ins_sub_rec)
}


fips %>% 
  purrr::set_names() %>% 
  purrr::map_dfr(get_fips_data, .id = "fips")
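To produce the .csv output described in the question, you can assign the mapped result and write it out afterwards. A minimal sketch, assuming the fips vector and get_fips_data() defined above, with placeholder file paths:

all_counties <- fips %>% 
  purrr::set_names() %>% 
  purrr::map_dfr(get_fips_data, .id = "fips")

# option (b): one combined csv for the whole state (path is a placeholder)
write.csv(all_counties, "filepath/ewg_sub_CO.csv", row.names = FALSE)

# option (a): one csv per county (paths are placeholders)
purrr::iwalk(
  split(all_counties, all_counties$fips),
  function(df, code) write.csv(df, paste0("filepath/ewg_sub_", code, ".csv"), row.names = FALSE)
)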

You can first request the geography file to collect all of the county codes and names associated with a given state; this can be done with a helper function. You can then write a second helper function that tidies the html returned from each request to a given webpage (where the url is built from a base string concatenated with the county code) into a single data frame containing the information of interest. Mapping the latter function with future_map_dfr from furrr returns a single data frame.

Notes:

The code is written in R 4.1.0+ syntax.

Thanks to @hrbrmstr for the approach to handling the br elements.
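If the 4.1.0+ syntax is unfamiliar: the two features used throughout are the native pipe |> and the backslash shorthand for anonymous functions. A minimal illustration, separate from the scraping code itself:

# |> is the base-R pipe introduced in 4.1.0; \(x) is shorthand for function(x)
c(1, 4, 9) |> sqrt()   # same as sqrt(c(1, 4, 9))
(\(x) x^2)(3)          # same as (function(x) x^2)(3)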


library(rvest)
library(tidyverse)
library(jsonlite)
#> 
#> Attaching package: 'jsonlite'
#> The following object is masked from 'package:purrr':
#> 
#>     flatten
library(janitor)
#> 
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#> 
#>     chisq.test, fisher.test
library(furrr)
#> Loading required package: future
library(xml2)

# helper: collect the county codes, names, and summary urls for a given state
state_county_codes <- \(state_code){
  read_html(sprintf("https://farm.ewg.org/ammap/maps/js/%sCounties.js", state_code)) |>
    html_text() |>
    stringr::str_match("(\\[.*\\])") |>
    {
      \(x) x[, 1]
    }() |>
    jsonlite::parse_json(simplifyVector = T) |>
    select(-d) |>
    mutate(
      id = substr(id, 2, 6),
      webpage = paste0("https://farm.ewg.org/regionsummary.php?fips=", id)
    ) |>
    tibble() -> df
}

# helper: tidy the subsidy summary table for a single county into a data frame
county_summary <- \(county_code) {
  page <- read_html(sprintf("https://farm.ewg.org/regionsummary.php?fips=%s", county_code))

  # replace each <br> with a "#" marker so the line breaks survive html_table()
  xml_find_all(page, ".//br") |> xml_add_sibling("p", "#")
  xml_find_all(page, ".//br") |> xml_remove()

  t <- page |>
    html_element(".table") |>
    html_table()

  t <- t[-c(5)] |> clean_names()

  df <- data.frame(
    id = county_code,
    year = t$year |> stringi::stri_remove_empty() |> rep(4) |>
      {
        \(x) stringr::str_replace(x, "‡", "")
      }(),
    `subsidy_category` = stringr::str_split_fixed(t$`subsidy_category`, "#", 4) |> stringi::stri_remove_empty() |> as.vector(),
    amount = stringr::str_split_fixed(t$`subsidy_category_2`, "#", 4) |> stringi::stri_remove_empty() |> as.vector(),
    number = stringr::str_split_fixed(t$`subsidy_category_3`, "#", 4) |> stringi::stri_remove_empty() |> as.vector()
  )
}

state_code <- "co"

counties <- state_county_codes(state_code)

# scrape every county in parallel and row-bind the results
no_cores <- future::availableCores() - 1
future::plan(future::multisession, workers = no_cores)
results <- future_map_dfr(counties$id, .f = county_summary)

final <- dplyr::left_join(results, counties, by = "id") |>
  select(title, everything()) |>
  rename(county = title)

Created on 2021-11-03 by the reprex package (v2.0.1)
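If you also want the output on disk, the combined table can be written out once the parallel run has finished. A short sketch, with a placeholder file path, that also releases the parallel workers:

# write the combined state-wide table to disk (path is a placeholder)
write.csv(final, "filepath/ewg_sub_co_all_counties.csv", row.names = FALSE)

# return to sequential processing once the scrape is done
future::plan(future::sequential)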