在 R 中使用 Rvest 进行礼貌的网络抓取

Polite Webscraping with Rvest in R

我有抓取网站的代码,但按这种方式运行抓取很多次之后,我会收到 403 禁止错误。我知道 R 中有一个名为 polite 的包,它能按照主机的要求安排抓取的运行方式,从而避免 403 错误。我尽力把它套用到我的代码中,但卡住了,真的很感激一些帮助。下面是可复现的示例代码,其中仅包含众多链接中的几个:

library(tidyverse)
library(httr) 
library(rvest)
library(curl)

# Season pages to scrape (the full job covers 1933-2021; three shown here).
# Each URL must be one unbroken string: in the original paste the literals
# were wrapped across lines, embedding newlines/spaces into the URLs, which
# made them invalid. Also use `<-` (not `=`) for top-level assignment.
urls <- c(
  "https://www.pro-football-reference.com/teams/pit/2021.htm",
  "https://www.pro-football-reference.com/teams/pit/2020.htm",
  "https://www.pro-football-reference.com/teams/pit/2019.htm"
)


# Scrape each season page and row-bind the parsed schedule tables into one
# data frame. NOTE(review): this hits the host directly with only a 2-second
# pause per request — the very approach that eventually triggers 403 errors
# on a long run (see the polite-based version below for the better way).
pitt <- map_dfr(
  .x = urls,
  .f = function(x) {
    Sys.sleep(2)  # minimal courtesy pause between requests
    cat(1)        # crude progress marker
    read_html(
      curl(x, handle = curl::new_handle("useragent" = "chrome"))) %>%
      html_nodes("table") %>%
      html_table(header = TRUE) %>%
      simplify() %>%
      .[[2]] %>%  # second table on the page holds the season schedule
      janitor::row_to_names(row_number = 1) %>%
      janitor::clean_names(.) %>%
      select(week, day, date, result = x_2, record = rec, opponent = opp,
             team_score = tm, opponent_score = opp_2) %>%
      # BUG FIX: the original used "\d{4}", which is an invalid escape
      # sequence in an R string and fails to parse. The regex backslash
      # must be doubled: "\\d{4}" extracts the 4-digit year from the URL.
      mutate(year = str_extract(string = x, pattern = "\\d{4}"))
  }
)

这段代码运行起来应该没问题,但完整的运行会包括 1933-2021 年的所有年份,而不仅仅是示例中提供的三年的链接。无论是使用 polite 包,还是专家们可能更熟悉的任何其他负责任的抓取方式,我都愿意采用。

这是我在这种情况下如何使用礼貌的建议。该代码创建了一个团队和赛季的网格,并礼貌地抓取数据。

解析器取自您的示例。

library(magrittr)

# Create a polite session with the host: polite::bow() reads the site's
# robots.txt and records its crawl rules (rate limits, disallowed paths)
# so later polite::scrape() calls respect them. force = TRUE refreshes any
# cached robots.txt for this host.
host <- "https://www.pro-football-reference.com/"
session <- polite::bow(host, force = TRUE)

# Build every team x season combination that shall be scraped, one row per
# page to fetch (expand_grid crosses the two vectors).
grid_to_scrape <- tidyr::expand_grid(
  team = c("pit", "nor"),
  season = 2020:2021
)
grid_to_scrape
#> # A tibble: 4 × 2
#>   team  season
#>   <chr>  <int>
#> 1 pit     2020
#> 2 pit     2021
#> 3 nor     2020
#> 4 nor     2021

# Walk the grid row by row (pmap_dfr passes each row's `team` and `season`
# as named arguments) and row-bind the parsed schedule tables. The shared
# polite session is forwarded via the extra `session` argument, so every
# request goes through the rate limiting negotiated by polite::bow().
responses <- purrr::pmap_dfr(grid_to_scrape, function(team, season, session){
  # For some verbose status updates
  cli::cli_process_start("Scrape {.val {team}}, {.val {season}}")
  # Point the session at this page's path (nod keeps the host and rules
  # from bow()), then fetch it politely — scrape() enforces the crawl delay
  full_url <- polite::nod(session, glue::glue("teams/{team}/{season}.htm"))
  scrape <- polite::scrape(full_url)
  # Parse the response, suppress Janitor warnings. This is a problem of the
  # parser (duplicate/empty header names), not of the scraping itself
  suppressWarnings({
    response <- scrape %>% 
      rvest::html_elements("table") %>% 
      rvest::html_table(header = TRUE) %>% 
      purrr::simplify() %>%
      .[[2]] %>%  # second table on the page holds the season schedule
      janitor::row_to_names(row_number = 1) %>% 
      janitor::clean_names() %>% 
      dplyr::select(week, day, date, result = x_2, record = rec, opponent = opp, team_score = tm, opponent_score = opp_2) %>% 
      dplyr::mutate(year = season, team = team)
  })
  # Update status
  cli::cli_process_done()
  # return parsed data
  response
}, session = session)
#> ℹ Scrape "pit", 2020
#> ✓ Scrape "pit", 2020 ... done
#> 
#> ℹ Scrape "pit", 2021
#> ✓ Scrape "pit", 2021 ... done
#> 
#> ℹ Scrape "nor", 2020
#> ✓ Scrape "nor", 2020 ... done
#> 
#> ℹ Scrape "nor", 2021
#> ✓ Scrape "nor", 2021 ... done
#> 

responses
#> # A tibble: 77 × 10
#>    week  day   date       result record opponent team_score opponent_score  year
#>    <chr> <chr> <chr>      <chr>  <chr>  <chr>    <chr>      <chr>          <int>
#>  1 1     "Mon" "Septembe… "boxs… "1-0"  New Yor… "26"       "16"            2020
#>  2 2     "Sun" "Septembe… "boxs… "2-0"  Denver … "26"       "21"            2020
#>  3 3     "Sun" "Septembe… "boxs… "3-0"  Houston… "28"       "21"            2020
#>  4 4     ""    ""         ""     ""     Bye Week ""         ""              2020
#>  5 5     "Sun" "October … "boxs… "4-0"  Philade… "38"       "29"            2020
#>  6 6     "Sun" "October … "boxs… "5-0"  Clevela… "38"       "7"             2020
#>  7 7     "Sun" "October … "boxs… "6-0"  Tenness… "27"       "24"            2020
#>  8 8     "Sun" "November… "boxs… "7-0"  Baltimo… "28"       "24"            2020
#>  9 9     "Sun" "November… "boxs… "8-0"  Dallas … "24"       "19"            2020
#> 10 10    "Sun" "November… "boxs… "9-0"  Cincinn… "36"       "10"            2020
#> # … with 67 more rows, and 1 more variable: team <chr>

reprex package (v2.0.1)

于 2022-02-22 创建