Polite Webscraping with Rvest in R
I have code that scrapes a website, but the way it does so means that after running the scrape enough times I get a 403 Forbidden error. I know there is an R package called polite that takes care of figuring out how to run the scrape within the host's requirements so that the 403 doesn't happen. I did my best to adapt it to my code, but I'm stuck. Would really appreciate some help. Here is some reproducible example code with just a few of the many links involved:
library(tidyverse)
library(httr)
library(rvest)
library(curl)
urls = c("https://www.pro-football-reference.com/teams/pit/2021.htm", "https://www.pro-
football-reference.com/teams/pit/2020.htm", "https://www.pro-football-
reference.com/teams/pit/2019.htm")
pitt <- map_dfr(
  .x = urls,
  .f = function(x) {
    Sys.sleep(2)
    cat(1)
    read_html(curl(x, handle = curl::new_handle("useragent" = "chrome"))) %>%
      html_nodes("table") %>%
      html_table(header = TRUE) %>%
      simplify() %>%
      .[[2]] %>%
      janitor::row_to_names(row_number = 1) %>%
      janitor::clean_names() %>%
      select(week, day, date, result = x_2, record = rec, opponent = opp, team_score = tm, opponent_score = opp_2) %>%
      mutate(year = str_extract(string = x, pattern = "\\d{4}"))
  }
)
This runs fine as is, but the full run covers every season from 1933 through 2021, not just the three seasons linked in the example. I'm open to scraping this responsibly in any way, whether with the polite package or any other approach that experts may be more familiar with.
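For reference, a minimal sketch of how the full set of links could be generated, assuming every season page follows the same teams/pit/<year>.htm pattern:

# Build the full URL vector for the 1933-2021 run described above
urls <- sprintf("https://www.pro-football-reference.com/teams/pit/%d.htm", 1933:2021)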
Here is my suggestion for how to use polite in this case. The code creates a grid of teams and seasons and scrapes the data politely.
The parser is taken from your example.
library(magrittr)
# Create polite session
host <- "https://www.pro-football-reference.com/"
session <- polite::bow(host, force = TRUE)
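# bow() reads the host's robots.txt and records its crawl delay; scrape()
# later waits that long between requests automatically, which is what
# prevents the 403s. force = TRUE re-fetches the rules instead of reusing
# a cached (memoised) copy.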
# Create grid of teams and seasons that shall be scraped
seasons <- 2020:2021
teams <- c("pit", "nor")
grid_to_scrape <- tidyr::expand_grid(team = teams, season = seasons)
grid_to_scrape
#> # A tibble: 4 × 2
#> team season
#> <chr> <int>
#> 1 pit 2020
#> 2 pit 2021
#> 3 nor 2020
#> 4 nor 2021
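For the full run from the question, the same grid could simply be widened; a sketch, assuming the URL pattern holds for every season (the reprex below sticks to the small grid):

seasons <- 1933:2021
teams <- c("pit", "nor") # extend with the other franchise codes as needed
grid_to_scrape <- tidyr::expand_grid(team = teams, season = seasons)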
responses <- purrr::pmap_dfr(grid_to_scrape, function(team, season, session) {
  # For some verbose status updates
  cli::cli_process_start("Scrape {.val {team}}, {.val {season}}")
  # Create the full url and scrape it politely
  full_url <- polite::nod(session, glue::glue("teams/{team}/{season}.htm"))
  scrape <- polite::scrape(full_url)
  # Parse the response; suppress the janitor warnings, which come from the
  # parser rather than from polite
  suppressWarnings({
    response <- scrape %>%
      rvest::html_elements("table") %>%
      rvest::html_table(header = TRUE) %>%
      purrr::simplify() %>%
      .[[2]] %>%
      janitor::row_to_names(row_number = 1) %>%
      janitor::clean_names() %>%
      dplyr::select(week, day, date, result = x_2, record = rec, opponent = opp, team_score = tm, opponent_score = opp_2) %>%
      dplyr::mutate(year = season, team = team)
  })
  # Update status
  cli::cli_process_done()
  # Return the parsed data
  response
}, session = session)
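Note that the session object is passed to purrr::pmap_dfr() as a constant named argument, so every iteration reuses the same polite session instead of bowing to the host again.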
#> ℹ Scrape "pit", 2020
#> ✓ Scrape "pit", 2020 ... done
#>
#> ℹ Scrape "pit", 2021
#> ✓ Scrape "pit", 2021 ... done
#>
#> ℹ Scrape "nor", 2020
#> ✓ Scrape "nor", 2020 ... done
#>
#> ℹ Scrape "nor", 2021
#> ✓ Scrape "nor", 2021 ... done
#>
responses
#> # A tibble: 77 × 10
#> week day date result record opponent team_score opponent_score year
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int>
#> 1 1 "Mon" "Septembe… "boxs… "1-0" New Yor… "26" "16" 2020
#> 2 2 "Sun" "Septembe… "boxs… "2-0" Denver … "26" "21" 2020
#> 3 3 "Sun" "Septembe… "boxs… "3-0" Houston… "28" "21" 2020
#> 4 4 "" "" "" "" Bye Week "" "" 2020
#> 5 5 "Sun" "October … "boxs… "4-0" Philade… "38" "29" 2020
#> 6 6 "Sun" "October … "boxs… "5-0" Clevela… "38" "7" 2020
#> 7 7 "Sun" "October … "boxs… "6-0" Tenness… "27" "24" 2020
#> 8 8 "Sun" "November… "boxs… "7-0" Baltimo… "28" "24" 2020
#> 9 9 "Sun" "November… "boxs… "8-0" Dallas … "24" "19" 2020
#> 10 10 "Sun" "November… "boxs… "9-0" Cincinn… "36" "10" 2020
#> # … with 67 more rows, and 1 more variable: team <chr>
Created on 2022-02-22 by the reprex package (v2.0.1)
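If you want to double-check what polite negotiated, printing the session shows the user agent and the crawl delay taken from the host's robots.txt (the exact output depends on the site):

session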