跨多个页面的 R 网络抓取
R web scraping across multiple pages
我正在开发一个网络抓取程序,用于搜索特定的葡萄酒,并返回该品种的当地葡萄酒列表。我遇到的问题是搜索结果分布在多个页面上。下面的代码是我正在使用的基本示例:
# Attach rvest so that read_html(), html_nodes() and html_text() are
# available; without it this snippet fails with "could not find function".
library(rvest)

# Search URL as given in the question (it carries no page parameter).
url2 <- "http://www.winemag.com/?s=washington+merlot&search_type=reviews"

# Download and parse the search-results page.
htmlpage2 <- read_html(url2)

# Select the nodes matching ".review-listing .title" and extract their text.
names2 <- html_nodes(htmlpage2, ".review-listing .title")
Wines2 <- html_text(names2)
对于这个特定的搜索,共有 39 页结果。我知道第 2 页的 URL 会变为 http://www.winemag.com/?s=washington%20merlot&drink_type=wine&page=2 ,但是否有一种简单的方法,可以让代码循环遍历所有返回的页面,并将全部 39 页的结果汇总到一个列表中?我知道可以手动列出所有 URL,但这似乎有点小题大做。
您可以对一个 URL 向量使用 lapply;这个向量可以通过把基础 URL 与页码序列拼接(paste0)起来生成:
library(rvest)
# Fetch each of the 39 result pages and pull out the review titles.
# `wines` is a list with one element per page; each element is the character
# vector returned by html_text() for that page.
wines <- lapply(
  paste0("http://www.winemag.com/?s=washington%20merlot&drink_type=wine&page=", 1:39),
  function(page_url) {
    page <- read_html(page_url)
    title_nodes <- html_nodes(page, ".review-listing .title")
    html_text(title_nodes)
  }
)
结果将在列表中返回,每个页面都有一个元素。
如果您希望把所有信息整理成一个 data.frame,也可以用 purrr::map_df() 执行类似的操作:
library(rvest)
library(purrr)
# sprintf() template for a result page; %d is replaced by the page number.
url_base <- "http://www.winemag.com/?s=washington merlot&drink_type=wine&page=%d"

# Scrape result pages 1..39 and row-bind the per-page data frames into `wines`.
wines <- map_df(1:39, function(i) {
  # simple but effective progress indicator
  cat(".")

  # url_base contains a literal space, so percent-encode the URL before
  # requesting it.
  pg <- read_html(utils::URLencode(sprintf(url_base, i)))

  # NOTE(review): data.frame() needs every selector to return a compatible
  # number of nodes on a page -- confirm against the live markup.
  data.frame(
    wine = html_text(html_nodes(pg, ".review-listing .title")),
    excerpt = html_text(html_nodes(pg, "div.excerpt")),
    rating = gsub(" Points", "", html_text(html_nodes(pg, "span.rating"))),
    appellation = html_text(html_nodes(pg, "span.appellation")),
    # fixed = TRUE strips a literal "$". Writing the pattern as "\$" is an
    # unrecognized escape in an R string and stops the script from parsing.
    price = gsub("$", "", html_text(html_nodes(pg, "span.price")), fixed = TRUE),
    stringsAsFactors = FALSE
  )
})

dplyr::glimpse(wines)
## Observations: 1,170
## Variables: 5
## $ wine (chr) "Charles Smith 2012 Royal City Syrah (Columbia Valley (WA)...
## $ excerpt (chr) "Green olive, green stem and fresh herb aromas are at the ...
## $ rating (chr) "96", "95", "94", "93", "93", "93", "93", "93", "93", "93"...
## $ appellation (chr) "Columbia Valley", "Columbia Valley", "Columbia Valley", "...
## $ price (chr) "140", "70", "70", "20", "70", "40", "135", "50", "60", "3...
我正在开发一个网络抓取程序,用于搜索特定的葡萄酒,并返回该品种的当地葡萄酒列表。我遇到的问题是搜索结果分布在多个页面上。下面的代码是我正在使用的基本示例:
# Attach rvest so that read_html(), html_nodes() and html_text() are
# available; without it this snippet fails with "could not find function".
library(rvest)

# Search URL as given in the question (it carries no page parameter).
url2 <- "http://www.winemag.com/?s=washington+merlot&search_type=reviews"

# Download and parse the search-results page.
htmlpage2 <- read_html(url2)

# Select the nodes matching ".review-listing .title" and extract their text.
names2 <- html_nodes(htmlpage2, ".review-listing .title")
Wines2 <- html_text(names2)
对于这个特定的搜索,共有 39 页结果。我知道第 2 页的 URL 会变为 http://www.winemag.com/?s=washington%20merlot&drink_type=wine&page=2 ,但是否有一种简单的方法,可以让代码循环遍历所有返回的页面,并将全部 39 页的结果汇总到一个列表中?我知道可以手动列出所有 URL,但这似乎有点小题大做。
您可以对一个 URL 向量使用 lapply;这个向量可以通过把基础 URL 与页码序列拼接(paste0)起来生成:
library(rvest)
# Fetch each of the 39 result pages and pull out the review titles.
# `wines` is a list with one element per page; each element is the character
# vector returned by html_text() for that page.
wines <- lapply(
  paste0("http://www.winemag.com/?s=washington%20merlot&drink_type=wine&page=", 1:39),
  function(page_url) {
    page <- read_html(page_url)
    title_nodes <- html_nodes(page, ".review-listing .title")
    html_text(title_nodes)
  }
)
结果将在列表中返回,每个页面都有一个元素。
如果您希望把所有信息整理成一个 data.frame,也可以用 purrr::map_df() 执行类似的操作:
library(rvest)
library(purrr)
# sprintf() template for a result page; %d is replaced by the page number.
url_base <- "http://www.winemag.com/?s=washington merlot&drink_type=wine&page=%d"

# Scrape result pages 1..39 and row-bind the per-page data frames into `wines`.
wines <- map_df(1:39, function(i) {
  # simple but effective progress indicator
  cat(".")

  # url_base contains a literal space, so percent-encode the URL before
  # requesting it.
  pg <- read_html(utils::URLencode(sprintf(url_base, i)))

  # NOTE(review): data.frame() needs every selector to return a compatible
  # number of nodes on a page -- confirm against the live markup.
  data.frame(
    wine = html_text(html_nodes(pg, ".review-listing .title")),
    excerpt = html_text(html_nodes(pg, "div.excerpt")),
    rating = gsub(" Points", "", html_text(html_nodes(pg, "span.rating"))),
    appellation = html_text(html_nodes(pg, "span.appellation")),
    # fixed = TRUE strips a literal "$". Writing the pattern as "\$" is an
    # unrecognized escape in an R string and stops the script from parsing.
    price = gsub("$", "", html_text(html_nodes(pg, "span.price")), fixed = TRUE),
    stringsAsFactors = FALSE
  )
})

dplyr::glimpse(wines)
## Observations: 1,170
## Variables: 5
## $ wine (chr) "Charles Smith 2012 Royal City Syrah (Columbia Valley (WA)...
## $ excerpt (chr) "Green olive, green stem and fresh herb aromas are at the ...
## $ rating (chr) "96", "95", "94", "93", "93", "93", "93", "93", "93", "93"...
## $ appellation (chr) "Columbia Valley", "Columbia Valley", "Columbia Valley", "...
## $ price (chr) "140", "70", "70", "20", "70", "40", "135", "50", "60", "3...