Vivino - 用 R 抓取数据
Vivino - Scraping with R
我想从 Vivino 抓取有关葡萄酒的基本数据。我以前从未做过网页抓取,但根据 Datacamp 上的一些教程和讲座,我尝试用 rvest 库写了一些基本代码。
但是,它似乎不起作用,返回的结果为空(零个值)。
谁能告诉我问题出在哪里?是代码完全错了、我应该换用其他方法,还是我只是遗漏了某些东西、某个地方做错了?
预先感谢您的任何回答!
library(rvest)
library(dplyr)
# Vivino "explore" page whose wine listing we want to scrape.
url <- "https://www.vivino.com/explore?e=eJwNybEOQDAQBuC3ubkG4z-abMQkIqdO00RbuTbF2_OtX1A0FHyEocAPWmPIvhh7suimga5_3YHK6qXwSWmDcvHR5ZWrKDuhhF2ypbvMC5oP96QajA%3D%3D&cart_item_source=nav-explore"

# Download the page and parse the HTML returned by the server.
web <- read_html(url)

# Winery: text of every node carrying the winery title class.
winery_data <- web %>%
  html_nodes(".vintageTitle__winery--2YoIr") %>%
  html_text()
head(winery_data)

# Wine name: text of every node carrying the wine title class.
wine_name <- web %>%
  html_nodes(".vintageTitle__wine--U7t9G") %>%
  html_text()

# Country: a location anchor that directly follows another location anchor.
wine_country <- web %>%
  html_nodes(".vintageLocation__anchor--T7J3k+ .vintageLocation__anchor--T7J3k") %>%
  html_text()

# Region: a location anchor that directly follows a <span>.
wine_region <- web %>%
  html_nodes("span+ .vintageLocation__anchor--T7J3k") %>%
  html_text()

# Average rating value shown for each wine.
wine_rating <- web %>%
  html_nodes(".vivinoRating__averageValue--3Navj") %>%
  html_text()

# Caption text shown next to each rating.
n_ratings <- web %>%
  html_nodes(".vivinoRating__caption--3tZeS") %>%
  html_text()
页面是动态加载的,这就是单独使用 rvest 不起作用的原因;你还需要使用 RSelenium。
假设我使用 Firefox,下面的代码应该可以工作:
# Drive a real Firefox browser with RSelenium so that the page's JavaScript
# runs and the dynamically loaded content ends up in the page source.
# Expects `url` to already hold the address of the page to scrape.
rD <- RSelenium::rsDriver(browser = "firefox", port = 4546L, verbose = FALSE)
remDr <- rD[["client"]]
remDr$navigate(url)

# Scroll down a couple of times to reach the bottom of the page
# so that additional data load dynamically with each scroll.
# Here we scroll 4 times, but you may need many more than that.
for (i in 1:4) {
  remDr$executeScript(paste("scroll(0,", i * 10000, ");"))
  # Pause so the content requested by the scroll has time to load.
  Sys.sleep(3)
}

# Get the rendered page source (returned as a list) and parse its first
# element so that rvest can query it.
web <- remDr$getPageSource()
web <- xml2::read_html(web[[1]])

# Close the browser session and stop the Selenium server.
remDr$close()
gc()
rD$server$stop()

# `taskkill` exists only on Windows, so run it only there. It force-kills
# every process named java.exe (presumably to remove a leftover Selenium
# server); be aware that this also hits unrelated Java programs.
if (.Platform$OS.type == "windows") {
  system("taskkill /im java.exe /f", intern = FALSE, ignore.stdout = FALSE)
}
# With the rendered page parsed into `web`, extract the fields with rvest.

# Winery: text of every node carrying the winery title class.
winery_data <- web %>%
  html_nodes(".vintageTitle__winery--2YoIr") %>%
  html_text()
head(winery_data)

# Wine name: text of every node carrying the wine title class.
wine_name <- web %>%
  html_nodes(".vintageTitle__wine--U7t9G") %>%
  html_text()

# Country: a location anchor that directly follows another location anchor.
wine_country <- web %>%
  html_nodes(".vintageLocation__anchor--T7J3k+ .vintageLocation__anchor--T7J3k") %>%
  html_text()

# Region: a location anchor that directly follows a <span>.
wine_region <- web %>%
  html_nodes("span+ .vintageLocation__anchor--T7J3k") %>%
  html_text()

# Average rating value shown for each wine.
wine_rating <- web %>%
  html_nodes(".vivinoRating__averageValue--3Navj") %>%
  html_text()

# Caption text shown next to each rating.
n_ratings <- web %>%
  html_nodes(".vivinoRating__caption--3tZeS") %>%
  html_text()
我想从 Vivino 抓取有关葡萄酒的基本数据。我以前从未做过网页抓取,但根据 Datacamp 上的一些教程和讲座,我尝试用 rvest 库写了一些基本代码。 但是,它似乎不起作用,返回的结果为空(零个值)。 谁能告诉我问题出在哪里?是代码完全错了、我应该换用其他方法,还是我只是遗漏了某些东西、某个地方做错了? 预先感谢您的任何回答!
library(rvest)
library(dplyr)
# Vivino "explore" page whose wine listing we want to scrape.
url <- "https://www.vivino.com/explore?e=eJwNybEOQDAQBuC3ubkG4z-abMQkIqdO00RbuTbF2_OtX1A0FHyEocAPWmPIvhh7suimga5_3YHK6qXwSWmDcvHR5ZWrKDuhhF2ypbvMC5oP96QajA%3D%3D&cart_item_source=nav-explore"

# Download the page and parse the HTML returned by the server.
web <- read_html(url)

# Winery: text of every node carrying the winery title class.
winery_data <- web %>%
  html_nodes(".vintageTitle__winery--2YoIr") %>%
  html_text()
head(winery_data)

# Wine name: text of every node carrying the wine title class.
wine_name <- web %>%
  html_nodes(".vintageTitle__wine--U7t9G") %>%
  html_text()

# Country: a location anchor that directly follows another location anchor.
wine_country <- web %>%
  html_nodes(".vintageLocation__anchor--T7J3k+ .vintageLocation__anchor--T7J3k") %>%
  html_text()

# Region: a location anchor that directly follows a <span>.
wine_region <- web %>%
  html_nodes("span+ .vintageLocation__anchor--T7J3k") %>%
  html_text()

# Average rating value shown for each wine.
wine_rating <- web %>%
  html_nodes(".vivinoRating__averageValue--3Navj") %>%
  html_text()

# Caption text shown next to each rating.
n_ratings <- web %>%
  html_nodes(".vivinoRating__caption--3tZeS") %>%
  html_text()
页面是动态加载的,这就是单独使用 rvest 不起作用的原因;你还需要使用 RSelenium。
假设我使用 Firefox,下面的代码应该可以工作:
# Drive a real Firefox browser with RSelenium so that the page's JavaScript
# runs and the dynamically loaded content ends up in the page source.
# Expects `url` to already hold the address of the page to scrape.
rD <- RSelenium::rsDriver(browser = "firefox", port = 4546L, verbose = FALSE)
remDr <- rD[["client"]]
remDr$navigate(url)

# Scroll down a couple of times to reach the bottom of the page
# so that additional data load dynamically with each scroll.
# Here we scroll 4 times, but you may need many more than that.
for (i in 1:4) {
  remDr$executeScript(paste("scroll(0,", i * 10000, ");"))
  # Pause so the content requested by the scroll has time to load.
  Sys.sleep(3)
}

# Get the rendered page source (returned as a list) and parse its first
# element so that rvest can query it.
web <- remDr$getPageSource()
web <- xml2::read_html(web[[1]])

# Close the browser session and stop the Selenium server.
remDr$close()
gc()
rD$server$stop()

# `taskkill` exists only on Windows, so run it only there. It force-kills
# every process named java.exe (presumably to remove a leftover Selenium
# server); be aware that this also hits unrelated Java programs.
if (.Platform$OS.type == "windows") {
  system("taskkill /im java.exe /f", intern = FALSE, ignore.stdout = FALSE)
}
# With the rendered page parsed into `web`, extract the fields with rvest.

# Winery: text of every node carrying the winery title class.
winery_data <- web %>%
  html_nodes(".vintageTitle__winery--2YoIr") %>%
  html_text()
head(winery_data)

# Wine name: text of every node carrying the wine title class.
wine_name <- web %>%
  html_nodes(".vintageTitle__wine--U7t9G") %>%
  html_text()

# Country: a location anchor that directly follows another location anchor.
wine_country <- web %>%
  html_nodes(".vintageLocation__anchor--T7J3k+ .vintageLocation__anchor--T7J3k") %>%
  html_text()

# Region: a location anchor that directly follows a <span>.
wine_region <- web %>%
  html_nodes("span+ .vintageLocation__anchor--T7J3k") %>%
  html_text()

# Average rating value shown for each wine.
wine_rating <- web %>%
  html_nodes(".vivinoRating__averageValue--3Navj") %>%
  html_text()

# Caption text shown next to each rating.
n_ratings <- web %>%
  html_nodes(".vivinoRating__caption--3tZeS") %>%
  html_text()