遇到空白值时出现 purrr 函数式编程错误
Purrr functional programming error when hitting blank values
很抱歉,如果这是一个很基础的问题——它可能只是一个简单的错误处理问题。当其中一个变量遇到空白值时(在本例中为 'num_views' 变量),此代码会中断。有没有办法让任何空白值都返回 'NA'?如果有任何建议,我将不胜感激。
错误响应为:
错误:tibble 中的所有列都必须是向量。
列 `num_views` 是一个函数。
# Custom functions
# Build a tibble of listing fields (address, link, property type, price,
# estate agent, listing date, view count) from scraped search-result nodes.
# This is the version that raises the tibble error quoted in the question.
parse_listing <- function(listing){
# NOTE(review): the `listing` argument is never used below; every pipeline
# reads `listings`, which is not defined in this function and is assigned at
# top level by the script that calls it.
# NOTE(review): each XPath starts with `//`; in XPath that form selects from
# the document root rather than relative to the current node -- confirm that
# is the intent.
# Review content
address <- listings %>% html_nodes(xpath = '//p[@data-testid="listing-description"]') %>% html_text2()
link <- listings %>% html_nodes(xpath = '//div[@data-testid="search-result"]/div/div/a[2]') %>% html_attr('href') %>% paste("https://www.zoopla.co.uk", ., sep="")
prop_type <- listings %>% html_nodes(xpath = '//h2[@data-testid="listing-title"]') %>% html_text2()
price <- listings %>% html_nodes(xpath = '//div[@data-testid="listing-price"]/p[@size="6"]') %>% html_text2() %>% str_remove_all("[£,]")
est_agent <- listings %>% html_nodes(xpath = '//div[@data-testid="search-result"]//a/img') %>% html_attr('alt') %>% str_remove('Marketed by ')
date_listed <- listings %>% html_nodes(xpath = '//span[@data-testid="date-published"]') %>% html_text2() %>% str_remove('Listed on ') %>% dmy()
# possibly() is given the already-evaluated pipeline result instead of a
# function to wrap. possibly() returns a function, so `num_views` holds a
# function and tibble() rejects it ("column num_views is a function").
num_views <- possibly(listings %>% html_nodes(xpath = '//span[@data-testid="number-of-views"]') %>% html_text2() %>% str_remove(' views'), otherwise = NULL)
tibble(address, link, prop_type, price, est_agent, date_listed, num_views)
}
# Script
# Download one page of search results and collect its search-result nodes.
link <- 'https://www.zoopla.co.uk/for-sale/property/wd3/?page_size=25&q=wd3&radius=0&results_sort=most_popular&search_source=refine&pn=7'
page <- link %>% read_html()
listings <- html_nodes(page, xpath = '//div[@data-testid="search-result"]')
# Run the parser on every node and row-bind the resulting tibbles.
wd3p7 <- listings %>% map_dfr(parse_listing)
可以用 `tryCatch` 或 `possibly`/`safely`(来自 `purrr`)把调用包装起来,以便在出现错误时返回所需的值。
library(purrr)
library(rvest)
library(dplyr)
library(lubridate)
#' Parse one search-result node into a one-row tibble.
#'
#' Every XPath below starts with `.` so it is evaluated relative to `listing`.
#' A path starting with `//` is evaluated from the document root even when it
#' is applied to a single node, so each call would return the fields of every
#' listing on the page; mapping that over a 25-listing page is consistent with
#' a 625-row (25 x 25) result instead of 25 rows.
#'
#' `html_element()` yields exactly one (possibly missing) match per field, so a
#' blank field becomes `NA` and all columns have length one. No padding of
#' unequal-length columns is needed, which removes the call to the unexported
#' `qpcR:::cbind.na`, and `date_listed` stays a Date instead of being coerced
#' to a day count by a matrix round trip.
#'
#' @param listing A single node taken from the page's search-result node set.
#' @return A one-row tibble with columns `address`, `link`, `prop_type`,
#'   `price` (integer), `est_agent`, `date_listed` (Date) and `num_views`
#'   (integer). Fields that are absent from the listing are `NA`.
parse_listing <- function(listing) {
  # The search-result <div> is normally `listing` itself, so it is matched on
  # the descendant-or-self axis rather than only among descendants.
  result_div <- './descendant-or-self::div[@data-testid="search-result"]'

  # possibly() keeps the best-effort behaviour: if an extraction raises an
  # error, the field is NA of the right type instead of aborting the scrape.
  first_text <- possibly(
    function(xpath) html_text2(html_element(listing, xpath = xpath)),
    otherwise = NA_character_
  )
  first_attr <- possibly(
    function(xpath, name) html_attr(html_element(listing, xpath = xpath), name),
    otherwise = NA_character_
  )
  parse_listed_date <- possibly(dmy, otherwise = as.Date(NA))

  # Convert digit-only text to integer; anything else (blank, "POA", ...)
  # becomes NA so the column type is the same for every listing and the
  # per-listing tibbles can always be row-bound.
  as_whole_number <- function(x) {
    x[!grepl("^[0-9]+$", x)] <- NA_character_
    as.integer(x)
  }

  address <- first_text('.//p[@data-testid="listing-description"]')

  href <- first_attr(paste0(result_div, "/div/div/a[2]"), "href")
  # Do not glue the site prefix onto a missing href ("...co.ukNA").
  link <- if (is.na(href)) {
    NA_character_
  } else {
    paste0("https://www.zoopla.co.uk", href)
  }

  prop_type <- first_text('.//h2[@data-testid="listing-title"]')

  # stringr is called with an explicit namespace because it is not among the
  # packages attached with library() for this function.
  price <- as_whole_number(stringr::str_remove_all(
    first_text('.//div[@data-testid="listing-price"]/p[@size="6"]'),
    "[£,]"
  ))

  est_agent <- stringr::str_remove(
    first_attr(paste0(result_div, "//a/img"), "alt"),
    "Marketed by "
  )

  date_listed <- parse_listed_date(stringr::str_remove(
    first_text('.//span[@data-testid="date-published"]'),
    "Listed on "
  ))

  num_views <- as_whole_number(stringr::str_remove(
    first_text('.//span[@data-testid="number-of-views"]'),
    " views"
  ))

  tibble(address, link, prop_type, price, est_agent, date_listed, num_views)
}
-测试
# Download one page of search results and collect its search-result nodes.
link <- 'https://www.zoopla.co.uk/for-sale/property/wd3/?page_size=25&q=wd3&radius=0&results_sort=most_popular&search_source=refine&pn=7'
page <- link %>% read_html()
listings <- html_nodes(page, xpath = '//div[@data-testid="search-result"]')
# Name each node by itself so that `.id` records it in a "listing" column,
# then parse every node and row-bind the results.
wd3p7 <- listings %>%
  setNames(listings) %>%
  map_dfr(parse_listing, .id = "listing")
-输出
> wd3p7
# A tibble: 625 × 8
listing address link prop_type price est_agent date_listed num_views
<chr> <chr> <chr> <chr> <int> <chr> <int> <int>
1 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 2 bed flat … 400000 Gibbs Gillespie… 19039 40
2 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 1 bed prope… 315000 Trend & Thomas,… 19033 34
3 "<div data-testid=\"search-result\" class=\"ea… Springwell Lane, Ric… https://www.zoopla… 2 bed flat … 375000 Purplebricks, H… 19044 32
4 "<div data-testid=\"search-result\" class=\"ea… Rectory Road, Rickma… https://www.zoopla… 1 bed flat … 315000 Trend & Thomas,… 18897 30
5 "<div data-testid=\"search-result\" class=\"ea… Penn House, 30 High … https://www.zoopla… Studio for … 270000 Gibbs Gillespie… 18982 25
6 "<div data-testid=\"search-result\" class=\"ea… The Forge, Bury Lane… https://www.zoopla… 2 bed flat … 425000 Robsons, WD3 19005 25
7 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 1 bed prope… 299950 Trend & Thomas,… 18792 26
8 "<div data-testid=\"search-result\" class=\"ea… The Forge, Bury Lane… https://www.zoopla… 2 bed flat … 450000 Savills - Rickm… 18960 19
9 "<div data-testid=\"search-result\" class=\"ea… Homestead Road, Rick… https://www.zoopla… 1 bed flat … 279000 Trend & Thomas,… 18654 18
10 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 1 bed flat … 270000 Trend & Thomas,… 18463 18
# … with 615 more rows
很抱歉,如果这是一个很基础的问题——它可能只是一个简单的错误处理问题。当其中一个变量遇到空白值时(在本例中为 'num_views' 变量),此代码会中断。有没有办法让任何空白值都返回 'NA'?如果有任何建议,我将不胜感激。
错误响应为:
错误:tibble 中的所有列都必须是向量。
列 `num_views` 是一个函数。
# Custom functions
# Build a tibble of listing fields (address, link, property type, price,
# estate agent, listing date, view count) from scraped search-result nodes.
# This is the version that raises the tibble error quoted in the question.
parse_listing <- function(listing){
# NOTE(review): the `listing` argument is never used below; every pipeline
# reads `listings`, which is not defined in this function and is assigned at
# top level by the script that calls it.
# NOTE(review): each XPath starts with `//`; in XPath that form selects from
# the document root rather than relative to the current node -- confirm that
# is the intent.
# Review content
address <- listings %>% html_nodes(xpath = '//p[@data-testid="listing-description"]') %>% html_text2()
link <- listings %>% html_nodes(xpath = '//div[@data-testid="search-result"]/div/div/a[2]') %>% html_attr('href') %>% paste("https://www.zoopla.co.uk", ., sep="")
prop_type <- listings %>% html_nodes(xpath = '//h2[@data-testid="listing-title"]') %>% html_text2()
price <- listings %>% html_nodes(xpath = '//div[@data-testid="listing-price"]/p[@size="6"]') %>% html_text2() %>% str_remove_all("[£,]")
est_agent <- listings %>% html_nodes(xpath = '//div[@data-testid="search-result"]//a/img') %>% html_attr('alt') %>% str_remove('Marketed by ')
date_listed <- listings %>% html_nodes(xpath = '//span[@data-testid="date-published"]') %>% html_text2() %>% str_remove('Listed on ') %>% dmy()
# possibly() is given the already-evaluated pipeline result instead of a
# function to wrap. possibly() returns a function, so `num_views` holds a
# function and tibble() rejects it ("column num_views is a function").
num_views <- possibly(listings %>% html_nodes(xpath = '//span[@data-testid="number-of-views"]') %>% html_text2() %>% str_remove(' views'), otherwise = NULL)
tibble(address, link, prop_type, price, est_agent, date_listed, num_views)
}
# Script
# Download one page of search results and collect its search-result nodes.
link <- 'https://www.zoopla.co.uk/for-sale/property/wd3/?page_size=25&q=wd3&radius=0&results_sort=most_popular&search_source=refine&pn=7'
page <- link %>% read_html()
listings <- html_nodes(page, xpath = '//div[@data-testid="search-result"]')
# Run the parser on every node and row-bind the resulting tibbles.
wd3p7 <- listings %>% map_dfr(parse_listing)
可以用 `tryCatch` 或 `possibly`/`safely`(来自 `purrr`)把调用包装起来,以便在出现错误时返回所需的值。
library(purrr)
library(rvest)
library(dplyr)
library(lubridate)
#' Parse one search-result node into a one-row tibble.
#'
#' Every XPath below starts with `.` so it is evaluated relative to `listing`.
#' A path starting with `//` is evaluated from the document root even when it
#' is applied to a single node, so each call would return the fields of every
#' listing on the page; mapping that over a 25-listing page is consistent with
#' a 625-row (25 x 25) result instead of 25 rows.
#'
#' `html_element()` yields exactly one (possibly missing) match per field, so a
#' blank field becomes `NA` and all columns have length one. No padding of
#' unequal-length columns is needed, which removes the call to the unexported
#' `qpcR:::cbind.na`, and `date_listed` stays a Date instead of being coerced
#' to a day count by a matrix round trip.
#'
#' @param listing A single node taken from the page's search-result node set.
#' @return A one-row tibble with columns `address`, `link`, `prop_type`,
#'   `price` (integer), `est_agent`, `date_listed` (Date) and `num_views`
#'   (integer). Fields that are absent from the listing are `NA`.
parse_listing <- function(listing) {
  # The search-result <div> is normally `listing` itself, so it is matched on
  # the descendant-or-self axis rather than only among descendants.
  result_div <- './descendant-or-self::div[@data-testid="search-result"]'

  # possibly() keeps the best-effort behaviour: if an extraction raises an
  # error, the field is NA of the right type instead of aborting the scrape.
  first_text <- possibly(
    function(xpath) html_text2(html_element(listing, xpath = xpath)),
    otherwise = NA_character_
  )
  first_attr <- possibly(
    function(xpath, name) html_attr(html_element(listing, xpath = xpath), name),
    otherwise = NA_character_
  )
  parse_listed_date <- possibly(dmy, otherwise = as.Date(NA))

  # Convert digit-only text to integer; anything else (blank, "POA", ...)
  # becomes NA so the column type is the same for every listing and the
  # per-listing tibbles can always be row-bound.
  as_whole_number <- function(x) {
    x[!grepl("^[0-9]+$", x)] <- NA_character_
    as.integer(x)
  }

  address <- first_text('.//p[@data-testid="listing-description"]')

  href <- first_attr(paste0(result_div, "/div/div/a[2]"), "href")
  # Do not glue the site prefix onto a missing href ("...co.ukNA").
  link <- if (is.na(href)) {
    NA_character_
  } else {
    paste0("https://www.zoopla.co.uk", href)
  }

  prop_type <- first_text('.//h2[@data-testid="listing-title"]')

  # stringr is called with an explicit namespace because it is not among the
  # packages attached with library() for this function.
  price <- as_whole_number(stringr::str_remove_all(
    first_text('.//div[@data-testid="listing-price"]/p[@size="6"]'),
    "[£,]"
  ))

  est_agent <- stringr::str_remove(
    first_attr(paste0(result_div, "//a/img"), "alt"),
    "Marketed by "
  )

  date_listed <- parse_listed_date(stringr::str_remove(
    first_text('.//span[@data-testid="date-published"]'),
    "Listed on "
  ))

  num_views <- as_whole_number(stringr::str_remove(
    first_text('.//span[@data-testid="number-of-views"]'),
    " views"
  ))

  tibble(address, link, prop_type, price, est_agent, date_listed, num_views)
}
-测试
# Download one page of search results and collect its search-result nodes.
link <- 'https://www.zoopla.co.uk/for-sale/property/wd3/?page_size=25&q=wd3&radius=0&results_sort=most_popular&search_source=refine&pn=7'
page <- link %>% read_html()
listings <- html_nodes(page, xpath = '//div[@data-testid="search-result"]')
# Name each node by itself so that `.id` records it in a "listing" column,
# then parse every node and row-bind the results.
wd3p7 <- listings %>%
  setNames(listings) %>%
  map_dfr(parse_listing, .id = "listing")
-输出
> wd3p7
# A tibble: 625 × 8
listing address link prop_type price est_agent date_listed num_views
<chr> <chr> <chr> <chr> <int> <chr> <int> <int>
1 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 2 bed flat … 400000 Gibbs Gillespie… 19039 40
2 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 1 bed prope… 315000 Trend & Thomas,… 19033 34
3 "<div data-testid=\"search-result\" class=\"ea… Springwell Lane, Ric… https://www.zoopla… 2 bed flat … 375000 Purplebricks, H… 19044 32
4 "<div data-testid=\"search-result\" class=\"ea… Rectory Road, Rickma… https://www.zoopla… 1 bed flat … 315000 Trend & Thomas,… 18897 30
5 "<div data-testid=\"search-result\" class=\"ea… Penn House, 30 High … https://www.zoopla… Studio for … 270000 Gibbs Gillespie… 18982 25
6 "<div data-testid=\"search-result\" class=\"ea… The Forge, Bury Lane… https://www.zoopla… 2 bed flat … 425000 Robsons, WD3 19005 25
7 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 1 bed prope… 299950 Trend & Thomas,… 18792 26
8 "<div data-testid=\"search-result\" class=\"ea… The Forge, Bury Lane… https://www.zoopla… 2 bed flat … 450000 Savills - Rickm… 18960 19
9 "<div data-testid=\"search-result\" class=\"ea… Homestead Road, Rick… https://www.zoopla… 1 bed flat … 279000 Trend & Thomas,… 18654 18
10 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 1 bed flat … 270000 Trend & Thomas,… 18463 18
# … with 615 more rows