R dplyr:对 HTML(或 XML)文档、节点或节点集使用 mutate
R dplyr mutate with HTML (or XML) documents, nodes, or node sets
我有一个包含多个 HTML 链接的文件,现在我想使用 dplyr 和 rvest,针对每一行的链接获取其页面中图像的链接。
当我手动执行时,代码工作正常并返回该行的结果;但是当在一个函数中调用相同的代码时,它会失败并出现以下错误:
Error: no applicable method for 'xml_find_all' applied to an object of
class "factor"
我不知道我做错了什么。非常感谢任何帮助。为了使我的问题更清楚,我(在代码注释中)添加了一些示例行,并展示了手动执行的方法。
library(rvest)
library(dplyr)
library(httr) # contains function stop_for_status()
#get html links from file
#EXAMPLE
# "_id",url
# 560fc55c65818bee0b77ec33,http://www.seriouseats.com/recipes/2011/01/sriracha-ceviche-recipe.html
# 560fc57e65818bee0b78d8b7,http://www.seriouseats.com/recipes/2008/07/pasta-arugula-tomatoes-recipe.html
# 560fc57e65818bee0b78dcde,http://www.seriouseats.com/recipes/2007/08/cook-the-book-minty-boozy-chic.html
# 560fc57e65818bee0b78de93,http://www.seriouseats.com/recipes/2010/02/chipped-beef-gravy-on-toast-stew-on-a-shingle-recipe.html
# 560fc57e65818bee0b78dfe6,http://www.seriouseats.com/recipes/2011/05/dinner-tonight-quinoa-salad-with-lemon-cream.html
# 560fc58165818bee0b78e65e,http://www.seriouseats.com/recipes/2010/10/dinner-tonight-spicy-quinoa-salad-recipe.html
#
#load into SE
#
# Read the id/url table from the CSV file.
# NOTE(review): stringsAsFactors is not set, so on R < 4.0 the url column is read as a factor (the class named in the error above).
SE <- read.csv("~/Desktop/SeriousEats.csv")
#
#function to retrieve imgPath per URL
#using rvest
#
getImgPath <- function(x) {
# Extract the "src" attribute of the nodes matching the ".photo" selector in x.
# NOTE(review): x is handed to html_nodes() as-is; nothing here parses it into a document first.
imgPath <- x %>% html_nodes(".photo") %>% html_attr("src")
# NOTE(review): `res` is not assigned anywhere in this function.
stop_for_status(res)
return(imgPath)
}
#This works fine
#UrlPage <- read_html ("http://www.seriouseats.com/recipes/2011/01/sriracha-ceviche-recipe.html")
#imgPath <- UrlPage %>% html_nodes(".photo") %>% html_attr("src")
#
#This throws an error msg
#
# Calls getImgPath() once with the entire SE$url column rather than one URL at a time.
S <- mutate(SE, imgPath = getImgPath(SE$url))
这个有效:
library(rvest)
library(dplyr)
# SE <- data_frame(url = c(
# "http://www.seriouseats.com/recipes/2011/01/sriracha-ceviche-recipe.html",
# "http://www.seriouseats.com/recipes/2008/07/pasta-arugula-tomatoes-recipe.html"
# ))
# Load the CSV, keeping string columns as character vectors rather than factors.
SE <- read.csv(file = "/path/to/SeriousEats.csv", stringsAsFactors = FALSE)
getImgPath <- function(x) {
  # Fetch the page at URL `x` and return the `src` attribute of every element
  # matching the CSS selector ".photo".
  #
  # html_nodes() needs a parsed document, node set or single node (per the
  # rvest documentation), so the URL is parsed with read_html() first.
  # as.character() lets a factor URL (what read.csv() produces when strings
  # are converted to factors) be fetched as well.
  #
  # The stop_for_status(res) call from the question is dropped; the asker said
  # it is not necessary.
  read_html(as.character(x)) %>%
    html_nodes(".photo") %>%
    html_attr("src")
}
# Evaluate getImgPath() once per row (rowwise), then drop the row-wise
# grouping so later verbs on S operate on whole columns again.
S <- SE %>%
  rowwise() %>%
  mutate(imgPath = getImgPath(url)) %>%
  ungroup()
感谢 @Jubbles 的帮助和耐心。为了方便其他人,这里给出完整的答案。
library(rvest)
library(dplyr)
# Load the input file, keeping string columns as character vectors rather than factors.
SE <- read.csv(file = "~/Desktop/FILE.txt", stringsAsFactors = FALSE)
getImgPath <- function(x) {
  # Return the `src` attribute of every ".photo" element on the page at URL
  # `x`, or NA_character_ when the page cannot be fetched or parsed.
  #
  # tryCatch() replaces `if (try(url.exists(x)))`: url.exists() is not
  # provided by the packages loaded here (it is an RCurl function), and a
  # failing try() yields a "try-error" string that if() cannot interpret as
  # a logical.
  page <- tryCatch(
    # read_html() supersedes the deprecated html(); as.character() also
    # accepts a factor URL.
    read_html(as.character(x)),
    error = function(e) NULL
  )
  if (is.null(page)) {
    # A real missing value instead of the literal string "NA".
    return(NA_character_)
  }
  page %>%
    html_nodes(".photo") %>%
    html_attr("src")
}
# Evaluate getImgPath() once per row (rowwise), then drop the row-wise
# grouping so later verbs on SE1 operate on whole columns again.
SE1 <- SE %>%
  rowwise() %>%
  mutate(imgPath = getImgPath(url)) %>%
  ungroup()
我有一个包含多个 HTML 链接的文件,现在我想使用 dplyr 和 rvest,针对每一行的链接获取其页面中图像的链接。
当我手动执行时,代码工作正常并返回该行的结果;但是当在一个函数中调用相同的代码时,它会失败并出现以下错误:
Error: no applicable method for 'xml_find_all' applied to an object of class "factor"
我不知道我做错了什么。非常感谢任何帮助。为了使我的问题更清楚,我(在代码注释中)添加了一些示例行,并展示了手动执行的方法。
library(rvest)
library(dplyr)
library(httr) # contains function stop_for_status()
#get html links from file
#EXAMPLE
# "_id",url
# 560fc55c65818bee0b77ec33,http://www.seriouseats.com/recipes/2011/01/sriracha-ceviche-recipe.html
# 560fc57e65818bee0b78d8b7,http://www.seriouseats.com/recipes/2008/07/pasta-arugula-tomatoes-recipe.html
# 560fc57e65818bee0b78dcde,http://www.seriouseats.com/recipes/2007/08/cook-the-book-minty-boozy-chic.html
# 560fc57e65818bee0b78de93,http://www.seriouseats.com/recipes/2010/02/chipped-beef-gravy-on-toast-stew-on-a-shingle-recipe.html
# 560fc57e65818bee0b78dfe6,http://www.seriouseats.com/recipes/2011/05/dinner-tonight-quinoa-salad-with-lemon-cream.html
# 560fc58165818bee0b78e65e,http://www.seriouseats.com/recipes/2010/10/dinner-tonight-spicy-quinoa-salad-recipe.html
#
#load into SE
#
# Read the id/url table from the CSV file.
# NOTE(review): stringsAsFactors is not set, so on R < 4.0 the url column is read as a factor (the class named in the error above).
SE <- read.csv("~/Desktop/SeriousEats.csv")
#
#function to retrieve imgPath per URL
#using rvest
#
getImgPath <- function(x) {
# Extract the "src" attribute of the nodes matching the ".photo" selector in x.
# NOTE(review): x is handed to html_nodes() as-is; nothing here parses it into a document first.
imgPath <- x %>% html_nodes(".photo") %>% html_attr("src")
# NOTE(review): `res` is not assigned anywhere in this function.
stop_for_status(res)
return(imgPath)
}
#This works fine
#UrlPage <- read_html ("http://www.seriouseats.com/recipes/2011/01/sriracha-ceviche-recipe.html")
#imgPath <- UrlPage %>% html_nodes(".photo") %>% html_attr("src")
#
#This throws an error msg
#
# Calls getImgPath() once with the entire SE$url column rather than one URL at a time.
S <- mutate(SE, imgPath = getImgPath(SE$url))
这个有效:
library(rvest)
library(dplyr)
# SE <- data_frame(url = c(
# "http://www.seriouseats.com/recipes/2011/01/sriracha-ceviche-recipe.html",
# "http://www.seriouseats.com/recipes/2008/07/pasta-arugula-tomatoes-recipe.html"
# ))
# Load the CSV, keeping string columns as character vectors rather than factors.
SE <- read.csv(file = "/path/to/SeriousEats.csv", stringsAsFactors = FALSE)
getImgPath <- function(x) {
  # Fetch the page at URL `x` and return the `src` attribute of every element
  # matching the CSS selector ".photo".
  #
  # html_nodes() needs a parsed document, node set or single node (per the
  # rvest documentation), so the URL is parsed with read_html() first.
  # as.character() lets a factor URL (what read.csv() produces when strings
  # are converted to factors) be fetched as well.
  #
  # The stop_for_status(res) call from the question is dropped; the asker said
  # it is not necessary.
  read_html(as.character(x)) %>%
    html_nodes(".photo") %>%
    html_attr("src")
}
# Evaluate getImgPath() once per row (rowwise), then drop the row-wise
# grouping so later verbs on S operate on whole columns again.
S <- SE %>%
  rowwise() %>%
  mutate(imgPath = getImgPath(url)) %>%
  ungroup()
感谢 @Jubbles 的帮助和耐心。为了方便其他人,这里给出完整的答案。
library(rvest)
library(dplyr)
# Load the input file, keeping string columns as character vectors rather than factors.
SE <- read.csv(file = "~/Desktop/FILE.txt", stringsAsFactors = FALSE)
getImgPath <- function(x) {
  # Return the `src` attribute of every ".photo" element on the page at URL
  # `x`, or NA_character_ when the page cannot be fetched or parsed.
  #
  # tryCatch() replaces `if (try(url.exists(x)))`: url.exists() is not
  # provided by the packages loaded here (it is an RCurl function), and a
  # failing try() yields a "try-error" string that if() cannot interpret as
  # a logical.
  page <- tryCatch(
    # read_html() supersedes the deprecated html(); as.character() also
    # accepts a factor URL.
    read_html(as.character(x)),
    error = function(e) NULL
  )
  if (is.null(page)) {
    # A real missing value instead of the literal string "NA".
    return(NA_character_)
  }
  page %>%
    html_nodes(".photo") %>%
    html_attr("src")
}
# Evaluate getImgPath() once per row (rowwise), then drop the row-wise
# grouping so later verbs on SE1 operate on whole columns again.
SE1 <- SE %>%
  rowwise() %>%
  mutate(imgPath = getImgPath(url)) %>%
  ungroup()