创建 html() 元素的列表
Create a list of html() elements
我敢肯定,问题很简单。但我不知道如何让它发挥作用。我有四个这样的网站:
require(xml2)
require(rvest)
html1 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topdomainid=2&subdomainid=6&last=0&orderby=6")
html2 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=6&last=0&orderby=6")
html3 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=7&last=0&orderby=6")
html4 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topDomainID=2&subDomainID=7&last=0&orderby=6")
htmlPages <- c(html1,html2,html3,html4)
我试图将它们全部放在一个列表中,以便在 for 循环或其他内容中轻松访问。将它们放在列表中是没有问题的。问题是稍后访问它们。我的意思是我不再从节点获取文本。
getCSSElementText <- function(htmlpage, CSSElement) {
  # Look up every node in `htmlpage` that matches the CSS selector
  # `CSSElement` and give back the text values of those nodes as a vector.
  html_text(html_nodes(htmlpage, CSSElement))
}
当我像下面这样调用时:
getCSSElementText(htmlPages[1], #properCSSTag#)
我收到这个错误:
Error in UseMethod("xml_find_all") :
no applicable method for 'xml_find_all' applied to an object of class "list"
这是我的全部代码,以防其他地方出现问题:
library(rvest)
library(xml2)
html1 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topdomainid=2&subdomainid=6&last=0&orderby=6")
html2 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=6&last=0&orderby=6")
html3 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=7&last=0&orderby=6")
html4 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topDomainID=2&subDomainID=7&last=0&orderby=6")
htmlPages <- c(html1,html2,html3,html4)
CSSElementIDs <- c("#ctl00_MainContent_divRankList a", ".staticOrderCol:nth-child(3)", ".staticOrderCol:nth-child(4)")
getCSSElementText <- function(htmlpage, CSSElement) {
  # Text extraction helper: select the nodes of `htmlpage` matching the
  # CSS selector `CSSElement`, then return the text value of each one.
  matchedNodes <- html_nodes(htmlpage, CSSElement)
  html_text(matchedNodes)
}
getCSSElementNumber <- function(htmlpage, CSSElement) {
  # Return a numeric vector built from the text of every node in `htmlpage`
  # that matches the CSS selector `CSSElement`.
  #
  # htmlpage:   a parsed page, passed straight to html_nodes().
  # CSSElement: the CSS selector string identifying the nodes to read.
  cssNodes <- html_nodes(htmlpage, CSSElement)
  cssValues <- html_text(cssNodes)
  # Drop every non-digit character before converting to numeric. The
  # backslash has to be doubled: "\D" is not a recognised escape in an R
  # string literal and makes the parser reject the whole file.
  # NOTE(review): this also strips decimal points and minus signs, so
  # "12.5" becomes 125 -- fine for integer counts, confirm for other data.
  parsedCssValues <- as.numeric(gsub("\\D", "", cssValues))
  parsedCssValues
}
addToDataFrame <- function(df, vector) {
  # Append `vector` to `df` as a column whose name is the text of the
  # expression the caller supplied for `vector` (e.g. passing `scores`
  # creates or overwrites a column called "scores").
  columnName <- deparse(substitute(vector))
  df[columnName] <- vector
  df
}
非常感谢您的宝贵时间!
当您用 c() 连接这些 html* 对象(每个对象都是长度为 2 的列表)时,它们会被展平成一个包含 8 个元素的列表:
htmlPages <- c(html1,html2,html3,html4)
str(htmlPages)
# List of 8
# $ node:<externalptr>
# $ doc :<externalptr>
# $ node:<externalptr>
# $ doc :<externalptr>
# $ node:<externalptr>
# $ doc :<externalptr>
# $ node:<externalptr>
# $ doc :<externalptr>
相反,应该用 list() 将这些 html* 对象放入一个列表中:
htmlPages <- list(html1,html2,html3,html4)
str(htmlPages)
# List of 4
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
并使用 [[ 访问其中的各个元素:
htmlPages[[1]]
# {xml_document}
# <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
# [1] <head id="Head1">\n <meta http-equiv="Content-Type" content="text/html; ...
# [2] <body onpageshow="document.forms['aspnetForm'].reset();"> \n <form ...
我敢肯定,问题很简单。但我不知道如何让它发挥作用。我有四个这样的网站:
require(xml2)
require(rvest)
html1 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topdomainid=2&subdomainid=6&last=0&orderby=6")
html2 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=6&last=0&orderby=6")
html3 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=7&last=0&orderby=6")
html4 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topDomainID=2&subDomainID=7&last=0&orderby=6")
htmlPages <- c(html1,html2,html3,html4)
我试图将它们全部放在一个列表中,以便在 for 循环或其他内容中轻松访问。将它们放在列表中是没有问题的。问题是稍后访问它们。我的意思是我不再从节点获取文本。
getCSSElementText <- function(htmlpage, CSSElement) {
  # Collect the nodes of `htmlpage` selected by the CSS selector
  # `CSSElement` and return a vector holding the text value of each node.
  selected <- html_nodes(htmlpage, CSSElement)
  textValues <- html_text(selected)
  textValues
}
当我像下面这样调用时:
getCSSElementText(htmlPages[1], #properCSSTag#)
我收到这个错误:
Error in UseMethod("xml_find_all") : no applicable method for 'xml_find_all' applied to an object of class "list"
这是我的全部代码,以防其他地方出现问题:
library(rvest)
library(xml2)
html1 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topdomainid=2&subdomainid=6&last=0&orderby=6")
html2 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=6&last=0&orderby=6")
html3 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=7&last=0&orderby=6")
html4 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topDomainID=2&subDomainID=7&last=0&orderby=6")
htmlPages <- c(html1,html2,html3,html4)
CSSElementIDs <- c("#ctl00_MainContent_divRankList a", ".staticOrderCol:nth-child(3)", ".staticOrderCol:nth-child(4)")
getCSSElementText <- function(htmlpage, CSSElement) {
  # Given a parsed page and a CSS selector, return a vector with the text
  # value of every node the selector matches.
  html_text(
    html_nodes(htmlpage, CSSElement)
  )
}
getCSSElementNumber <- function(htmlpage, CSSElement) {
  # Return a numeric vector parsed from the text of each node in `htmlpage`
  # matched by the CSS selector `CSSElement`.
  #
  # htmlpage:   a parsed page, handed to html_nodes() unchanged.
  # CSSElement: CSS selector string naming the nodes whose text is read.
  cssNodes <- html_nodes(htmlpage, CSSElement)
  cssValues <- html_text(cssNodes)
  # Remove all non-digit characters, then convert. The regex must be written
  # "\\D" -- a single-backslash "\D" is an unrecognised escape in an R
  # string literal and is a parse error.
  # NOTE(review): decimal points and minus signs are non-digits too and are
  # removed, so this is only suitable for non-negative integer text.
  digitsOnly <- gsub("\\D", "", cssValues)
  as.numeric(digitsOnly)
}
addToDataFrame <- function(df, vector) {
  # Store `vector` in `df` under a column named after the expression the
  # caller wrote for the `vector` argument, and return the updated data frame.
  newColumn <- deparse(substitute(vector))
  df[newColumn] <- vector
  df
}
非常感谢您的宝贵时间!
当您用 c() 连接这些 html* 对象(每个对象都是长度为 2 的列表)时,它们会被展平成一个包含 8 个元素的列表:
htmlPages <- c(html1,html2,html3,html4)
str(htmlPages)
# List of 8
# $ node:<externalptr>
# $ doc :<externalptr>
# $ node:<externalptr>
# $ doc :<externalptr>
# $ node:<externalptr>
# $ doc :<externalptr>
# $ node:<externalptr>
# $ doc :<externalptr>
相反,应该用 list() 将这些 html* 对象放入一个列表中:
htmlPages <- list(html1,html2,html3,html4)
str(htmlPages)
# List of 4
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
并使用 [[ 访问其中的各个元素:
htmlPages[[1]]
# {xml_document}
# <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
# [1] <head id="Head1">\n <meta http-equiv="Content-Type" content="text/html; ...
# [2] <body onpageshow="document.forms['aspnetForm'].reset();"> \n <form ...