无法在 R 中使用 xpathSapply select 特定的 html table
Unable to select a specific html table using xpathSapply in R
我正在尝试从以下 link http://cepea.esalq.usp.br/frango/?page=379&Dias=15
中抓取第二个 table
我使用 XML 包尝试了以下 R 代码:
# Parse the page straight from its URL. useInternalNodes = TRUE returns an
# internal document, which is what the XPath calls below are run against.
p_frango_resfriado <- htmlTreeParse("http://cepea.esalq.usp.br/frango/?page=379&Dias=15",
useInternalNodes = TRUE,
encoding = "UTF-8")
# Extract the text of the 2nd, 3rd and 4th <td> of every row in every table
# whose width attribute is '95%'. The predicate does not distinguish between
# tables, so cells from every matching table come back in one vector.
xpathSApply(p_frango_resfriado, "//table[@width = '95%']//tr//td[2]", xmlValue)
xpathSApply(p_frango_resfriado, "//table[@width = '95%']//tr//td[3]", xmlValue)
xpathSApply(p_frango_resfriado, "//table[@width = '95%']//tr//td[4]", xmlValue)
问题是这段代码同时抓取了网页中的两个 html table,而我只想抓取第二个。我已经尝试过下面的代码,但它没有返回任何有用的结果:
# Attempt to reach only the second table through the text of its caption link.
# NOTE(review): `a[...]/table` selects a <table> that is a direct child of the
# <a> element. Other queries in this document use `a/../table`, which suggests
# the table is a sibling of the link, not a child - that would explain why
# this query matches nothing. Confirm against the page markup.
xpathSApply(p_frango_resfriado,
"//a[text() = 'Preços do frango resfriado CEPEA/ESALQ - Estado SP']/table[@width = '95%']",
xmlValue)
谁能帮我解决这个问题?我不太擅长 XPath 语言和 html.
使用 XML::xmlToDataFrame
和 XPath 查询
# httr performs the HTTP request; XML parses the downloaded HTML.
library("httr")
library("XML")
URL <- "http://cepea.esalq.usp.br/frango/?page=379&Dias=15"
# Temporary .html file that receives the raw page body.
temp <- tempfile(fileext = ".html")
# Request the page with a browser-like User-Agent header and write the
# response body to `temp` instead of keeping it in memory.
GET(url = URL, user_agent("Mozilla/5.0"), write_disk(temp))
两个 table 之间的唯一区别是 xpath 查询中使用的 table 名称
Table 1: Preços do frango congelado CEPEA/ESALQ - Estado SP
# XPath for table 1: find the <a> inside <center> whose text contains
# 'do frango congelado', step up to its parent, then select the <tr> nodes
# under table/tr/td/font.
# NOTE: a later assignment to `xpexpr` in this snippet replaces this value,
# so keep only the assignment for the table you want.
xpexpr <- "//center/a[contains(., 'do frango congelado')]/../table/tr/td/font/tr"
Table 2: Preços do frango resfriado CEPEA/ESALQ - Estado SP
# XPath for table 2: find the <a> inside <center> whose text contains
# 'do frango resfriado', step up to its parent, then select the <tr> nodes
# under table/tr/td/font.
xpexpr <- "//center/a[contains(., 'do frango resfriado')]/../table/tr/td/font/tr"
doc <- htmlParse(temp)
listofTableNodes <- getNodeSet(doc, xpexpr)
length_nodes <- length(listofTableNodes)
# Keep every matched row except the last two (presumably non-data rows on
# this page - confirm). seq_len() gives an empty index when fewer than three
# rows match, whereas 1:(n - 2) would count backwards (1:0, 1:-1) and pick
# the wrong rows.
include_indices1 <- seq_len(max(length_nodes - 2L, 0L))
# Create the data frame from the values of the selected row nodes. Both
# `getNodeSet()` and `xpathSApply()` give the same result here, so either of
# the two assignments below is sufficient.
# using `getNodeSet()`
df <- xmlToDataFrame(listofTableNodes[include_indices1], stringsAsFactors = FALSE)
# using `xpathSApply`
df <- xmlToDataFrame(xpathSApply(doc, xpexpr)[include_indices1], stringsAsFactors = FALSE)
# clean data
# Strip the stray "Â"/space characters (and any whitespace after them) from
# the date column, then parse it as day/month/year. The backslash has to be
# doubled inside an R string literal: "\s" is an unrecognized escape and
# prevents the script from parsing.
df$td <- as.Date(gsub("[Â ]\\s*", "", df$td), format = "%d/%m/%Y")
# Remove a trailing tab character from the fourth column.
df[, 4] <- gsub("\t$", "", df[, 4])
# add column names
# The header labels are the text nodes directly under the <font> element.
xpexpr <- "//center/a[contains(., 'do frango resfriado')]/../table/tr/td/font/text()"
# for Table-1
# xpexpr <- "//center/a[contains(., 'do frango congelado')]/../table/tr/td/font/text()"
listofTableNodes <- getNodeSet(doc, xpexpr)
# vapply() always returns a character vector, even when no header node matches.
colnames(df) <- c("Date", vapply(listofTableNodes, xmlValue, character(1)))
df
# Date Valor R$ Var./dia Var./mês
# 1 2016-08-17 4,37 0,46% 8,17%
# 2 2016-08-16 4,35 0,46% 7,67%
# 3 2016-08-15 4,33 0,46% 7,18%
# 4 2016-08-12 4,31 0,00% 6,68%
# 5 2016-08-11 4,31 0,70% 6,68%
# 6 2016-08-10 4,28 0,47% 5,94%
# 7 2016-08-09 4,26 -0,70% 5,45%
# 8 2016-08-08 4,29 3,87% 6,19%
# 9 2016-08-05 4,13 0,49% 2,23%
# 10 2016-08-04 4,11 0,00% 1,73%
# 11 2016-08-03 4,11 1,73% 1,73%
# 12 2016-08-02 4,04 0,00% 0,00%
# 13 2016-08-01 4,04 0,00% 0,00%
# 14 2016-07-29 4,04 0,00% -0,49%
# 15 2016-07-28 4,04 -0,25% -0,49%
注意:此网页上的值每天都会更新;代码通过 length_nodes 动态确定行数,
从而适应这些变化。
在没有 XPath 查询的情况下使用 XML::readHTMLTable
# httr performs the HTTP request; XML parses the downloaded HTML.
library("httr")
library("XML")
URL <- "http://cepea.esalq.usp.br/frango/?page=379&Dias=15"
# Temporary .html file that receives the raw page body.
temp <- tempfile(fileext = ".html")
# Request the page with a browser-like User-Agent and write the body to `temp`.
GET(url = URL, user_agent("Mozilla/5.0"), write_disk(temp))
# `which = 8` returns only the eighth table found in the document.
# The argument is spelled `stringsAsFactors`; the previous `stringAsFactors`
# did not match readHTMLTable()'s parameter, so the setting was not applied
# as intended.
df <- readHTMLTable(temp, stringsAsFactors = FALSE, which = 8)
# NOTE(review): the row ranges below are hard-coded for a 15-day listing
# (Dias=15 in the URL) - they need adjusting if the page layout changes.
# Table 1
df[4:18, ]
# Table 2
df[28:42, ]
使用 XML::readHTMLTable
和 XPath 查询
# httr performs the HTTP request; XML parses the downloaded HTML.
library("httr")
library("XML")
URL <- "http://cepea.esalq.usp.br/frango/?page=379&Dias=15"
# Temporary .html file that receives the raw page body.
temp <- tempfile(fileext = ".html")
# Request the page with a browser-like User-Agent and write the body to `temp`.
GET(url = URL, user_agent("Mozilla/5.0"), write_disk(temp))
doc <- htmlParse(temp)
# XPath Query
# Table -1
# Select the <font> element that sits under the table next to the
# 'do frango congelado' caption link, and let readHTMLTable() parse it.
xpexpr <- "//center/a[contains(., 'do frango congelado')]/../table/tr/td/font"
df <- xpathSApply(doc, xpexpr, readHTMLTable)
# The fourth element of the result is used as the table, minus its last four
# rows. seq_len() yields an empty index when there are four rows or fewer,
# whereas 1:(n - 4) would count backwards and select the wrong rows.
include_indices <- seq_len(max(nrow(df[[4]]) - 4L, 0L))
df <- df[[4]][include_indices, ]
# Table-2
# Same steps for the 'do frango resfriado' table; this overwrites `df`.
xpexpr <- "//center/a[contains(., 'do frango resfriado')]/../table/tr/td/font"
df <- xpathSApply(doc, xpexpr, readHTMLTable)
include_indices <- seq_len(max(nrow(df[[4]]) - 4L, 0L))
df <- df[[4]][include_indices, ]
这目前应该有效,但我不确定如果你每天运行它,它是否仍然有效。
# Read text columns as character rather than factor.
options(stringsAsFactors = FALSE)
library(XML)
# get website
tfile <- tempfile()
download.file("http://cepea.esalq.usp.br/frango/?page=379&Dias=15", tfile)
# Parse every <table> in the downloaded page into a list of data frames.
temp <- readHTMLTable(tfile)
# read in table
# From the fifth parsed table keep only the rows that have a value in both
# V2 and V4.
tbl <- temp[[5]][!is.na(temp[[5]]$V2) & !is.na(temp[[5]]$V4), ]
# Each row with an empty V1 starts a new group; take the second group and
# drop its first row.
tbl2 <- split(tbl, cumsum(tbl$V1 == ""))[[2]][-1, ]
tbl2
# perform formatting
colnames(tbl2) <- c("Date", "Price", "Pct1", "Pct2")
# Clean the text column by column: remove stray "Â" characters, switch the
# decimal comma to a decimal point and drop the percent sign. Working on
# columns with lapply() avoids apply() coercing the data frame to a matrix,
# and the function now returns its value explicitly instead of ending on an
# assignment.
tbl2[] <- lapply(tbl2, function(x) {
  x <- gsub("Â", "", x)
  x <- gsub(",", ".", x, fixed = TRUE)
  gsub("%", "", x)
})
# The format string keeps the surrounding spaces used by the original parse.
tbl2$Date <- as.Date(tbl2$Date, format = " %d/%m/%Y ")
tbl2$Price <- as.numeric(tbl2$Price)
# Percentages are converted to fractions.
tbl2$Pct1 <- as.numeric(tbl2$Pct1) / 100
tbl2$Pct2 <- as.numeric(tbl2$Pct2) / 100
tbl2
# Date Price Pct1 Pct2
#65 2016-08-17 4.37 0.0046 0.0817
#66 2016-08-16 4.35 0.0046 0.0767
#67 2016-08-15 4.33 0.0046 0.0718
#68 2016-08-12 4.31 0.0000 0.0668
#69 2016-08-11 4.31 0.0070 0.0668
#70 2016-08-10 4.28 0.0047 0.0594
#71 2016-08-09 4.26 -0.0070 0.0545
#72 2016-08-08 4.29 0.0387 0.0619
#73 2016-08-05 4.13 0.0049 0.0223
#74 2016-08-04 4.11 0.0000 0.0173
#75 2016-08-03 4.11 0.0173 0.0173
#76 2016-08-02 4.04 0.0000 0.0000
#77 2016-08-01 4.04 0.0000 0.0000
#78 2016-07-29 4.04 0.0000 -0.0049
#79 2016-07-28 4.04 -0.0025 -0.0049
我正在尝试从以下 link http://cepea.esalq.usp.br/frango/?page=379&Dias=15
中抓取第二个 table。我使用 XML 包尝试了以下 R 代码:
# Parse the page straight from its URL. useInternalNodes = TRUE returns an
# internal document, which is what the XPath calls below are run against.
p_frango_resfriado <- htmlTreeParse("http://cepea.esalq.usp.br/frango/?page=379&Dias=15",
useInternalNodes = TRUE,
encoding = "UTF-8")
# Extract the text of the 2nd, 3rd and 4th <td> of every row in every table
# whose width attribute is '95%'. The predicate does not distinguish between
# tables, so cells from every matching table come back in one vector.
xpathSApply(p_frango_resfriado, "//table[@width = '95%']//tr//td[2]", xmlValue)
xpathSApply(p_frango_resfriado, "//table[@width = '95%']//tr//td[3]", xmlValue)
xpathSApply(p_frango_resfriado, "//table[@width = '95%']//tr//td[4]", xmlValue)
问题是这段代码同时抓取了网页中的两个 html table,而我只想抓取第二个。我已经尝试过下面的代码,但它没有返回任何有用的结果:
# Attempt to reach only the second table through the text of its caption link.
# NOTE(review): `a[...]/table` selects a <table> that is a direct child of the
# <a> element. Other queries in this document use `a/../table`, which suggests
# the table is a sibling of the link, not a child - that would explain why
# this query matches nothing. Confirm against the page markup.
xpathSApply(p_frango_resfriado,
"//a[text() = 'Preços do frango resfriado CEPEA/ESALQ - Estado SP']/table[@width = '95%']",
xmlValue)
谁能帮我解决这个问题?我不太擅长 XPath 语言和 html.
使用 XML::xmlToDataFrame
和 XPath 查询
# httr performs the HTTP request; XML parses the downloaded HTML.
library("httr")
library("XML")
URL <- "http://cepea.esalq.usp.br/frango/?page=379&Dias=15"
# Temporary .html file that receives the raw page body.
temp <- tempfile(fileext = ".html")
# Request the page with a browser-like User-Agent header and write the
# response body to `temp` instead of keeping it in memory.
GET(url = URL, user_agent("Mozilla/5.0"), write_disk(temp))
两个 table 之间的唯一区别是 xpath 查询中使用的 table 名称
Table 1: Preços do frango congelado CEPEA/ESALQ - Estado SP
# XPath for table 1: find the <a> inside <center> whose text contains
# 'do frango congelado', step up to its parent, then select the <tr> nodes
# under table/tr/td/font.
# NOTE: a later assignment to `xpexpr` in this snippet replaces this value,
# so keep only the assignment for the table you want.
xpexpr <- "//center/a[contains(., 'do frango congelado')]/../table/tr/td/font/tr"
Table 2: Preços do frango resfriado CEPEA/ESALQ - Estado SP
# XPath for table 2: find the <a> inside <center> whose text contains
# 'do frango resfriado', step up to its parent, then select the <tr> nodes
# under table/tr/td/font.
xpexpr <- "//center/a[contains(., 'do frango resfriado')]/../table/tr/td/font/tr"
doc <- htmlParse(temp)
listofTableNodes <- getNodeSet(doc, xpexpr)
length_nodes <- length(listofTableNodes)
# Keep every matched row except the last two (presumably non-data rows on
# this page - confirm). seq_len() gives an empty index when fewer than three
# rows match, whereas 1:(n - 2) would count backwards (1:0, 1:-1) and pick
# the wrong rows.
include_indices1 <- seq_len(max(length_nodes - 2L, 0L))
# Create the data frame from the values of the selected row nodes. Both
# `getNodeSet()` and `xpathSApply()` give the same result here, so either of
# the two assignments below is sufficient.
# using `getNodeSet()`
df <- xmlToDataFrame(listofTableNodes[include_indices1], stringsAsFactors = FALSE)
# using `xpathSApply`
df <- xmlToDataFrame(xpathSApply(doc, xpexpr)[include_indices1], stringsAsFactors = FALSE)
# clean data
# Strip the stray "Â"/space characters (and any whitespace after them) from
# the date column, then parse it as day/month/year. The backslash has to be
# doubled inside an R string literal: "\s" is an unrecognized escape and
# prevents the script from parsing.
df$td <- as.Date(gsub("[Â ]\\s*", "", df$td), format = "%d/%m/%Y")
# Remove a trailing tab character from the fourth column.
df[, 4] <- gsub("\t$", "", df[, 4])
# add column names
# The header labels are the text nodes directly under the <font> element.
xpexpr <- "//center/a[contains(., 'do frango resfriado')]/../table/tr/td/font/text()"
# for Table-1
# xpexpr <- "//center/a[contains(., 'do frango congelado')]/../table/tr/td/font/text()"
listofTableNodes <- getNodeSet(doc, xpexpr)
# vapply() always returns a character vector, even when no header node matches.
colnames(df) <- c("Date", vapply(listofTableNodes, xmlValue, character(1)))
df
# Date Valor R$ Var./dia Var./mês
# 1 2016-08-17 4,37 0,46% 8,17%
# 2 2016-08-16 4,35 0,46% 7,67%
# 3 2016-08-15 4,33 0,46% 7,18%
# 4 2016-08-12 4,31 0,00% 6,68%
# 5 2016-08-11 4,31 0,70% 6,68%
# 6 2016-08-10 4,28 0,47% 5,94%
# 7 2016-08-09 4,26 -0,70% 5,45%
# 8 2016-08-08 4,29 3,87% 6,19%
# 9 2016-08-05 4,13 0,49% 2,23%
# 10 2016-08-04 4,11 0,00% 1,73%
# 11 2016-08-03 4,11 1,73% 1,73%
# 12 2016-08-02 4,04 0,00% 0,00%
# 13 2016-08-01 4,04 0,00% 0,00%
# 14 2016-07-29 4,04 0,00% -0,49%
# 15 2016-07-28 4,04 -0,25% -0,49%
注意:此网页上的值每天都会更新;代码通过 length_nodes 动态确定行数,
从而适应这些变化。
在没有 XPath 查询的情况下使用 XML::readHTMLTable
# httr performs the HTTP request; XML parses the downloaded HTML.
library("httr")
library("XML")
URL <- "http://cepea.esalq.usp.br/frango/?page=379&Dias=15"
# Temporary .html file that receives the raw page body.
temp <- tempfile(fileext = ".html")
# Request the page with a browser-like User-Agent and write the body to `temp`.
GET(url = URL, user_agent("Mozilla/5.0"), write_disk(temp))
# `which = 8` returns only the eighth table found in the document.
# The argument is spelled `stringsAsFactors`; the previous `stringAsFactors`
# did not match readHTMLTable()'s parameter, so the setting was not applied
# as intended.
df <- readHTMLTable(temp, stringsAsFactors = FALSE, which = 8)
# NOTE(review): the row ranges below are hard-coded for a 15-day listing
# (Dias=15 in the URL) - they need adjusting if the page layout changes.
# Table 1
df[4:18, ]
# Table 2
df[28:42, ]
使用 XML::readHTMLTable
和 XPath 查询
# httr performs the HTTP request; XML parses the downloaded HTML.
library("httr")
library("XML")
URL <- "http://cepea.esalq.usp.br/frango/?page=379&Dias=15"
# Temporary .html file that receives the raw page body.
temp <- tempfile(fileext = ".html")
# Request the page with a browser-like User-Agent and write the body to `temp`.
GET(url = URL, user_agent("Mozilla/5.0"), write_disk(temp))
doc <- htmlParse(temp)
# XPath Query
# Table -1
# Select the <font> element that sits under the table next to the
# 'do frango congelado' caption link, and let readHTMLTable() parse it.
xpexpr <- "//center/a[contains(., 'do frango congelado')]/../table/tr/td/font"
df <- xpathSApply(doc, xpexpr, readHTMLTable)
# The fourth element of the result is used as the table, minus its last four
# rows. seq_len() yields an empty index when there are four rows or fewer,
# whereas 1:(n - 4) would count backwards and select the wrong rows.
include_indices <- seq_len(max(nrow(df[[4]]) - 4L, 0L))
df <- df[[4]][include_indices, ]
# Table-2
# Same steps for the 'do frango resfriado' table; this overwrites `df`.
xpexpr <- "//center/a[contains(., 'do frango resfriado')]/../table/tr/td/font"
df <- xpathSApply(doc, xpexpr, readHTMLTable)
include_indices <- seq_len(max(nrow(df[[4]]) - 4L, 0L))
df <- df[[4]][include_indices, ]
这目前应该有效,但我不确定如果你每天运行它,它是否仍然有效。
# Read text columns as character rather than factor.
options(stringsAsFactors = FALSE)
library(XML)
# get website
tfile <- tempfile()
download.file("http://cepea.esalq.usp.br/frango/?page=379&Dias=15", tfile)
# Parse every <table> in the downloaded page into a list of data frames.
temp <- readHTMLTable(tfile)
# read in table
# From the fifth parsed table keep only the rows that have a value in both
# V2 and V4.
tbl <- temp[[5]][!is.na(temp[[5]]$V2) & !is.na(temp[[5]]$V4), ]
# Each row with an empty V1 starts a new group; take the second group and
# drop its first row.
tbl2 <- split(tbl, cumsum(tbl$V1 == ""))[[2]][-1, ]
tbl2
# perform formatting
colnames(tbl2) <- c("Date", "Price", "Pct1", "Pct2")
# Clean the text column by column: remove stray "Â" characters, switch the
# decimal comma to a decimal point and drop the percent sign. Working on
# columns with lapply() avoids apply() coercing the data frame to a matrix,
# and the function now returns its value explicitly instead of ending on an
# assignment.
tbl2[] <- lapply(tbl2, function(x) {
  x <- gsub("Â", "", x)
  x <- gsub(",", ".", x, fixed = TRUE)
  gsub("%", "", x)
})
# The format string keeps the surrounding spaces used by the original parse.
tbl2$Date <- as.Date(tbl2$Date, format = " %d/%m/%Y ")
tbl2$Price <- as.numeric(tbl2$Price)
# Percentages are converted to fractions.
tbl2$Pct1 <- as.numeric(tbl2$Pct1) / 100
tbl2$Pct2 <- as.numeric(tbl2$Pct2) / 100
tbl2
# Date Price Pct1 Pct2
#65 2016-08-17 4.37 0.0046 0.0817
#66 2016-08-16 4.35 0.0046 0.0767
#67 2016-08-15 4.33 0.0046 0.0718
#68 2016-08-12 4.31 0.0000 0.0668
#69 2016-08-11 4.31 0.0070 0.0668
#70 2016-08-10 4.28 0.0047 0.0594
#71 2016-08-09 4.26 -0.0070 0.0545
#72 2016-08-08 4.29 0.0387 0.0619
#73 2016-08-05 4.13 0.0049 0.0223
#74 2016-08-04 4.11 0.0000 0.0173
#75 2016-08-03 4.11 0.0173 0.0173
#76 2016-08-02 4.04 0.0000 0.0000
#77 2016-08-01 4.04 0.0000 0.0000
#78 2016-07-29 4.04 0.0000 -0.0049
#79 2016-07-28 4.04 -0.0025 -0.0049