来自 B3/BM&F Bovespa 的网页抓取
Web Scraping from B3/BM&F Bovespa
我正在尝试从 BM&FBOVESPA 参考利率页面下载一些数据。
他们的网页是...
框架是...
http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-ptBR.asp
下面是我的代码,运行时会报错:`Error in out[j + k, ] : subscript out of bounds`(下标越界)
# Address of the reference-rates page that holds the data
url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-ptBR.asp'
# Download and parse the page behind that address
site <- read_html(url)
# Pull out every <table> node and convert the result, stored as a list
lista_tabela <- html_table(html_nodes(site, "table"), fill = TRUE)
# Keep the first entry of that list as the data frame
CurvaDI <- lista_tabela[[1]]
我无法解决这个错误;我只是想从他们的网站下载这张表格(table),并将其保存为数据框(df)。
此外,我还想用同一段代码下载多个日期的数据。如果有人能提供帮助,我将非常感激!
非常感谢!
该页面原始源代码中的 HTML 似乎被有意写成了畸形结构,因此在解析表格(table)之前必须先对其进行修复。下面的代码使用一系列正则表达式把 HTML 修正为可以解析的表格:
library(rvest)
library(httr)
library(stringr)
# English version of the reference-rates page
url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-enUS.asp'
# Fetch the response body as raw bytes and turn it into a single string
html <- rawToChar(content(GET(url), as = "raw"))
# Repair the markup in three ordered passes before parsing it:
# 1. "</tr>", blank line, "</tr>" becomes "</tr>", blank line, "<tr>"
html <- str_replace_all(html, '(</tr>\r\n\r\n</tr>)', '</tr>\r\n\r\n<tr>')
# 2. opening and closing <thead> tags are removed
html <- str_replace_all(html, '(<thead>|</thead>)', '')
# 3. "</tr>", blank line, "<th" becomes "</tr><tr>"
html <- str_replace_all(html, '(</tr>\r\n\r\n<th)', '</tr><tr>')
# Parse the repaired markup and convert the tables it contains
data <- html_table(read_html(html), fill = TRUE)
# Drop the first row of the first table and show the rest
dataframe <- tail(data[[1]], -1)
print(dataframe)
给出:
Calendar Days ID x fixed rate ID x fixed rate
2 1 1.90 0.00
3 7 1.90 1.55
4 8 1.90 1.70
5 9 1.90 1.81
6 13 1.91 1.67
7 14 1.91 1.75
8 21 1.91 1.81
9 23 1.91 1.89
10 24 1.91 1.93
11 28 1.91 1.75
12 30 1.91 1.82
13 34 1.92 1.77
14 41 1.93 1.82
15 43 1.94 1.87
16 52 1.95 1.93
.................................................
要提交表单数据,您可以使用特定的选项和日期格式构建 POST 请求。下面的代码会先获取页面上的可选项,提示用户选择其中一项,然后获取相应的数据:
library(rvest)
library(httr)
library(stringr)
# Page to query and the reference date that is submitted with the form later
url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-enUS.asp'
date <- as.Date("2020-10-07")
# Initial GET: read the body as raw bytes and convert it to one string
html <- rawToChar(content(GET(url), as = "raw"))
# Repair the page markup, parse it, and extract the first table.
#
# html: the page source as a character string (only the first element is used).
#
# Returns a list with two elements:
#   data - the first parsed table without its first row, or NULL when the
#          page contains no table
#   body - the parsed document, so callers can query other nodes from it
getData <- function(html) {
  # Three ordered repair passes; see the patterns for the exact rewrites.
  fixed <- str_replace_all(html, '(</tr>\r\n\r\n</tr>)', '</tr>\r\n\r\n<tr>')[[1]]
  fixed <- str_replace_all(fixed, '(<thead>|</thead>)', '')
  fixed <- str_replace_all(fixed, '(</tr>\r\n\r\n<th)', '</tr><tr>')

  body <- read_html(fixed)
  tables <- html_table(body, fill = TRUE)

  # Nothing to extract: still hand back the parsed document.
  if (length(tables) == 0) {
    return(list(data = NULL, body = body))
  }
  list(data = tail(tables[[1]], -1), body = body)
}
# Parse the page fetched above and show the table it contains (if any).
res <- getData(html)
print(res[[1]])

# Collect every <option> node of the parsed page. html_attr() and html_text()
# work on the whole node set at once, so no manually counted loop is needed.
options <- res[[2]] %>% html_nodes("option")
optionKeys <- html_attr(options, "value")
optionLabels <- str_replace_all(html_text(options), '\r\n', '')
# One c(key = , value = ) pair per option, in document order.
optionList <- lapply(seq_along(optionKeys), function(i) {
  c(key = optionKeys[[i]], value = optionLabels[[i]])
})
if (length(optionList) == 0) {
  stop("No <option> entries were found on the page", call. = FALSE)
}
for (i in seq_along(optionList)) {
  print(paste0("[", i, "] ", optionList[[i]]["value"]))
}

# Ask for the index of the option to query.
cat("Choose option by index : ")
selected <- readLines("stdin", n = 1)
# Validate the answer: without this check a non-numeric answer becomes NA,
# optionList[[NA]] yields NULL, and the request below would be sent without
# any slcTaxa value.
selectedIndex <- suppressWarnings(as.integer(selected))
if (length(selectedIndex) != 1 || is.na(selectedIndex) ||
    selectedIndex < 1 || selectedIndex > length(optionList)) {
  stop(
    "Invalid option index: expected an integer between 1 and ",
    length(optionList),
    call. = FALSE
  )
}
selectedOption <- optionList[[selectedIndex]]
print(paste("you selected :", selectedOption["value"], sep = " "))

# Fields sent both in the query string and in the form body.
formFields <- list(
  Data = format(date, format = "%m/%d/%Y"),
  Data1 = format(date, format = "%Y%m%d"),
  slcTaxa = selectedOption["key"]
)
postUrl <- modify_url(url, query = formFields)

# Submit the form and read the response body as a single string.
response <- POST(
  postUrl,
  body = c(formFields, list(nomexls = "", lQtdTabelas = "", IDIOM = 2)),
  encode = "form"
)
html <- rawToChar(content(response, as = "raw"))

# Parse the response the same way as the initial page and show its table.
res <- getData(html)
print(res[[1]])
我正在尝试从 BM&FBOVESPA 参考利率页面下载一些数据。
他们的网页是...
框架是...
http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-ptBR.asp
下面是我的代码,运行时会报错:`Error in out[j + k, ] : subscript out of bounds`(下标越界)
# Address of the reference-rates page that holds the data
url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-ptBR.asp'
# Download and parse the page behind that address
site <- read_html(url)
# Pull out every <table> node and convert the result, stored as a list
lista_tabela <- html_table(html_nodes(site, "table"), fill = TRUE)
# Keep the first entry of that list as the data frame
CurvaDI <- lista_tabela[[1]]
我无法解决这个错误;我只是想从他们的网站下载这张表格(table),并将其保存为数据框(df)。
此外,我还想用同一段代码下载多个日期的数据。如果有人能提供帮助,我将非常感激!
非常感谢!
该页面原始源代码中的 HTML 似乎被有意写成了畸形结构,因此在解析表格(table)之前必须先对其进行修复。下面的代码使用一系列正则表达式把 HTML 修正为可以解析的表格:
library(rvest)
library(httr)
library(stringr)
# English version of the reference-rates page
url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-enUS.asp'
# Fetch the response body as raw bytes and turn it into a single string
html <- rawToChar(content(GET(url), as = "raw"))
# Repair the markup in three ordered passes before parsing it:
# 1. "</tr>", blank line, "</tr>" becomes "</tr>", blank line, "<tr>"
html <- str_replace_all(html, '(</tr>\r\n\r\n</tr>)', '</tr>\r\n\r\n<tr>')
# 2. opening and closing <thead> tags are removed
html <- str_replace_all(html, '(<thead>|</thead>)', '')
# 3. "</tr>", blank line, "<th" becomes "</tr><tr>"
html <- str_replace_all(html, '(</tr>\r\n\r\n<th)', '</tr><tr>')
# Parse the repaired markup and convert the tables it contains
data <- html_table(read_html(html), fill = TRUE)
# Drop the first row of the first table and show the rest
dataframe <- tail(data[[1]], -1)
print(dataframe)
给出:
Calendar Days ID x fixed rate ID x fixed rate
2 1 1.90 0.00
3 7 1.90 1.55
4 8 1.90 1.70
5 9 1.90 1.81
6 13 1.91 1.67
7 14 1.91 1.75
8 21 1.91 1.81
9 23 1.91 1.89
10 24 1.91 1.93
11 28 1.91 1.75
12 30 1.91 1.82
13 34 1.92 1.77
14 41 1.93 1.82
15 43 1.94 1.87
16 52 1.95 1.93
.................................................
要提交表单数据,您可以使用特定的选项和日期格式构建 POST 请求。下面的代码会先获取页面上的可选项,提示用户选择其中一项,然后获取相应的数据:
library(rvest)
library(httr)
library(stringr)
# Page to query and the reference date that is submitted with the form later
url <- 'http://www2.bmf.com.br/pages/portal/bmfbovespa/lumis/lum-taxas-referenciais-bmf-enUS.asp'
date <- as.Date("2020-10-07")
# Initial GET: read the body as raw bytes and convert it to one string
html <- rawToChar(content(GET(url), as = "raw"))
# Repair the page markup, parse it, and extract the first table.
#
# html: the page source as a character string (only the first element is used).
#
# Returns a list with two elements:
#   data - the first parsed table without its first row, or NULL when the
#          page contains no table
#   body - the parsed document, so callers can query other nodes from it
getData <- function(html) {
  # Three ordered repair passes; see the patterns for the exact rewrites.
  fixed <- str_replace_all(html, '(</tr>\r\n\r\n</tr>)', '</tr>\r\n\r\n<tr>')[[1]]
  fixed <- str_replace_all(fixed, '(<thead>|</thead>)', '')
  fixed <- str_replace_all(fixed, '(</tr>\r\n\r\n<th)', '</tr><tr>')

  body <- read_html(fixed)
  tables <- html_table(body, fill = TRUE)

  # Nothing to extract: still hand back the parsed document.
  if (length(tables) == 0) {
    return(list(data = NULL, body = body))
  }
  list(data = tail(tables[[1]], -1), body = body)
}
# Parse the page fetched above and show the table it contains (if any).
res <- getData(html)
print(res[[1]])

# Collect every <option> node of the parsed page. html_attr() and html_text()
# work on the whole node set at once, so no manually counted loop is needed.
options <- res[[2]] %>% html_nodes("option")
optionKeys <- html_attr(options, "value")
optionLabels <- str_replace_all(html_text(options), '\r\n', '')
# One c(key = , value = ) pair per option, in document order.
optionList <- lapply(seq_along(optionKeys), function(i) {
  c(key = optionKeys[[i]], value = optionLabels[[i]])
})
if (length(optionList) == 0) {
  stop("No <option> entries were found on the page", call. = FALSE)
}
for (i in seq_along(optionList)) {
  print(paste0("[", i, "] ", optionList[[i]]["value"]))
}

# Ask for the index of the option to query.
cat("Choose option by index : ")
selected <- readLines("stdin", n = 1)
# Validate the answer: without this check a non-numeric answer becomes NA,
# optionList[[NA]] yields NULL, and the request below would be sent without
# any slcTaxa value.
selectedIndex <- suppressWarnings(as.integer(selected))
if (length(selectedIndex) != 1 || is.na(selectedIndex) ||
    selectedIndex < 1 || selectedIndex > length(optionList)) {
  stop(
    "Invalid option index: expected an integer between 1 and ",
    length(optionList),
    call. = FALSE
  )
}
selectedOption <- optionList[[selectedIndex]]
print(paste("you selected :", selectedOption["value"], sep = " "))

# Fields sent both in the query string and in the form body.
formFields <- list(
  Data = format(date, format = "%m/%d/%Y"),
  Data1 = format(date, format = "%Y%m%d"),
  slcTaxa = selectedOption["key"]
)
postUrl <- modify_url(url, query = formFields)

# Submit the form and read the response body as a single string.
response <- POST(
  postUrl,
  body = c(formFields, list(nomexls = "", lQtdTabelas = "", IDIOM = 2)),
  encode = "form"
)
html <- rawToChar(content(response, as = "raw"))

# Parse the response the same way as the initial page and show its table.
res <- getData(html)
print(res[[1]])