Reading an HTML table with rvest sometimes gets stuck and produces a timeout error

I have to read the USD exchange rate table for every bank from https://kursdollar.org, and I have had to test this snippet many times:

library(stringr)
library(tidyverse)
library(rvest)
library(httr)
library(RCurl)

# NOTE: this configures an RCurl handle only; it has no effect on the
# url()/read_html() calls below
curlSetOpt(timeout = 200)

# All links share the same pattern, so build them from the bank names
banks <- c("bi", "mandiri", "bca", "bni", "hsbc", "panin", "cimb", "ocbc",
           "bri", "uob", "maybank", "permata", "mega", "danamon", "btn",
           "mayapada", "muamalat", "bukopin")
link_kurs <- paste0("https://kursdollar.org/bank/", banks, ".php")

for (v in seq_along(link_kurs)) {
  writeLines(paste0(v, ') Read Table on ', link_kurs[v]))
  open_url <- url(link_kurs[v], "rb")   # open the connection in binary mode
  extract_df <- read_html(open_url)     # parse the page
  close(open_url)
  extract_df <- extract_df %>%
    html_nodes("table") %>%             # grab every <table> on the page
    html_table(fill = TRUE) %>%
    as.data.frame()
  writeLines("Test Read Success!")
}

The run time can vary several-fold between runs: when the reads succeed the loop is fast, but sometimes reading one of the links gets stuck (the RCurl timeout limit has no effect) and throws:

Error in url(link_kurs[v], "rb") : cannot open the connection
In addition: Warning message:
In url(link_kurs[v], "rb") :
  InternetOpenUrl failed: 'The operation timed out'

Is there a way around this? Is there a method to read all of these tables consistently, even if it is a bit slow?
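
For context on why curlSetOpt() is ignored here: it configures an RCurl handle, while url() goes through base R's internal connection code (the InternetOpenUrl warning suggests Windows' internal wininet method), so the 200-second limit never applies. Below is a minimal sketch of one possible workaround using httr, which is already loaded above and does honor a per-request timeout; read_kurs_table, the 10-second timeout, and the retry settings are illustrative assumptions, not part of the original code.

# Sketch: enforce the timeout per request and retry with backoff via httr.
# read_kurs_table and all numeric values below are illustrative assumptions.
read_kurs_table <- function(link) {
  resp <- httr::RETRY(
    "GET", link,
    httr::timeout(10),   # hard cap on each attempt
    times      = 5,      # give up after 5 attempts
    pause_base = 2       # exponential backoff between attempts
  )
  httr::stop_for_status(resp)   # turn HTTP error codes into R errors
  httr::content(resp, as = "text", encoding = "UTF-8") %>%
    read_html() %>%
    html_nodes("table") %>%
    html_table(fill = TRUE) %>%
    as.data.frame()
}

extract_df_list <- lapply(link_kurs, read_kurs_table)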

Attempt using tryCatch:

for (v in seq_along(link_kurs)) {
  writeLines(paste0(v, ') Read Table on ', link_kurs[v]))
  open_url <- url(link_kurs[v], "rb")
  tryCatch({
    extract_df <- read_html(open_url)
    close(open_url)
    extract_df <- extract_df %>%
      html_nodes("table") %>%
      html_table(fill = TRUE) %>%
      as.data.frame()
    writeLines("Test Read Success!")
  }, error = function(e) NULL)  # the error is silently discarded; if
                                # read_html() fails, open_url is never closed
}
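
The attempt above swallows the error and moves on, but if read_html() fails, the connection opened by url() is never closed. Below is a sketch of the same pattern with guaranteed cleanup, not the original code: finally runs whether the read succeeds or times out, and the handler at least reports what went wrong.

for (v in seq_along(link_kurs)) {
  writeLines(paste0(v, ') Read Table on ', link_kurs[v]))
  tryCatch({
    # url() itself can time out, so it belongs inside the tryCatch too
    open_url <- url(link_kurs[v], "rb")
    extract_df <- read_html(open_url) %>%
      html_nodes("table") %>%
      html_table(fill = TRUE) %>%
      as.data.frame()
    writeLines("Test Read Success!")
  },
  error   = function(e) message("Read failed: ", conditionMessage(e)),
  finally = try(close(open_url), silent = TRUE))  # close even on error
}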

Full version with tryCatch plus a retry loop that keeps trying to capture the table indefinitely (OP edit):

extract_df_list <- list()  # must exist before the loop, otherwise the
                           # first append below fails with "object not found"

for (v in seq_along(link_kurs)) {
  writeLines(paste0(v, ') Read Table on ', link_kurs[v]))
  while (TRUE) {
    tryCatch({
      open_url <- url(link_kurs[v], "rb")
      extract_df <- read_html(open_url)
      close(open_url)
      extract_df <- extract_df %>%
        html_nodes("table") %>%
        html_table(fill = TRUE) %>%
        as.data.frame()
      extract_df_list <- c(extract_df_list, list(extract_df))
      writeLines("Test Read Success!")
      break  # leave the retry loop once the table is captured
    }, error = function(e) {
      message("Test Read Timeout")
      message("Retrying. .")   # note: on failure open_url may be left open
    })
  }
}
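
One caveat with the version above: if a link is down for good, while (TRUE) never terminates. Below is a sketch of a bounded alternative, where max_tries and the Sys.sleep() backoff are illustrative values rather than anything from the original post.

max_tries <- 5                 # illustrative cap on attempts per link
extract_df_list <- list()

for (v in seq_along(link_kurs)) {
  writeLines(paste0(v, ') Read Table on ', link_kurs[v]))
  for (attempt in seq_len(max_tries)) {
    ok <- tryCatch({
      open_url <- url(link_kurs[v], "rb")
      page <- read_html(open_url)
      close(open_url)
      extract_df <- page %>%
        html_nodes("table") %>%
        html_table(fill = TRUE) %>%
        as.data.frame()
      extract_df_list <- c(extract_df_list, list(extract_df))
      TRUE                     # tryCatch() returns the last value on success
    }, error = function(e) {
      message("Attempt ", attempt, " failed: ", conditionMessage(e))
      FALSE
    })
    if (ok) break              # success: move on to the next link
    Sys.sleep(2 * attempt)     # linear backoff before retrying
  }
}

Giving up after a fixed number of attempts keeps one dead link from stalling the whole scrape, and the tables already collected in extract_df_list are preserved.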