Opening an xls file downloaded from a website
I have this user-defined function that uses the rvest package to fetch the downloadable files from a web page.
GetFluDataFiles <- function(URL = "https://www1.health.gov.au/internet/main/publishing.nsf/Content/ohp-pub-datasets.htm",
                            REMOVE_URL_STRING = "ohp-pub-datasets.htm/",
                            DEBUG = TRUE){

  if(DEBUG) message("GetFluDataFiles: Function initialized \n")

  FUNCTION_OUTPUT <- list()
  FUNCTION_OUTPUT[["URL"]] <- URL

  page <- rvest::read_html(URL)

  if(DEBUG) message("GetFluDataFiles: Get all downloadable files on webpage \n")

  # Collect the href of every link on the page and keep only .xlsx files
  all_downloadable_files <- page %>%
    rvest::html_nodes("a") %>%
    rvest::html_attr("href") %>%
    str_subset("\\.xlsx")

  # all_downloadable_files
  FUNCTION_OUTPUT[["ALL_DOWNLOADABLE_FILES"]] <- all_downloadable_files

  if(DEBUG) message("GetFluDataFiles: Get all downloadable files on webpage which contain flu data \n")

  # Keep the file whose name contains "influenza"
  influenza_file <- all_downloadable_files[tolower(all_downloadable_files) %like% c("influenza")]

  # influenza_file
  FUNCTION_OUTPUT[["FLU_FILE"]] <- influenza_file

  file_path = file.path(URL, influenza_file)

  # file_path
  FUNCTION_OUTPUT[["FLU_FILE_PATH"]] <- file_path

  if(DEBUG) message("GetFluDataFiles: Collect final path \n")

  if(!is.null(REMOVE_URL_STRING)){
    full_final_path <- gsub(REMOVE_URL_STRING, "", file_path)
  } else {
    full_final_path <- file_path
  }

  FUNCTION_OUTPUT[["FULL_FINAL_PATH"]] <- full_final_path

  if(!is.na(full_final_path) | !is.null(full_final_path)){
    if(DEBUG) message("GetFluDataFiles: Function run completed \n")
    return(FUNCTION_OUTPUT)
  } else {
    stop("GetFluDataFiles: Folders not created \n")
  }
}
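For context, the function calls a few helpers without a namespace prefix, so the packages below need to be attached first. This is a minimal setup sketch, assuming str_subset() comes from stringr, %like% from data.table, and the pipe from magrittr; none of these are loaded inside the snippet above.

library(rvest)      # read_html(), html_nodes(), html_attr()
library(stringr)    # str_subset()
library(data.table) # %like%
library(magrittr)   # %>%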
I have used this function to extract the data I want.
Everything seems to work... I can download the file.
> output <- GetFluDataFiles()
GetFluDataFiles: Function initialized
GetFluDataFiles: Get all downloadable files on webpage
GetFluDataFiles: Get all downloadable files on webpage which contain flu data
GetFluDataFiles: Collect final path
GetFluDataFiles: Function run completed
> output$FULL_FINAL_PATH
[1] "https://www1.health.gov.au/internet/main/publishing.nsf/Content/C4DDC0B448F04792CA258728001EC5D0/$File/x.Influenza-laboratory-confirmed-Public-datset-2008-2019.xlsx"
> download.file(output$FULL_FINAL_PATH, destfile = "myfile.xlsx")
trying URL 'https://www1.health.gov.au/internet/main/publishing.nsf/Content/C4DDC0B448F04792CA258728001EC5D0/$File/x.Influenza-laboratory-confirmed-Public-datset-2008-2019.xlsx'
Content type 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' length 27134133 bytes (25.9 MB)
downloaded 25.9 MB
And the file exists.
> file.exists("myfile.xlsx")
[1] TRUE
But when I go to import the xlsx file, this error pops up.
> library("readxl")
> my_data <- read_excel("myfile.xlsx", sheet = 1, skip = 1)
Error: Evaluation error: error -103 with zipfile in unzGetCurrentFileInfo
What is this error, and how can I fix it?
Set the download method to curl:
download.file(output$FULL_FINAL_PATH, destfile = "myfile.xlsx", method = 'curl')
my_data <- read_excel("myfile.xlsx", sheet = 1, skip = 1)
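A likely explanation (an assumption, since the session details are not shown): an .xlsx file is really a zip container, and on Windows download.file() with the default method writes in text mode unless mode = "wb" is given, which corrupts the archive and triggers readxl's zipfile error -103. method = 'curl' transfers the bytes as-is, which is why it works. A sketch of an alternative that keeps the default method but forces a binary write:

# Assumption: the original download was corrupted by a text-mode transfer on Windows.
# Forcing a binary write with mode = "wb" should also yield a readable workbook.
download.file(output$FULL_FINAL_PATH, destfile = "myfile.xlsx", mode = "wb")

# Quick sanity check: a valid .xlsx starts with the zip signature "PK" (bytes 50 4b)
readBin("myfile.xlsx", what = "raw", n = 2)

my_data <- readxl::read_excel("myfile.xlsx", sheet = 1, skip = 1)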