readxl,单个 .xlsx-workbook 中的选定工作表

readxl, selected worksheets in single .xlsx-workbook

如何从 .xlsx 工作簿中简洁地导入选定的工作表,最好使用 readxl 和 tidyverse?

使用以下代码(方法 #1),我可以导入单个 .xlsx 工作簿中的所有工作表,但是如何用 filter() 或 select() 之类的方式只选取其中一部分?第二段代码(方法 #2)使用 map_dfr(),更简洁一些,但是使用该方法后 sheet 列失去了工作表名称,变成了 1、2 等等。

假设我只想导入工作表 iris 和 mtcars

# Names of the worksheets to import.
sh_to_impt <- c("iris", "mtcars")

需要软件包,

library(readxl)
library(tidyverse)
library(purrr)

代码方法#1,

# Locate the example workbook bundled with readxl.
path <- readxl_example("datasets.xlsx")

# Read every sheet and stack them into one tibble. Naming the vector of sheet
# names after itself lets `.id` record the originating sheet in `sheet`.
# (A `select()` step does not belong here: `excel_sheets()` returns a
# character vector, not a data frame.)
datasets_data <- readxl::excel_sheets(path = path) %>%
  purrr::set_names() %>%
  purrr::map_dfr(
    ~ readxl::read_excel(path = path, sheet = .x),
    .id = "sheet"
  )

datasets_data 
# A tibble: 1,253 x 24
   sheet Sepal.Length Sepal.Width Petal.Length Petal.Width Species
   <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>  
 1 iris           5.1         3.5          1.4         0.2 setosa 
 2 iris           4.9         3            1.4         0.2 setosa 
 3 iris           4.7         3.2          1.3         0.2 setosa 
 4 iris           4.6         3.1          1.5         0.2 setosa 
 5 iris           5           3.6          1.4         0.2 setosa 
 6 iris           5.4         3.9          1.7         0.4 setosa 
 7 iris           4.6         3.4          1.4         0.3 setosa 
 8 iris           5           3.4          1.5         0.2 setosa 
 9 iris           4.4         2.9          1.4         0.2 setosa 
10 iris           4.9         3.1          1.5         0.1 setosa 
# ... with 1,243 more rows, and 18 more variables: mpg <dbl>,
#   cyl <dbl>, disp <dbl>, hp <dbl>, drat <dbl>, wt <dbl>,
#   qsec <dbl>, vs <dbl>, am <dbl>, gear <dbl>, carb <dbl>,
#   weight <dbl>, feed <chr>, lat <dbl>, long <dbl>, depth <dbl>,
#   mag <dbl>, stations <dbl>

我可以这样绕过它,但是 sheet 列失去了工作表名称,变成了 1、2 等等。

代码方法#2,

# Read only the requested sheets and stack them. `sh_to_impt` carries no
# names, so the `sheet` id column holds positions rather than sheet names.
sh_to_impt %>%
  map_dfr(
    function(sheet_name) read_excel(path, sheet = sheet_name),
    .id = "sheet"
  )
# A tibble: 182 x 17
sheet Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>  
  1 1              5.1         3.5          1.4         0.2 setosa 
2 1              4.9         3            1.4         0.2 setosa 
3 1              4.7         3.2          1.3         0.2 setosa 
4 1              4.6         3.1          1.5         0.2 setosa 
5 1              5           3.6          1.4         0.2 setosa 
6 1              5.4         3.9          1.7         0.4 setosa 
7 1              4.6         3.4          1.4         0.3 setosa 
8 1              5           3.4          1.5         0.2 setosa 
9 1              4.4         2.9          1.4         0.2 setosa 
10 1              4.9         3.1          1.5         0.1 setosa 
# ... with 172 more rows, and 11 more variables: mpg <dbl>,
#   cyl <dbl>, disp <dbl>, hp <dbl>, drat <dbl>, wt <dbl>,
#   qsec <dbl>, vs <dbl>, am <dbl>, gear <dbl>, carb <dbl> 

我看过 ,认为它可能是关键。

我正在寻找一个简洁的解决方案。一个在我看来显然不那么简洁的解决方案可能是:

# Stack the requested sheets, then translate the positional ids written by
# `.id` back into the sheet names they stand for.
sh_to_impt %>%
  map_dfr(
    function(sheet_name) read_excel(path, sheet = sheet_name),
    .id = "sheet"
  ) %>%
  mutate(
    sheet = recode(sheet, `1` = sh_to_impt[1], `2` = sh_to_impt[2])
  )
# A tibble: 182 x 17
sheet Sepal.Length Sepal.Width Petal.Length Petal.Width Species   mpg
<chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>   <dbl>
  1 iris           5.1         3.5          1.4         0.2 setosa     NA
2 iris           4.9         3            1.4         0.2 setosa     NA
3 iris           4.7         3.2          1.3         0.2 setosa     NA
4 iris           4.6         3.1          1.5         0.2 setosa     NA
5 iris           5           3.6          1.4         0.2 setosa     NA
6 iris           5.4         3.9          1.7         0.4 setosa     NA
7 iris           4.6         3.4          1.4         0.3 setosa     NA
8 iris           5           3.4          1.5         0.2 setosa     NA
9 iris           4.4         2.9          1.4         0.2 setosa     NA
10 iris           4.9         3.1          1.5         0.1 setosa     NA
# ... with 172 more rows, and 10 more variables: cyl <dbl>, disp <dbl>,
#   hp <dbl>, drat <dbl>, wt <dbl>, qsec <dbl>, vs <dbl>, am <dbl>,
#   gear <dbl>, carb <dbl>

这是我想出的解决方案。欢迎随意改进或批评,

    # Sheets to pull from the example workbook shipped with readxl.
    sh_to_impt <- c("iris", "mtcars")
    path <- readxl_example("datasets.xlsx")

    # Name every sheet after itself, keep only the requested ones, and stack
    # them; the element names end up in the `sheet` id column.
    set_names(excel_sheets(path))[sh_to_impt] %>%
      map_df(
        function(sheet_name) read_excel(path = path, sheet = sheet_name),
        .id = "sheet"
      )
    
    # A tibble: 182 x 17
    sheet Sepal.Length Sepal.Width Petal.Length Petal.Width Species   mpg
    <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>   <dbl>
      1 iris           5.1         3.5          1.4         0.2 setosa     NA
    2 iris           4.9         3            1.4         0.2 setosa     NA
    3 iris           4.7         3.2          1.3         0.2 setosa     NA
    4 iris           4.6         3.1          1.5         0.2 setosa     NA
    5 iris           5           3.6          1.4         0.2 setosa     NA
    6 iris           5.4         3.9          1.7         0.4 setosa     NA
    7 iris           4.6         3.4          1.4         0.3 setosa     NA
    8 iris           5           3.4          1.5         0.2 setosa     NA
    9 iris           4.4         2.9          1.4         0.2 setosa     NA
    10 iris           4.9         3.1          1.5         0.1 setosa     NA
    # ... with 172 more rows, and 10 more variables: cyl <dbl>, disp <dbl>,
    #   hp <dbl>, drat <dbl>, wt <dbl>, qsec <dbl>, vs <dbl>, am <dbl>,
    #   gear <dbl>, carb <dbl>

在 R 中表示 Excel 工作簿的“整洁”方式是作为嵌套数据框,例如:

# A tibble: 2 x 2                                                                                        
  sheet  data              
  <chr>  <list>            
1 iris   <tibble [150 × 5]>
2 mtcars <tibble [32 × 11]>

所以我会通过将 sheet 名称存储在列中,将数据作为附加列读取,然后取消嵌套来简化您的第一种方法:

library("readxl")
library("dplyr")
library("purrr")
library("tidyr")

sh_to_impt <- c("iris", "mtcars")
path <- readxl_example("datasets.xlsx")

# One row per requested sheet, with that sheet's contents held in a
# list-column, which is then expanded into ordinary rows.
tibble(sheet = sh_to_impt) %>%
  mutate(
    data = map(sheet, function(sheet_name) read_xlsx(path, sheet = sheet_name))
  ) %>%
  unnest(data)
#> # A tibble: 182 x 17
#>    sheet Sepal.Length Sepal.Width Petal.Length Petal.Width Species   mpg   cyl
#>    <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>   <dbl> <dbl>
#>  1 iris           5.1         3.5          1.4         0.2 setosa     NA    NA
#>  2 iris           4.9         3            1.4         0.2 setosa     NA    NA
#>  3 iris           4.7         3.2          1.3         0.2 setosa     NA    NA
#>  4 iris           4.6         3.1          1.5         0.2 setosa     NA    NA
#>  5 iris           5           3.6          1.4         0.2 setosa     NA    NA
#>  6 iris           5.4         3.9          1.7         0.4 setosa     NA    NA
#>  7 iris           4.6         3.4          1.4         0.3 setosa     NA    NA
#>  8 iris           5           3.4          1.5         0.2 setosa     NA    NA
#>  9 iris           4.4         2.9          1.4         0.2 setosa     NA    NA
#> 10 iris           4.9         3.1          1.5         0.1 setosa     NA    NA
#> # … with 172 more rows, and 9 more variables: disp <dbl>, hp <dbl>, drat <dbl>,
#> #   wt <dbl>, qsec <dbl>, vs <dbl>, am <dbl>, gear <dbl>, carb <dbl>

如果您事先不知道您想要的 sheets,或者想要不同的子集用于不同的分析,您也可以导入所有这些并在取消嵌套之前进行过滤:

# Read every sheet into a list-column first so the subset can be chosen
# afterwards, then expand only the rows for the requested sheets.
tibble(sheet = excel_sheets(path)) %>%
  mutate(
    data = map(sheet, function(sheet_name) read_xlsx(path, sheet = sheet_name))
  ) %>%
  filter(is.element(sheet, sh_to_impt)) %>%
  unnest(data)
#> # A tibble: 182 x 17
#>    sheet Sepal.Length Sepal.Width Petal.Length Petal.Width Species   mpg   cyl
#>    <chr>        <dbl>       <dbl>        <dbl>       <dbl> <chr>   <dbl> <dbl>
#>  1 iris           5.1         3.5          1.4         0.2 setosa     NA    NA
#>  2 iris           4.9         3            1.4         0.2 setosa     NA    NA
#>  3 iris           4.7         3.2          1.3         0.2 setosa     NA    NA
#>  4 iris           4.6         3.1          1.5         0.2 setosa     NA    NA
#>  5 iris           5           3.6          1.4         0.2 setosa     NA    NA
#>  6 iris           5.4         3.9          1.7         0.4 setosa     NA    NA
#>  7 iris           4.6         3.4          1.4         0.3 setosa     NA    NA
#>  8 iris           5           3.4          1.5         0.2 setosa     NA    NA
#>  9 iris           4.4         2.9          1.4         0.2 setosa     NA    NA
#> 10 iris           4.9         3.1          1.5         0.1 setosa     NA    NA
#> # … with 172 more rows, and 9 more variables: disp <dbl>, hp <dbl>, drat <dbl>,
#> #   wt <dbl>, qsec <dbl>, vs <dbl>, am <dbl>, gear <dbl>, carb <dbl>
library(tidyverse)
library(readxl)

# Keep only the sheets whose names exactly match an entry of `sh_to_impt`,
# then read and stack them.
# `str_subset()` is not suitable for this: it would treat `sh_to_impt` as
# regex patterns paired element-wise with the sheet names, so the result
# would depend on sheet order and could match partial names.
excel_sheets(path) %>%
  keep(~ .x %in% sh_to_impt) %>%
  map_df(
    ~ read_excel(path = path, sheet = .x)
  )