使用 purrr 将每月命名的数据框列表组合成每年的数据框列表

Question

每个月我都会创建一个具有相同名称的命名数据框列表。

我想使用 purrr 组合多个月份的列表，以获得一个数据帧列表，每个数据帧中包含所有月份的数据。

（在下面的 reprex（可重现示例）中，各数据框中的数据在所有月份都相同，而实际数据显然并非如此）

# Create some example data: one data frame per method.

# method1: columns input, output, result (four rows).
method1 <- read.csv(
  text = "input,output,result
A,a,1
A,b,3
B,b,8
A,d,11
"
)

# method2: a different layout from method1 (rainy_days and count columns).
method2 <- read.csv(
  text = "rainy_days, count
1,4
3,3
5,8
7,10
"
)

# method3: same shape as method1 but with different column names.
# NOTE(review): `in` is a reserved word in R; with read.csv's default name
# checking the first column is expected to come out as `in.` - confirm.
method3 <- read.csv(
  text = "in,out,rslt
A,a,6
A,b,5
B,b,1
A,d,12
"
)

# method4: same columns as method1, only two rows.
method4 <- read.csv(
  text = "input,output,result
A,a,1
C,c,4
"
)

每月命名列表

# One named list per month; the names identify the method of each data frame.
month1 <- list(method1 = method1, method2 = method2, method3 = method3)
month2 <- list(method1 = method1, method2 = method2, method3 = method3)
# month3 contains no method2 entry.
month3 <- list(method1 = method1, method3 = method3)
# month4 additionally contains method4.
month4 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3,
  method4 = method4
)

# > str(month1)
# List of 3
#  $ method1:'data.frame':   4 obs. of  3 variables:
#   ..$ input : chr [1:4] "A" "A" "B" "A"
#   ..$ output: chr [1:4] "a" "b" "b" "d"
#   ..$ result: int [1:4] 1 3 8 11
#  $ method2:'data.frame':   4 obs. of  2 variables:
#   ..$ rainy_days: int [1:4] 1 3 5 7
#   ..$ count     : int [1:4] 4 3 8 10
#  $ method3:'data.frame':   4 obs. of  3 variables:
#   ..$ in. : chr [1:4] "A" "A" "B" "A"
#   ..$ out : chr [1:4] "a" "b" "b" "d"
#   ..$ rslt: int [1:4] 6 5 1 12

我想替换下面这段代码：

# Manual per-method combination that a more general solution should replace.
# Elements are selected by NAME rather than by position: positions are not
# stable across months (month3 has only method1 and method3, so month3[[3]]
# is out of bounds and month3[[2]] is method3, not method2).
current_year_method1 <- bind_rows(
  month1[["method1"]], month2[["method1"]],
  month3[["method1"]], month4[["method1"]]
)
# method2 is absent from month3.
current_year_method2 <- bind_rows(
  month1[["method2"]], month2[["method2"]], month4[["method2"]]
)
current_year_method3 <- bind_rows(
  month1[["method3"]], month2[["method3"]],
  month3[["method3"]], month4[["method3"]]
)
# method4 only exists in month4.
current_year_method4 <- bind_rows(month4[["method4"]])

# Yearly result: one combined data frame per method, named by method.
year_all_data <- list(
  method1 = current_year_method1,
  method2 = current_year_method2,
  method3 = current_year_method3,
  method4 = current_year_method4
)

改用一个从各月份列表出发的、更通用的解决方案。

一种变通方法是先展平列表，然后使用已有建议的解决方案之一：

# Workaround: pool the data frames of all months into one flat list ...
year_list_flat <- flatten(list(month1, month2, month3, month4))

# ... then group the entries by their (method) name and row-bind each group.
all_year <- map(split(year_list_flat, names(year_list_flat)), bind_rows)

# > str(all_year)
# List of 4
#  $ method1:'data.frame':   16 obs. of  3 variables:
#   ..$ input : chr [1:16] "A" "A" "B" "A" ...
#   ..$ output: chr [1:16] "a" "b" "b" "d" ...
#   ..$ result: int [1:16] 1 3 8 11 1 3 8 11 1 3 ...
#  $ method2:'data.frame':   12 obs. of  2 variables:
#   ..$ rainy_days: int [1:12] 1 3 5 7 1 3 5 7 1 3 ...
#   ..$ count     : int [1:12] 4 3 8 10 4 3 8 10 4 3 ...
#  $ method3:'data.frame':   16 obs. of  3 variables:
#   ..$ in. : chr [1:16] "A" "A" "B" "A" ...
#   ..$ out : chr [1:16] "a" "b" "b" "d" ...
#   ..$ rslt: int [1:16] 6 5 1 12 6 5 1 12 6 5 ...
#  $ method4:'data.frame':   2 obs. of  3 variables:
#   ..$ input : chr [1:2] "A" "C"
#   ..$ output: chr [1:2] "a" "c"
#   ..$ result: int [1:2] 1 4

但是由于使用了 flatten，我失去了使用每月文件的名称作为指标变量的可能性。

从概念上讲它也是不同的，因为您首先将所有列表组合成一个列表，然后再组合数据帧。

有没有不使用flatten的优雅方式？

Answer 1

这是一个使用 bind_rows 和 map(x, bind_rows) 的解决方案：

library(tidyverse)
  
# Example data: four methods that all share the columns input, output, result.

method1 <- read.csv(
  text = "input,output,result
A,a,1
A,b,3
B,b,8
A,d,11
"
)

method2 <- read.csv(
  text = "input,output,result
A,a,4
A,b,3
B,b,8
A,d,10
"
)

method3 <- read.csv(
  text = "input,output,result
A,a,6
A,b,5
B,b,1
A,d,12
"
)

method4 <- read.csv(
  text = "input,output,result
A,a,1
A,b,2
B,b,3
C,c,4
"
)

# Monthly named lists: list names are the method names.
month1 <- list(method1 = method1, method2 = method2, method3 = method3)
month2 <- list(method1 = method1, method2 = method2, method3 = method3)
# No method2 in month3.
month3 <- list(method1 = method1, method3 = method3)
# month4 also carries method4.
month4 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3,
  method4 = method4
)

# For a single month's list, bind_rows() stacks its data frames. `.id` stores
# the name of each list element, which here is the METHOD name, so the id
# column is called "method" (calling it "month" would mislabel the values).
bind_rows(month1, .id = "method")
#>     method input output result
#> 1  method1     A      a      1
#> 2  method1     A      b      3
#> 3  method1     B      b      8
#> 4  method1     A      d     11
#> 5  method2     A      a      4
#> 6  method2     A      b      3
#> 7  method2     B      b      8
#> 8  method2     A      d     10
#> 9  method3     A      a      6
#> 10 method3     A      b      5
#> 11 method3     B      b      1
#> 12 method3     A      d     12

# Two-level bind for a list of lists: the inner map() stacks the methods
# inside every month (method name -> "method" column); the outer bind_rows()
# then stacks the months (month name -> "month" column).
df <- bind_rows(
  map(lst(month1, month2, month3, month4), bind_rows, .id = "method"),
  .id = "month"
)
print(df)
#>     month  method input output result
#> 1  month1 method1     A      a      1
#> 2  month1 method1     A      b      3
#> 3  month1 method1     B      b      8
#> 4  month1 method1     A      d     11
#> 5  month1 method2     A      a      4
#> 6  month1 method2     A      b      3
#> 7  month1 method2     B      b      8
#> 8  month1 method2     A      d     10
#> 9  month1 method3     A      a      6
#> 10 month1 method3     A      b      5
#> 11 month1 method3     B      b      1
#> 12 month1 method3     A      d     12
#> 13 month2 method1     A      a      1
#> 14 month2 method1     A      b      3
#> 15 month2 method1     B      b      8
#> 16 month2 method1     A      d     11
#> 17 month2 method2     A      a      4
#> 18 month2 method2     A      b      3
#> 19 month2 method2     B      b      8
#> 20 month2 method2     A      d     10
#> 21 month2 method3     A      a      6
#> 22 month2 method3     A      b      5
#> 23 month2 method3     B      b      1
#> 24 month2 method3     A      d     12
#> 25 month3 method1     A      a      1
#> 26 month3 method1     A      b      3
#> 27 month3 method1     B      b      8
#> 28 month3 method1     A      d     11
#> 29 month3 method3     A      a      6
#> 30 month3 method3     A      b      5
#> 31 month3 method3     B      b      1
#> 32 month3 method3     A      d     12
#> 33 month4 method1     A      a      1
#> 34 month4 method1     A      b      3
#> 35 month4 method1     B      b      8
#> 36 month4 method1     A      d     11
#> 37 month4 method2     A      a      4
#> 38 month4 method2     A      b      3
#> 39 month4 method2     B      b      8
#> 40 month4 method2     A      d     10
#> 41 month4 method3     A      a      6
#> 42 month4 method3     A      b      5
#> 43 month4 method3     B      b      1
#> 44 month4 method3     A      d     12
#> 45 month4 method4     A      a      1
#> 46 month4 method4     A      b      2
#> 47 month4 method4     B      b      3
#> 48 month4 method4     C      c      4

创建于 2022-03-14，由 reprex 包 (v2.0.1) 生成

Answer 2

如果我正确理解了您的问题，一种可能的解决方案是结合使用 plyr 包和 purrr 包。我想这部分是“给定的”：

# The "given" part: one named list of method data frames per month.
month1 <- list(method1 = method1, method2 = method2, method3 = method3)
month2 <- list(method1 = method1, method2 = method2, method3 = method3)
# month3 lacks method2.
month3 <- list(method1 = method1, method3 = method3)
# month4 adds method4.
month4 <- list(
  method1 = method1,
  method2 = method2,
  method3 = method3,
  method4 = method4
)

我们可以将其组合成一个命名列表（在您的每月运行代码中，您只需增加一个列表项）：

# Two-level named list: months on the outer level, methods on the inner one.
d <- list(
  month1 = month1,
  month2 = month2,
  month3 = month3,
  month4 = month4
)

这为您提供了一个两层深度的命名列表。为了使 ldply 函数正常工作（名称成为新的 id 字段），必须在最低级别以方法命名列表，在最高级别以月份命名列表：

# Attach plyr BEFORE dplyr: attaching plyr after dplyr masks dplyr verbs of
# the same name (e.g. mutate(), summarise()), and plyr warns about that order.
library(plyr)
library(dplyr)

# First, use purrr to turn each month's sub-list into one data frame, with the
# list names (methods) stored in a new "method" column.
ir <- purrr::map(d, ~ plyr::ldply(.x, .id = "method")) %>%
  # Second, stack the resulting one-level list into a single data frame, with
  # the list names (months) stored in a new "month" column.
  plyr::ldply(.id = "month")

    month  method input output result
1  month1 method1     A      a      1
2  month1 method1     A      b      3
3  month1 method1     B      b      8
4  month1 method1     A      d     11
5  month1 method2     A      a      4
6  month1 method2     A      b      3
7  month1 method2     B      b      8
(some lines were omitted)

# Split the combined data frame into a list holding one tibble per method
# (group_split() groups by the given column before splitting).
dplyr::group_split(ir, method)

[[1]]
# A tibble: 16 x 5
   month  method  input output result
   <fct>  <fct>   <chr> <chr>   <int>
 1 month1 method1 A     a           1
 2 month1 method1 A     b           3
 3 month1 method1 B     b           8
 4 month1 method1 A     d          11
 5 month2 method1 A     a           1
 6 month2 method1 A     b           3
 7 month2 method1 B     b           8
 8 month2 method1 A     d          11
 (lines and further list items omitted)

编辑

由于输入数据从列相同的数据框列表改成了列不同的数据框列表，这里给出一种沿用上述逻辑的做法。不过这一次必须先把所有列转换为字符型才能奏效，这意味着输出也全部是字符型；考虑到数据结构，这应该很容易解决。相对于第一个版本的修改是：先把所有数据框转换为长格式，使列名成为一个新列（name），而所有取值集中在一列（value）中；最后再把数据转回宽格式以恢复原来的形状。为此只需要一个小的辅助列来标识数据框的行，也就是行号（rn）：

# Variant for data frames whose columns differ between methods.
# Steps, as written below:
#   1. For every data frame in every month: convert all columns to character,
#      add the row number as `rn`, and pivot to long format (every column
#      except `rn`).
#   2. Stack the long data frames of each month (list names -> "method"
#      column), then stack the months (list names -> "month" column).
#   3. Split the result by method and pivot each piece back to wide format,
#      dropping the helper column `rn`.
# Because of the as.character() conversion, the value columns of the result
# are all character.
# Note on the nested formulas: `.x` is a month's list in the outer map(), a
# single data frame in the inner map(), and a column inside across().
res <- purrr::map(d, ~purrr::map(.x, ~ dplyr::mutate(.x,
                                                     dplyr::across(dplyr::everything(), ~ as.character(.x)), 
                                                     rn = dplyr::row_number()) %>% 
                                           tidyr::pivot_longer(-rn)) %>%
                                           plyr::ldply(.id = "method")) %>% 
    plyr::ldply(.id = "month") %>%  
    dplyr::group_by(method) %>% 
    dplyr::group_split() %>%
    purrr::map(~ .x %>% 
                     tidyr::pivot_wider(names_from = "name", values_from = "value") %>%
                     dplyr::select(-rn))
# Name the list items according to the unique `method` value found in each
# item's data frame.
names(res) <- purrr::map_chr(res, ~ unique(as.character(.x$method)))

res

$method1
# A tibble: 16 x 5
   month  method  input output result
   <fct>  <fct>   <chr> <chr>  <chr> 
 1 month1 method1 A     a      1     
 2 month1 method1 A     b      3     
 3 month1 method1 B     b      8     
 4 month1 method1 A     d      11    
(many lines and other list items omitted)

使用 purrr 将每月命名的数据框列表组合成每年的数据框列表

Combine list of monthly named list of dataframes into one yearly name list of dataframes using purrr

r

purrr