将数据框列表合并为一个数据框并一步汇总

Combine list of dataframes into one dataframe and summarize in one step

我想将一个由多个数据框组成的列表合并(combine/reduce)为一个数据框,并且希望在同一步中完成数据汇总。这些输出来自模拟;因此,每个数据框都具有相同的结构(即一个分组列,后面跟着 2 个数值列,每次输出的数值各不相同)。

最小可重现示例

# Minimal reproducible example: three simulation outputs that share one
# layout (Group, Top_Group, Efficiency); only the value columns differ.
df_list <- lapply(
  list(
    list(
      top = c(1L, 0L, 0L),
      eff = c(0.464688158128411, 0.652386676520109, 0.282913417555392)
    ),
    list(
      top = c(0L, 1L, 0L),
      eff = c(0.120292583014816, 0.0356206290889531, 0.37196880299598)
    ),
    list(
      top = c(0L, 1L, 0L),
      eff = c(0.261322160949931, 0.383351784432307, 0.754808459430933)
    )
  ),
  function(run) {
    # Attributes are spelled out explicitly so the object carries the
    # tibble classes without needing the tibble package to build it.
    structure(
      list(
        Group = c("A", "B", "C"),
        Top_Group = run$top,
        Efficiency = run$eff
      ),
      row.names = c(NA, -3L),
      class = c("tbl_df", "tbl", "data.frame")
    )
  }
)

我试过的

我知道我可以将数据绑定在一起,然后分组并汇总。

library(tidyverse)

# Stack every simulation output, then collapse to one row per Group:
# Top_Group is totalled and Efficiency keeps the largest value seen.
bind_rows(df_list) %>%
  group_by(Group) %>%
  summarise(
    Top_Group = sum(Top_Group),
    Efficiency = max(Efficiency)
  )

#  Group Top_Group Efficiency
#  <chr>     <int>      <dbl>
#1 A             1      0.465
#2 B             2      0.652
#3 C             0      0.755

我希望能用类似 reduce 的方法来实现;但是,我只能让它作用于提取出来的单独一列(如此处所示的 Top_Group),不确定如何(如果可能的话)让它作用于所有列,并返回一个数据框而不是向量。

# Extract the second column of each data frame, then add the resulting
# vectors element-wise. The result is a bare vector, not a data frame.
reduce(map(df_list, 2), `+`)

# [1] 1 2 0

预期输出

  Group Top_Group Efficiency
  <chr>     <int>      <dbl>
1 A             1      0.465
2 B             2      0.652
3 C             0      0.755

根据 OP 的代码,不同的列使用了不同的函数。因此,我们可能需要分别对各列应用相应的逐元素函数

library(purrr)
# Fold the list pairwise, applying a different function to each column:
# keep column 1 of the accumulator, add column 2, and take the
# element-wise maximum of column 3.
# Columns and rows are both combined by position, so this assumes every
# data frame has the same column order and lists the groups in the same
# row order.
# tibble() is namespace-qualified because only purrr is attached above;
# the unqualified call fails unless tibble/tidyverse is already loaded.
reduce(
  df_list,
  function(acc, nxt) {
    tibble::tibble(acc[1], acc[2] + nxt[2], pmax(acc[3], nxt[3]))
  }
)

-输出

# A tibble: 3 × 3
  Group Top_Group Efficiency
  <chr>     <int>      <dbl>
1 A             1      0.465
2 B             2      0.652
3 C             0      0.755

你几乎成功了!查看 ?unnest()

# library() stops with an error if the package is missing, whereas
# require() only returns FALSE and lets the script fail later on.
library(tidyverse)

# Wrap the list in a one-column tibble (selected below as `.`), expand
# that list-column into regular rows and columns, then summarise per Group.
df_list %>%
  tibble() %>%
  unnest(cols = c(.)) %>%
  group_by(Group) %>%
  summarise(Top_Group = sum(Top_Group), Efficiency = max(Efficiency))

# A tibble: 3 x 3
  Group Top_Group Efficiency
  <chr>     <int>      <dbl>
1 A             1      0.465
2 B             2      0.652
3 C             0      0.755

在基础 R 中你可以这样做

# Base R fold: keep column 1, add column 2, element-wise max of column 3.
# Works by position, so it assumes identically ordered rows and columns.
Reduce(
  function(acc, nxt) {
    cbind(acc[1], acc[2] + nxt[2], pmax(acc[3], nxt[3]))
  },
  df_list
)
#>   Group Top_Group Efficiency
#> 1     A         1  0.4646882
#> 2     B         2  0.6523867
#> 3     C         0  0.7548085

另一种解决方案:先用 reduce 配合 full_join 进行合并,然后用 rowwise 加 summarize 逐行汇总:

library(tidyverse)
# Join all data frames side by side on Group; the repeated value columns
# keep the "Top_Group" / "Efficiency" prefix, so each row can then be
# summarised across every column that starts with that prefix.
reduce(df_list, full_join, by = "Group") %>%
  rowwise() %>%
  summarize(
    Group = Group,
    Top_Group = sum(c_across(starts_with("Top_Group"))),
    Efficiency = max(c_across(starts_with("Efficiency")))
  ) %>%
  ungroup()

# A tibble: 3 x 3
  Group Top_Group Efficiency
  <chr>     <int>      <dbl>
1 A             1      0.465
2 B             2      0.652
3 C             0      0.755

使用 aggregate + ave 的基础 R 选项
# Stack the data frames, then let a single sum() aggregate both columns.
# Efficiency is first replaced, within each Group, by max / group size,
# so summing it per group gives back the group maximum (up to
# floating-point rounding).
aggregate(
  . ~ Group,
  transform(
    do.call(rbind, df_list),
    Efficiency = ave(Efficiency, Group, FUN = function(v) max(v) / length(v))
  ),
  sum
)

aggregate + sapply

# Step 1: aggregate with `list` so each Group keeps all of its values.
# Step 2: reduce those per-group values with a column-specific function.
transform(
  aggregate(
    . ~ Group,
    do.call(rbind, df_list),
    list
  ),
  Top_Group = sapply(Top_Group, sum),
  Efficiency = sapply(Efficiency, max)
)

给予

  Group Top_Group Efficiency
1     A         1  0.4646882
2     B         2  0.6523867
3     C         0  0.7548085

另一个选项是使用 data.table,我们可以使用 rbindlist,然后汇总列。

library(data.table)

# Stack the list into one data.table, then compute both summaries per
# Group inside a single `[` call.
rbindlist(df_list)[
  ,
  .(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)),
  by = "Group"
]

输出

   Group Top_Group Efficiency
1:     A         1  0.4646882
2:     B         2  0.6523867
3:     C         0  0.7548085

基准

出于好奇(因为这个问题与效率无关),我也运行了所有当前答案,看看哪个最快。基础 R 选项很快,但显然 data.table 选项是最快的。

代码

# Time every answer on the same `df_list`; each label is the answer
# author's name. Assumes the tidyverse and data.table are already attached.
microbenchmark::microbenchmark(
  akrun = reduce(
    df_list,
    ~ tibble(.x[1], .x[2] + .y[2], pmax(.x[3], .y[3]))
  ),
  AllanCameron = Reduce(
    function(a, b) cbind(a[1], a[2] + b[2], pmax(a[3], b[3])),
    df_list
  ),
  ThomasIsCoding_agg_ave = {
    aggregate(
      . ~ Group,
      transform(
        do.call(
          rbind,
          df_list
        ),
        Efficiency = ave(
          Efficiency,
          Group,
          FUN = function(x) max(x) / length(x)
        )
      ), sum
    )
  },
  ThomasIsCoding_agg_sapply = {
    transform(
      aggregate(. ~ Group, do.call(rbind, df_list), list),
      Top_Group = sapply(Top_Group, sum),
      Efficiency = sapply(Efficiency, max)
    )
  },
  deschen = df_list %>%
    reduce(full_join, by = "Group") %>%
    rowwise() %>%
    summarize(
      Group = Group,
      Top_Group = sum(c_across(starts_with("Top_Group"))),
      Efficiency = max(c_across(starts_with("Efficiency")))
    ) %>%
    ungroup(),
  TomHoel = df_list %>%
    tibble() %>%
    unnest(cols = c(.)) %>%
    group_by(Group) %>%
    summarise(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)),
  AndrewGB_tidyverse = df_list %>%
    bind_rows() %>%
    group_by(Group) %>%
    summarise(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)),
  AndrewGB_datatable = rbindlist(df_list)[
    ,
    list(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)),
    by = .(Group)
  ],
  times = 2000
)

又一个基础 R 方案,晚了几个月:

# Stack the data frames, overwrite each value column with its per-Group
# summary (ave() repeats the summary on every row of the group), then
# keep only the first row of each Group.
subset(
  transform(
    do.call(rbind, df_list),
    Top_Group = ave(Top_Group, Group, FUN = sum),
    Efficiency = ave(Efficiency, Group, FUN = max)
  ),
  !duplicated(Group)
)