将数据框列表合并为一个数据框并一步汇总
Combine list of dataframes into one dataframe and summarize in one step
我想把一个数据框列表合并(combine/reduce)为一个数据框,同时在同一步中完成数据汇总。输出来自模拟,因此每个数据框的结构都相同(即一个分组列,外加 2 个数值列,各次输出的数值各不相同)。
最小可重现示例
# Three simulation outputs sharing one layout: a Group key, an integer
# Top_Group column, and a numeric Efficiency column.
df_list <- Map(
  function(top_group, efficiency) {
    structure(
      list(
        Group = c("A", "B", "C"),
        Top_Group = top_group,
        Efficiency = efficiency
      ),
      row.names = c(NA, -3L),
      class = c("tbl_df", "tbl", "data.frame")
    )
  },
  list(c(1L, 0L, 0L), c(0L, 1L, 0L), c(0L, 1L, 0L)),
  list(
    c(0.464688158128411, 0.652386676520109, 0.282913417555392),
    c(0.120292583014816, 0.0356206290889531, 0.37196880299598),
    c(0.261322160949931, 0.383351784432307, 0.754808459430933)
  )
)
我试过的
我知道我可以将数据绑定在一起,然后分组并汇总。
library(tidyverse)
# Stack all runs into one long table, then collapse to one row per Group:
# Top_Group is totalled and Efficiency keeps its largest value.
bind_rows(df_list) %>%
  group_by(Group) %>%
  summarise(
    Top_Group = sum(Top_Group),
    Efficiency = max(Efficiency)
  )
# Group Top_Group Efficiency
# <chr> <int> <dbl>
#1 A 1 0.465
#2 B 2 0.652
#3 C 0 0.755
我希望有办法使用类似 reduce 的东西;但是,我只能让它作用于取出的单独一列(如此处所示的 Top_Group),不确定如何(如果可能的话)同时处理所有列,并返回数据框而不是向量。
# Pull the second column (Top_Group) out of every data frame, then add the
# resulting vectors element-wise.
reduce(map(df_list, 2), `+`)
# [1] 1 2 0
预期输出
Group Top_Group Efficiency
<chr> <int> <dbl>
1 A 1 0.465
2 B 2 0.652
3 C 0 0.755
根据 OP 的代码,不同的列使用了不同的函数。因此,我们可能需要对各列分别应用相应的逐元素函数:
library(purrr)
# Fold the list pairwise: keep Group from the accumulator, add the Top_Group
# columns, and take the element-wise maximum of the Efficiency columns.
reduce(
  df_list,
  function(acc, nxt) tibble(acc[1], acc[2] + nxt[2], pmax(acc[3], nxt[3]))
)
输出
# A tibble: 3 × 3
Group Top_Group Efficiency
<chr> <int> <dbl>
1 A 1 0.465
2 B 2 0.652
3 C 0 0.755
你几乎成功了!查看 ?unnest()
# Load with library(): it stops with an error if the package is missing,
# whereas require() only warns and returns FALSE, so the failure would surface
# later as a "could not find function" error.
library(tidyverse)
# tibble() wraps the list in a single list-column named `.`; unnest() expands
# it into one long table, which is then summarised per Group.
df_list %>%
  tibble() %>%
  unnest(cols = c(.)) %>%
  group_by(Group) %>%
  summarise(Top_Group = sum(Top_Group), Efficiency = max(Efficiency))
# A tibble: 3 x 3
Group Top_Group Efficiency
<chr> <int> <dbl>
1 A 1 0.465
2 B 2 0.652
3 C 0 0.755
在基础 R 中你可以这样做
# Base R fold: keep Group from the accumulator, add the Top_Group columns, and
# take the element-wise maximum of the Efficiency columns.
Reduce(
  function(acc, nxt) {
    cbind(acc[1], acc[2] + nxt[2], pmax(acc[3], nxt[3]))
  },
  df_list
)
#> Group Top_Group Efficiency
#> 1 A 1 0.4646882
#> 2 B 2 0.6523867
#> 3 C 0 0.7548085
另一种解决方案:先用 reduce 配合 full_join 把各数据框按 Group 连接起来,再用 rowwise 和 summarize 逐行汇总:
library(tidyverse)
# Join the runs side by side on Group, then fold each row's Top_Group* columns
# with sum() and its Efficiency* columns with max().
reduce(df_list, full_join, by = "Group") %>%
  rowwise() %>%
  summarize(
    Group = Group,
    Top_Group = sum(c_across(starts_with("Top_Group"))),
    Efficiency = max(c_across(starts_with("Efficiency")))
  ) %>%
  ungroup()
# A tibble: 3 x 3
Group Top_Group Efficiency
<chr> <int> <dbl>
1 A 1 0.465
2 B 2 0.652
3 C 0 0.755
使用 aggregate + ave 的基础 R 方案
# Base R: pre-scale Efficiency so that a single sum() serves both columns.
# ave() replaces each Efficiency with its group maximum divided by the group
# size; summing those equal shares in aggregate() adds back up to the maximum
# (up to floating-point rounding), while Top_Group is summed directly.
aggregate(
  . ~ Group,
  data = transform(
    do.call(rbind, df_list),
    Efficiency = ave(Efficiency, Group, FUN = function(v) max(v) / length(v))
  ),
  FUN = sum
)
或者使用 aggregate + sapply
# aggregate(..., FUN = list) gathers each group's values into list cells; the
# two columns are then collapsed separately. vapply() is used as the
# type-stable counterpart of sapply(): it always returns a numeric vector and
# errors if a cell does not reduce to a single number.
transform(
  aggregate(. ~ Group, do.call(rbind, df_list), list),
  Top_Group = vapply(Top_Group, sum, numeric(1)),
  Efficiency = vapply(Efficiency, max, numeric(1))
)
得到
Group Top_Group Efficiency
1 A 1 0.4646882
2 B 2 0.6523867
3 C 0 0.7548085
另一个选项是使用 data.table:先用 rbindlist 把列表合并,然后按组汇总各列。
library(data.table)
# Stack the runs into one data.table, then aggregate per Group in `j`.
rbindlist(df_list)[
  ,
  .(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)),
  by = Group
]
输出
Group Top_Group Efficiency
1: A 1 0.4646882
2: B 2 0.6523867
3: C 0 0.7548085
基准
出于好奇(这个问题本身与效率无关),我也对目前所有的答案做了基准测试,看看哪个最快。基础 R 方案很快,但 data.table 方案显然是最快的。
代码
# Time every approach from this thread against the same `df_list`.
# Each named argument is one candidate expression; `times = 2000` runs each
# expression 2000 times. The labels appear to be the answer authors' names
# (not verifiable from this file).
# The expressions call reduce(), tibble(), full_join(), unnest(), bind_rows()
# and rbindlist() unqualified, so tidyverse and data.table must be attached
# before running this.
microbenchmark::microbenchmark(akrun = reduce(df_list, ~ tibble(.x[1], .x[2] + .y[2], pmax(.x[3], .y[3]))),
AllanCameron = Reduce(function(a, b) cbind(a[1], a[2] + b[2], pmax(a[3], b[3])), df_list),
ThomasIsCoding_agg_ave = {aggregate(
. ~ Group,
transform(
do.call(
rbind,
df_list
),
Efficiency = ave(
Efficiency,
Group,
FUN = function(x) max(x) / length(x)
)
), sum
)},
ThomasIsCoding_agg_sapply = {transform(
aggregate(. ~ Group, do.call(rbind, df_list), list),
Top_Group = sapply(Top_Group, sum),
Efficiency = sapply(Efficiency, max)
)
},
deschen = df_list %>%
reduce(full_join, by = "Group") %>%
rowwise() %>%
summarize(Group = Group,
Top_Group = sum(c_across(starts_with("Top_Group"))),
Efficiency = max(c_across(starts_with("Efficiency")))) %>%
ungroup(),
TomHoel = df_list %>%
tibble() %>%
unnest(cols = c(.)) %>%
group_by(Group) %>%
summarise(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)),
AndrewGB_tidyverse = df_list %>%
bind_rows() %>%
group_by(Group) %>%
summarise(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)),
AndrewGB_datatable = rbindlist(df_list)[, list(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)), by=.(Group)],
times = 2000
)
又一个基础 R 方案,晚了几个月:
# Base R: overwrite each value column with its per-group aggregate via ave(),
# then keep only the first row seen for every Group.
subset(
  within(do.call(rbind, df_list), {
    Top_Group <- ave(Top_Group, Group, FUN = sum)
    Efficiency <- ave(Efficiency, Group, FUN = max)
  }),
  !duplicated(Group)
)
我想把一个数据框列表合并(combine/reduce)为一个数据框,同时在同一步中完成数据汇总。输出来自模拟,因此每个数据框的结构都相同(即一个分组列,外加 2 个数值列,各次输出的数值各不相同)。
最小可重现示例
# Three simulation outputs sharing one layout: a Group key, an integer
# Top_Group column, and a numeric Efficiency column.
df_list <- Map(
  function(top_group, efficiency) {
    structure(
      list(
        Group = c("A", "B", "C"),
        Top_Group = top_group,
        Efficiency = efficiency
      ),
      row.names = c(NA, -3L),
      class = c("tbl_df", "tbl", "data.frame")
    )
  },
  list(c(1L, 0L, 0L), c(0L, 1L, 0L), c(0L, 1L, 0L)),
  list(
    c(0.464688158128411, 0.652386676520109, 0.282913417555392),
    c(0.120292583014816, 0.0356206290889531, 0.37196880299598),
    c(0.261322160949931, 0.383351784432307, 0.754808459430933)
  )
)
我试过的
我知道我可以将数据绑定在一起,然后分组并汇总。
library(tidyverse)
# Stack all runs into one long table, then collapse to one row per Group:
# Top_Group is totalled and Efficiency keeps its largest value.
bind_rows(df_list) %>%
  group_by(Group) %>%
  summarise(
    Top_Group = sum(Top_Group),
    Efficiency = max(Efficiency)
  )
# Group Top_Group Efficiency
# <chr> <int> <dbl>
#1 A 1 0.465
#2 B 2 0.652
#3 C 0 0.755
我希望有办法使用类似 reduce 的东西;但是,我只能让它作用于取出的单独一列(如此处所示的 Top_Group),不确定如何(如果可能的话)同时处理所有列,并返回数据框而不是向量。
# Pull the second column (Top_Group) out of every data frame, then add the
# resulting vectors element-wise.
reduce(map(df_list, 2), `+`)
# [1] 1 2 0
预期输出
Group Top_Group Efficiency
<chr> <int> <dbl>
1 A 1 0.465
2 B 2 0.652
3 C 0 0.755
根据 OP 的代码,不同的列使用了不同的函数。因此,我们可能需要对各列分别应用相应的逐元素函数:
library(purrr)
# Fold the list pairwise: keep Group from the accumulator, add the Top_Group
# columns, and take the element-wise maximum of the Efficiency columns.
reduce(
  df_list,
  function(acc, nxt) tibble(acc[1], acc[2] + nxt[2], pmax(acc[3], nxt[3]))
)
输出
# A tibble: 3 × 3
Group Top_Group Efficiency
<chr> <int> <dbl>
1 A 1 0.465
2 B 2 0.652
3 C 0 0.755
你几乎成功了!查看 ?unnest()
# Load with library(): it stops with an error if the package is missing,
# whereas require() only warns and returns FALSE, so the failure would surface
# later as a "could not find function" error.
library(tidyverse)
# tibble() wraps the list in a single list-column named `.`; unnest() expands
# it into one long table, which is then summarised per Group.
df_list %>%
  tibble() %>%
  unnest(cols = c(.)) %>%
  group_by(Group) %>%
  summarise(Top_Group = sum(Top_Group), Efficiency = max(Efficiency))
# A tibble: 3 x 3
Group Top_Group Efficiency
<chr> <int> <dbl>
1 A 1 0.465
2 B 2 0.652
3 C 0 0.755
在基础 R 中你可以这样做
# Base R fold: keep Group from the accumulator, add the Top_Group columns, and
# take the element-wise maximum of the Efficiency columns.
Reduce(
  function(acc, nxt) {
    cbind(acc[1], acc[2] + nxt[2], pmax(acc[3], nxt[3]))
  },
  df_list
)
#> Group Top_Group Efficiency
#> 1 A 1 0.4646882
#> 2 B 2 0.6523867
#> 3 C 0 0.7548085
另一种解决方案:先用 reduce 配合 full_join 把各数据框按 Group 连接起来,再用 rowwise 和 summarize 逐行汇总:
library(tidyverse)
# Join the runs side by side on Group, then fold each row's Top_Group* columns
# with sum() and its Efficiency* columns with max().
reduce(df_list, full_join, by = "Group") %>%
  rowwise() %>%
  summarize(
    Group = Group,
    Top_Group = sum(c_across(starts_with("Top_Group"))),
    Efficiency = max(c_across(starts_with("Efficiency")))
  ) %>%
  ungroup()
# A tibble: 3 x 3
Group Top_Group Efficiency
<chr> <int> <dbl>
1 A 1 0.465
2 B 2 0.652
3 C 0 0.755
使用 aggregate + ave 的基础 R 方案
# Base R: pre-scale Efficiency so that a single sum() serves both columns.
# ave() replaces each Efficiency with its group maximum divided by the group
# size; summing those equal shares in aggregate() adds back up to the maximum
# (up to floating-point rounding), while Top_Group is summed directly.
aggregate(
  . ~ Group,
  data = transform(
    do.call(rbind, df_list),
    Efficiency = ave(Efficiency, Group, FUN = function(v) max(v) / length(v))
  ),
  FUN = sum
)
或者使用 aggregate + sapply
# aggregate(..., FUN = list) gathers each group's values into list cells; the
# two columns are then collapsed separately. vapply() is used as the
# type-stable counterpart of sapply(): it always returns a numeric vector and
# errors if a cell does not reduce to a single number.
transform(
  aggregate(. ~ Group, do.call(rbind, df_list), list),
  Top_Group = vapply(Top_Group, sum, numeric(1)),
  Efficiency = vapply(Efficiency, max, numeric(1))
)
得到
Group Top_Group Efficiency
1 A 1 0.4646882
2 B 2 0.6523867
3 C 0 0.7548085
另一个选项是使用 data.table:先用 rbindlist 把列表合并,然后按组汇总各列。
library(data.table)
# Stack the runs into one data.table, then aggregate per Group in `j`.
rbindlist(df_list)[
  ,
  .(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)),
  by = Group
]
输出
Group Top_Group Efficiency
1: A 1 0.4646882
2: B 2 0.6523867
3: C 0 0.7548085
基准
出于好奇(这个问题本身与效率无关),我也对目前所有的答案做了基准测试,看看哪个最快。基础 R 方案很快,但 data.table 方案显然是最快的。
代码
# Time every approach from this thread against the same `df_list`.
# Each named argument is one candidate expression; `times = 2000` runs each
# expression 2000 times. The labels appear to be the answer authors' names
# (not verifiable from this file).
# The expressions call reduce(), tibble(), full_join(), unnest(), bind_rows()
# and rbindlist() unqualified, so tidyverse and data.table must be attached
# before running this.
microbenchmark::microbenchmark(akrun = reduce(df_list, ~ tibble(.x[1], .x[2] + .y[2], pmax(.x[3], .y[3]))),
AllanCameron = Reduce(function(a, b) cbind(a[1], a[2] + b[2], pmax(a[3], b[3])), df_list),
ThomasIsCoding_agg_ave = {aggregate(
. ~ Group,
transform(
do.call(
rbind,
df_list
),
Efficiency = ave(
Efficiency,
Group,
FUN = function(x) max(x) / length(x)
)
), sum
)},
ThomasIsCoding_agg_sapply = {transform(
aggregate(. ~ Group, do.call(rbind, df_list), list),
Top_Group = sapply(Top_Group, sum),
Efficiency = sapply(Efficiency, max)
)
},
deschen = df_list %>%
reduce(full_join, by = "Group") %>%
rowwise() %>%
summarize(Group = Group,
Top_Group = sum(c_across(starts_with("Top_Group"))),
Efficiency = max(c_across(starts_with("Efficiency")))) %>%
ungroup(),
TomHoel = df_list %>%
tibble() %>%
unnest(cols = c(.)) %>%
group_by(Group) %>%
summarise(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)),
AndrewGB_tidyverse = df_list %>%
bind_rows() %>%
group_by(Group) %>%
summarise(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)),
AndrewGB_datatable = rbindlist(df_list)[, list(Top_Group = sum(Top_Group), Efficiency = max(Efficiency)), by=.(Group)],
times = 2000
)
又一个基础 R 方案,晚了几个月:
# Base R: overwrite each value column with its per-group aggregate via ave(),
# then keep only the first row seen for every Group.
subset(
  within(do.call(rbind, df_list), {
    Top_Group <- ave(Top_Group, Group, FUN = sum)
    Efficiency <- ave(Efficiency, Group, FUN = max)
  }),
  !duplicated(Group)
)