dplyr 汇总小计
dplyr summarize with subtotals
excel 中数据透视表的一大优点是它们会自动提供小计。首先,我想知道 dplyr 中是否已经创建了可以实现此目的的任何东西。如果没有,实现它的最简单方法是什么?
在下面的示例中,我显示了按气缸数和化油器数计算的平均排量。对于每组气缸 (4,6,8),我想查看该组的平均排量(或总排量,或任何其他汇总统计数据)。
library(dplyr)
mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl carb mean(disp)
1 4 1 91.38
2 4 2 116.60
3 6 1 241.50
4 6 4 163.80
5 6 6 145.00
6 8 2 345.50
7 8 3 275.80
8 8 4 405.50
9 8 8 301.00
data.table 很笨拙,但这是一种方式:
library(data.table)
DT <- data.table(mtcars)
rbind(
DT[,.(mean(disp)), by=.(cyl,carb)],
DT[,.(mean(disp), carb=NA), by=.(cyl) ],
DT[,.(mean(disp), cyl=NA), by=.(carb)]
)[order(cyl,carb)]
这给了
cyl carb V1
1: 4 1 91.3800
2: 4 2 116.6000
3: 4 NA 105.1364
4: 6 1 241.5000
5: 6 4 163.8000
6: 6 6 145.0000
7: 6 NA 183.3143
8: 8 2 345.5000
9: 8 3 275.8000
10: 8 4 405.5000
11: 8 8 301.0000
12: 8 NA 353.1000
13: NA 1 134.2714
14: NA 2 208.1600
15: NA 3 275.8000
16: NA 4 308.8200
17: NA 6 145.0000
18: NA 8 301.0000
我宁愿看到类似 R table
的结果,但不知道有什么函数可以实现。
dplyr @akrun 找到了这个类似的代码
bind_rows(
mtcars %>%
group_by(cyl, carb) %>%
summarise(Mean= mean(disp)),
mtcars %>%
group_by(cyl) %>%
summarise(carb=NA, Mean=mean(disp)),
mtcars %>%
group_by(carb) %>%
summarise(cyl=NA, Mean=mean(disp))
) %>% arrange(cyl, carb)
我们可以将重复操作包装在一个函数中
library(lazyeval)
f1 <- function(df, grp, Var, func){
FUN <- match.fun(func)
df %>%
group_by_(.dots=grp) %>%
summarise_(interp(~FUN(v), v=as.name(Var)))
}
m1 <- f1(mtcars, c('carb', 'cyl'), 'disp', 'mean')
m2 <- f1(mtcars, 'carb', 'disp', 'mean')
m3 <- f1(mtcars, 'cyl', 'disp', 'mean')
bind_rows(list(m1, m2, m3)) %>%
arrange(cyl, carb) %>%
rename(Mean=`FUN(disp)`)
carb cyl Mean
1 1 4 91.3800
2 2 4 116.6000
3 NA 4 105.1364
4 1 6 241.5000
5 4 6 163.8000
6 6 6 145.0000
7 NA 6 183.3143
8 2 8 345.5000
9 3 8 275.8000
10 4 8 405.5000
11 8 8 301.0000
12 NA 8 353.1000
13 1 NA 134.2714
14 2 NA 208.1600
15 3 NA 275.8000
16 4 NA 308.8200
17 6 NA 145.0000
18 8 NA 301.0000
data.table 的 rbindlist
和 fill
:
中的任何一个选项都可以变得不那么难看
rbindlist(list(
mtcars %>% group_by(cyl) %>% summarise(mean(disp)),
mtcars %>% group_by(carb) %>% summarise(mean(disp)),
mtcars %>% group_by(cyl,carb) %>% summarise(mean(disp))
),fill=TRUE) %>% arrange(cyl,carb)
rbindlist(list(
DT[,mean(disp),by=.(cyl,carb)],
DT[,mean(disp),by=.(cyl)],
DT[,mean(disp),by=.(carb)]
),fill=TRUE)[order(cyl,carb)]
我知道这可能不是一个非常优雅的解决方案,但我希望它能有所帮助:
p <-mtcars %>% group_by(cyl,carb)
p$cyl <- as.factor(p$cyl)
average_disp <- sapply(1:length(levels(p$cyl)), function(x)mean(subset(p,p$cyl==levels(p$cyl)[x])$disp))
df <- data.frame(levels(p$cyl),average_disp)
colnames(df)[1]<-"cyl"
#> df
# cyl average_disp
#1 4 105.1364
#2 6 183.3143
#3 8 353.1000
(编辑:在 p
的定义中稍作修改后,现在产生与 @Frank 和 @akrun 的解决方案相同的结果)
类似于 table
和 addmargins
的内容(尽管实际上是 data.frame
)
library(dplyr)
library(reshape2)
out <- bind_cols(
mtcars %>% group_by(cyl, carb) %>%
summarise(mu = mean(disp)) %>%
dcast(cyl ~ carb),
(mtcars %>% group_by(cyl) %>% summarise(Total=mean(disp)))[,2]
)
margin <- t((mtcars %>% group_by(carb) %>% summarise(Total=mean(disp)))[,2])
rbind(out, c(NA, margin, mean(mtcars$disp))) %>%
`rownames<-`(c(paste("cyl", c(4,6,8)), "Total")) # add some row names
# cyl 1 2 3 4 6 8 Total
# cyl 4 4 91.3800 116.60 NA NA NA NA 105.1364
# cyl 6 6 241.5000 NA NA 163.80 145 NA 183.3143
# cyl 8 8 NA 345.50 275.8 405.50 NA 301 353.1000
# Total NA 134.2714 208.16 275.8 308.82 145 301 230.7219
底行是列边距,名为 1:8 的列是碳水化合物,总计是行边距。
也可以通过简单地加入两组结果:
cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean(disp))
joined <- full_join(cyl_carb, cyl)
result <- arrange(joined, cyl)
result
给出:
Source: local data frame [12 x 3]
Groups: cyl [3]
cyl carb mean(disp)
(dbl) (dbl) (dbl)
1 4 1 91.3800
2 4 2 116.6000
3 4 NA 105.1364
4 6 1 241.5000
5 6 4 163.8000
6 6 6 145.0000
7 6 NA 183.3143
8 8 2 345.5000
9 8 3 275.8000
10 8 4 405.5000
11 8 8 301.0000
12 8 NA 353.1000
或附加一列:
cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean.cyl = mean(disp))
joined <- full_join(cyl_carb, cyl)
joined
给出:
Source: local data frame [9 x 4]
Groups: cyl [?]
cyl carb mean(disp) mean.cyl
(dbl) (dbl) (dbl) (dbl)
1 4 1 91.38 105.1364
2 4 2 116.60 105.1364
3 6 1 241.50 183.3143
4 6 4 163.80 183.3143
5 6 6 145.00 183.3143
6 8 2 345.50 353.1000
7 8 3 275.80 353.1000
8 8 4 405.50 353.1000
9 8 8 301.00 353.1000
这是一个简单的 one-liner 在 data_frame:
中创建边距
library(plyr)
library(dplyr)
# Margins without labels
mtcars %>%
group_by(cyl,carb) %>%
summarize(Mean_Disp=mean(disp)) %>%
do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), Mean_Disp=sum(.$Mean_Disp, na.rm=T))))
输出:
Source: local data frame [12 x 3]
Groups: cyl [3]
cyl carb Mean_Disp
<dbl> <dbl> <dbl>
1 4 1 91.38
2 4 2 116.60
3 4 NA 207.98
4 6 1 241.50
5 6 4 163.80
6 6 6 145.00
7 6 NA 550.30
8 8 2 345.50
9 8 3 275.80
10 8 4 405.50
11 8 8 301.00
12 8 NA 1327.80
您还可以为汇总统计信息添加标签,例如:
mtcars %>%
group_by(cyl,carb) %>%
summarize(Mean_Disp=mean(disp)) %>%
do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), carb=c("Total", "Mean"), Mean_Disp=c(sum(.$Mean_Disp, na.rm=T), mean(.$Mean_Disp, na.rm=T)))))
输出:
Source: local data frame [15 x 3]
Groups: cyl [3]
cyl carb Mean_Disp
<dbl> <chr> <dbl>
1 4 1 91.38
2 4 2 116.60
3 4 Total 207.98
4 4 Mean 103.99
5 6 1 241.50
6 6 4 163.80
7 6 6 145.00
8 6 Total 550.30
9 6 Mean 183.43
10 8 2 345.50
11 8 3 275.80
12 8 4 405.50
13 8 8 301.00
14 8 Total 1327.80
15 8 Mean 331.95
您可以在 ddply
周围使用此包装器,它对每个可能的边距应用 ddply
,并 rbinds
使用其通常的输出结果。
边缘化所有分组因素:
mtcars %>% ddplym(.variables = .(cyl, carb), .fun = summarise, mean(disp))
仅边缘化 carb
:
mtcars %>% ddplym(
.variables = .(carb),
.fun = function(data) data %>% group_by(cyl) %>% summarise(mean(disp)))
包装器:
require(plyr)
require(dplyr)
ddplym <- function(.data, .variables, .fun, ..., .margin = TRUE, .margin_name = '(all)') {
if (.margin) {
df <- .ddplym(.data, .variables, .fun, ..., .margin_name = .margin_name)
} else {
df <- ddply(.data, .variables, .fun, ...)
if (.variables %>% length == 0) {
df$.id <- NULL
}
}
return(df)
}
.ddplym <- function(.data,
.variables,
.fun,
...,
.margin_name = '(all)'
) {
.variables <- as.quoted(.variables)
n <- length(.variables)
var_combn_idx <- lapply(0:n, function(x) {
combn(1:n, n - x) %>% alply(2, c)
}) %>%
unlist(recursive = FALSE, use.names = FALSE)
data_list <- lapply(var_combn_idx, function(x) {
data <- ddply(.data, .variables[x], .fun, ...)
# drop '.id' column created when no variables to split by specified
if (!length(.variables[x]))
data <- data[, -1, drop = FALSE]
return(data)
})
# workaround for NULL .variables
if (unlist(.variables) %>% is.null && names(.variables) %>% is.null) {
data_list <- data_list[1]
} else if (unlist(.variables) %>% is.null) {
data_list <- data_list[2]
}
if (length(data_list) > 1) {
data_list <- lapply(data_list, function(data)
rbind_pre(
data = data,
colnames = colnames(data_list[[1]]),
fill = .margin_name
))
}
Reduce(rbind, data_list)
}
rbind_pre <- function(data, colnames, fill = NA) {
colnames_fill <- setdiff(colnames, colnames(data))
data_fill <- matrix(fill,
nrow = nrow(data),
ncol = length(colnames_fill)) %>%
as.data.frame %>% setNames(colnames_fill)
cbind(data, data_fill)[, colnames]
}
data.table
版本高于v1.11
library(data.table)
cubed <- cube(
as.data.table(mtcars),
.(`mean(disp)` = mean(disp)),
by = c("cyl", "carb")
)
#> cyl carb mean(disp)
#> 1: 6 4 163.8000
#> 2: 4 1 91.3800
#> 3: 6 1 241.5000
#> 4: 8 2 345.5000
#> 5: 8 4 405.5000
#> 6: 4 2 116.6000
#> 7: 8 3 275.8000
#> 8: 6 6 145.0000
#> 9: 8 8 301.0000
#> 10: 6 NA 183.3143
#> 11: 4 NA 105.1364
#> 12: 8 NA 353.1000
#> 13: NA 4 308.8200
#> 14: NA 1 134.2714
#> 15: NA 2 208.1600
#> 16: NA 3 275.8000
#> 17: NA 6 145.0000
#> 18: NA 8 301.0000
#> 19: NA NA 230.7219
res <- dcast(
cubed,
cyl ~ carb,
value.var = "mean(disp)"
)
#> cyl NA 1 2 3 4 6 8
#> 1: NA 230.7219 134.2714 208.16 275.8 308.82 145 301
#> 2: 4 105.1364 91.3800 116.60 NA NA NA NA
#> 3: 6 183.3143 241.5000 NA NA 163.80 145 NA
#> 4: 8 353.1000 NA 345.50 275.8 405.50 NA 301
由 reprex package (v0.3.0)
于 2020 年 2 月 20 日创建
来源:https://jozef.io/r912-datatable-grouping-sets/
library(kableExtra)
options(knitr.kable.NA = "")
res <- as.data.frame(res)
names(res)[2] <- "overall"
res[1, 1] <- "overall"
x <- kable(res, "html")
x <- kable_styling(x, "striped")
add_header_above(x, c(" " = 1, "carb" = ncol(res) - 1))
分享我的方法(如果有帮助的话)。这种方法允许非常容易地添加自定义小计和总计。
data = data.frame( thing1=sprintf("group %i",trunc(runif(200,0,5))),
thing2=sprintf("type %i",trunc(runif(200,0,5))),
value=rnorm(200,0,1) )
data %>%
group_by( thing1, thing2 ) %>%
summarise( sum=sum(value),
count=n() ) %>%
ungroup() %>%
bind_rows(.,
identity(.) %>%
group_by(thing1) %>%
summarise( aggregation="sub total",
sum=sum(sum),
count=sum(count) ) %>%
ungroup(),
identity(.) %>%
summarise( aggregation="total",
sum=sum(sum),
count=sum(count) ) %>%
ungroup() ) %>%
arrange( thing1, thing2, aggregation ) %>%
select( aggregation, everything() )
经过长期努力解决非常相似的问题,我发现 data.table
提供了最简单、最快速的解决方案,完全符合这个目的
data.table::cube(
data.table::as.data.table(mtcars),
.(mean_disp = mean(disp)),
by = c("cyl","carb"))
cyl carb mean_disp
1: 6 4 163.8000
2: 4 1 91.3800
3: 6 1 241.5000
4: 8 2 345.5000
5: 8 4 405.5000
6: 4 2 116.6000
7: 8 3 275.8000
8: 6 6 145.0000
9: 8 8 301.0000
10: 6 NA 183.3143
11: 4 NA 105.1364
12: 8 NA 353.1000
13: NA 4 308.8200
14: NA 1 134.2714
15: NA 2 208.1600
16: NA 3 275.8000
17: NA 6 145.0000
18: NA 8 301.0000
19: NA NA 230.7219
NA
项是您要查找的小计;例如在第 10 行,183.31
结果是所有 6 个气缸的平均值。最后一行带有双 NA
的是具有总体平均值的行。
从那里,您可以轻松地用 as_tibble()
包装结果以跳回 dplyr
语义世界。
遇到了同样的问题,我正在开发一个函数来解决这个问题(参见 https://github.com/jrf1111/TCCD/blob/dev/R/with_subtotals.R)。它仍处于开发阶段,但它可以满足您的需求。
mtcars %>%
group_by(cyl, carb) %>%
with_subtotals() %>%
summarize(mean(disp))
# A tibble: 19 x 3
# Groups: cyl [5]
cyl carb `mean(disp)`
<chr> <chr> <dbl>
1 4 1 91.4
2 4 2 117.
3 4 subtotal 105.
4 6 1 242.
5 6 4 164.
6 6 6 145
7 6 subtotal 183.
8 8 2 346.
9 8 3 276.
10 8 4 406.
11 8 8 301
12 8 subtotal 353.
13 subtotal 1 134.
14 subtotal 2 208.
15 subtotal 3 276.
16 subtotal 4 309.
17 subtotal 6 145
18 subtotal 8 301
19 total total 231.
excel 中数据透视表的一大优点是它们会自动提供小计。首先,我想知道 dplyr 中是否已经创建了可以实现此目的的任何东西。如果没有,实现它的最简单方法是什么?
在下面的示例中,我显示了按气缸数和化油器数计算的平均排量。对于每组气缸 (4,6,8),我想查看该组的平均排量(或总排量,或任何其他汇总统计数据)。
library(dplyr)
mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl carb mean(disp)
1 4 1 91.38
2 4 2 116.60
3 6 1 241.50
4 6 4 163.80
5 6 6 145.00
6 8 2 345.50
7 8 3 275.80
8 8 4 405.50
9 8 8 301.00
data.table 很笨拙,但这是一种方式:
library(data.table)
DT <- data.table(mtcars)
rbind(
DT[,.(mean(disp)), by=.(cyl,carb)],
DT[,.(mean(disp), carb=NA), by=.(cyl) ],
DT[,.(mean(disp), cyl=NA), by=.(carb)]
)[order(cyl,carb)]
这给了
cyl carb V1
1: 4 1 91.3800
2: 4 2 116.6000
3: 4 NA 105.1364
4: 6 1 241.5000
5: 6 4 163.8000
6: 6 6 145.0000
7: 6 NA 183.3143
8: 8 2 345.5000
9: 8 3 275.8000
10: 8 4 405.5000
11: 8 8 301.0000
12: 8 NA 353.1000
13: NA 1 134.2714
14: NA 2 208.1600
15: NA 3 275.8000
16: NA 4 308.8200
17: NA 6 145.0000
18: NA 8 301.0000
我宁愿看到类似 R table
的结果,但不知道有什么函数可以实现。
dplyr @akrun 找到了这个类似的代码
bind_rows(
mtcars %>%
group_by(cyl, carb) %>%
summarise(Mean= mean(disp)),
mtcars %>%
group_by(cyl) %>%
summarise(carb=NA, Mean=mean(disp)),
mtcars %>%
group_by(carb) %>%
summarise(cyl=NA, Mean=mean(disp))
) %>% arrange(cyl, carb)
我们可以将重复操作包装在一个函数中
library(lazyeval)
f1 <- function(df, grp, Var, func){
FUN <- match.fun(func)
df %>%
group_by_(.dots=grp) %>%
summarise_(interp(~FUN(v), v=as.name(Var)))
}
m1 <- f1(mtcars, c('carb', 'cyl'), 'disp', 'mean')
m2 <- f1(mtcars, 'carb', 'disp', 'mean')
m3 <- f1(mtcars, 'cyl', 'disp', 'mean')
bind_rows(list(m1, m2, m3)) %>%
arrange(cyl, carb) %>%
rename(Mean=`FUN(disp)`)
carb cyl Mean
1 1 4 91.3800
2 2 4 116.6000
3 NA 4 105.1364
4 1 6 241.5000
5 4 6 163.8000
6 6 6 145.0000
7 NA 6 183.3143
8 2 8 345.5000
9 3 8 275.8000
10 4 8 405.5000
11 8 8 301.0000
12 NA 8 353.1000
13 1 NA 134.2714
14 2 NA 208.1600
15 3 NA 275.8000
16 4 NA 308.8200
17 6 NA 145.0000
18 8 NA 301.0000
data.table 的 rbindlist
和 fill
:
rbindlist(list(
mtcars %>% group_by(cyl) %>% summarise(mean(disp)),
mtcars %>% group_by(carb) %>% summarise(mean(disp)),
mtcars %>% group_by(cyl,carb) %>% summarise(mean(disp))
),fill=TRUE) %>% arrange(cyl,carb)
rbindlist(list(
DT[,mean(disp),by=.(cyl,carb)],
DT[,mean(disp),by=.(cyl)],
DT[,mean(disp),by=.(carb)]
),fill=TRUE)[order(cyl,carb)]
我知道这可能不是一个非常优雅的解决方案,但我希望它能有所帮助:
p <-mtcars %>% group_by(cyl,carb)
p$cyl <- as.factor(p$cyl)
average_disp <- sapply(1:length(levels(p$cyl)), function(x)mean(subset(p,p$cyl==levels(p$cyl)[x])$disp))
df <- data.frame(levels(p$cyl),average_disp)
colnames(df)[1]<-"cyl"
#> df
# cyl average_disp
#1 4 105.1364
#2 6 183.3143
#3 8 353.1000
(编辑:在 p
的定义中稍作修改后,现在产生与 @Frank 和 @akrun 的解决方案相同的结果)
类似于 table
和 addmargins
的内容(尽管实际上是 data.frame
)
library(dplyr)
library(reshape2)
out <- bind_cols(
mtcars %>% group_by(cyl, carb) %>%
summarise(mu = mean(disp)) %>%
dcast(cyl ~ carb),
(mtcars %>% group_by(cyl) %>% summarise(Total=mean(disp)))[,2]
)
margin <- t((mtcars %>% group_by(carb) %>% summarise(Total=mean(disp)))[,2])
rbind(out, c(NA, margin, mean(mtcars$disp))) %>%
`rownames<-`(c(paste("cyl", c(4,6,8)), "Total")) # add some row names
# cyl 1 2 3 4 6 8 Total
# cyl 4 4 91.3800 116.60 NA NA NA NA 105.1364
# cyl 6 6 241.5000 NA NA 163.80 145 NA 183.3143
# cyl 8 8 NA 345.50 275.8 405.50 NA 301 353.1000
# Total NA 134.2714 208.16 275.8 308.82 145 301 230.7219
底行是列边距,名为 1:8 的列是碳水化合物,总计是行边距。
也可以通过简单地加入两组结果:
cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean(disp))
joined <- full_join(cyl_carb, cyl)
result <- arrange(joined, cyl)
result
给出:
Source: local data frame [12 x 3]
Groups: cyl [3]
cyl carb mean(disp)
(dbl) (dbl) (dbl)
1 4 1 91.3800
2 4 2 116.6000
3 4 NA 105.1364
4 6 1 241.5000
5 6 4 163.8000
6 6 6 145.0000
7 6 NA 183.3143
8 8 2 345.5000
9 8 3 275.8000
10 8 4 405.5000
11 8 8 301.0000
12 8 NA 353.1000
或附加一列:
cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean.cyl = mean(disp))
joined <- full_join(cyl_carb, cyl)
joined
给出:
Source: local data frame [9 x 4]
Groups: cyl [?]
cyl carb mean(disp) mean.cyl
(dbl) (dbl) (dbl) (dbl)
1 4 1 91.38 105.1364
2 4 2 116.60 105.1364
3 6 1 241.50 183.3143
4 6 4 163.80 183.3143
5 6 6 145.00 183.3143
6 8 2 345.50 353.1000
7 8 3 275.80 353.1000
8 8 4 405.50 353.1000
9 8 8 301.00 353.1000
这是一个简单的 one-liner 在 data_frame:
中创建边距library(plyr)
library(dplyr)
# Margins without labels
mtcars %>%
group_by(cyl,carb) %>%
summarize(Mean_Disp=mean(disp)) %>%
do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), Mean_Disp=sum(.$Mean_Disp, na.rm=T))))
输出:
Source: local data frame [12 x 3]
Groups: cyl [3]
cyl carb Mean_Disp
<dbl> <dbl> <dbl>
1 4 1 91.38
2 4 2 116.60
3 4 NA 207.98
4 6 1 241.50
5 6 4 163.80
6 6 6 145.00
7 6 NA 550.30
8 8 2 345.50
9 8 3 275.80
10 8 4 405.50
11 8 8 301.00
12 8 NA 1327.80
您还可以为汇总统计信息添加标签,例如:
mtcars %>%
group_by(cyl,carb) %>%
summarize(Mean_Disp=mean(disp)) %>%
do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), carb=c("Total", "Mean"), Mean_Disp=c(sum(.$Mean_Disp, na.rm=T), mean(.$Mean_Disp, na.rm=T)))))
输出:
Source: local data frame [15 x 3]
Groups: cyl [3]
cyl carb Mean_Disp
<dbl> <chr> <dbl>
1 4 1 91.38
2 4 2 116.60
3 4 Total 207.98
4 4 Mean 103.99
5 6 1 241.50
6 6 4 163.80
7 6 6 145.00
8 6 Total 550.30
9 6 Mean 183.43
10 8 2 345.50
11 8 3 275.80
12 8 4 405.50
13 8 8 301.00
14 8 Total 1327.80
15 8 Mean 331.95
您可以在 ddply
周围使用此包装器,它对每个可能的边距应用 ddply
,并 rbinds
使用其通常的输出结果。
边缘化所有分组因素:
mtcars %>% ddplym(.variables = .(cyl, carb), .fun = summarise, mean(disp))
仅边缘化 carb
:
mtcars %>% ddplym(
.variables = .(carb),
.fun = function(data) data %>% group_by(cyl) %>% summarise(mean(disp)))
包装器:
require(plyr)
require(dplyr)
ddplym <- function(.data, .variables, .fun, ..., .margin = TRUE, .margin_name = '(all)') {
if (.margin) {
df <- .ddplym(.data, .variables, .fun, ..., .margin_name = .margin_name)
} else {
df <- ddply(.data, .variables, .fun, ...)
if (.variables %>% length == 0) {
df$.id <- NULL
}
}
return(df)
}
.ddplym <- function(.data,
.variables,
.fun,
...,
.margin_name = '(all)'
) {
.variables <- as.quoted(.variables)
n <- length(.variables)
var_combn_idx <- lapply(0:n, function(x) {
combn(1:n, n - x) %>% alply(2, c)
}) %>%
unlist(recursive = FALSE, use.names = FALSE)
data_list <- lapply(var_combn_idx, function(x) {
data <- ddply(.data, .variables[x], .fun, ...)
# drop '.id' column created when no variables to split by specified
if (!length(.variables[x]))
data <- data[, -1, drop = FALSE]
return(data)
})
# workaround for NULL .variables
if (unlist(.variables) %>% is.null && names(.variables) %>% is.null) {
data_list <- data_list[1]
} else if (unlist(.variables) %>% is.null) {
data_list <- data_list[2]
}
if (length(data_list) > 1) {
data_list <- lapply(data_list, function(data)
rbind_pre(
data = data,
colnames = colnames(data_list[[1]]),
fill = .margin_name
))
}
Reduce(rbind, data_list)
}
rbind_pre <- function(data, colnames, fill = NA) {
colnames_fill <- setdiff(colnames, colnames(data))
data_fill <- matrix(fill,
nrow = nrow(data),
ncol = length(colnames_fill)) %>%
as.data.frame %>% setNames(colnames_fill)
cbind(data, data_fill)[, colnames]
}
data.table
版本高于v1.11
library(data.table)
cubed <- cube(
as.data.table(mtcars),
.(`mean(disp)` = mean(disp)),
by = c("cyl", "carb")
)
#> cyl carb mean(disp)
#> 1: 6 4 163.8000
#> 2: 4 1 91.3800
#> 3: 6 1 241.5000
#> 4: 8 2 345.5000
#> 5: 8 4 405.5000
#> 6: 4 2 116.6000
#> 7: 8 3 275.8000
#> 8: 6 6 145.0000
#> 9: 8 8 301.0000
#> 10: 6 NA 183.3143
#> 11: 4 NA 105.1364
#> 12: 8 NA 353.1000
#> 13: NA 4 308.8200
#> 14: NA 1 134.2714
#> 15: NA 2 208.1600
#> 16: NA 3 275.8000
#> 17: NA 6 145.0000
#> 18: NA 8 301.0000
#> 19: NA NA 230.7219
res <- dcast(
cubed,
cyl ~ carb,
value.var = "mean(disp)"
)
#> cyl NA 1 2 3 4 6 8
#> 1: NA 230.7219 134.2714 208.16 275.8 308.82 145 301
#> 2: 4 105.1364 91.3800 116.60 NA NA NA NA
#> 3: 6 183.3143 241.5000 NA NA 163.80 145 NA
#> 4: 8 353.1000 NA 345.50 275.8 405.50 NA 301
由 reprex package (v0.3.0)
于 2020 年 2 月 20 日创建来源:https://jozef.io/r912-datatable-grouping-sets/
library(kableExtra)
options(knitr.kable.NA = "")
res <- as.data.frame(res)
names(res)[2] <- "overall"
res[1, 1] <- "overall"
x <- kable(res, "html")
x <- kable_styling(x, "striped")
add_header_above(x, c(" " = 1, "carb" = ncol(res) - 1))
分享我的方法(如果有帮助的话)。这种方法允许非常容易地添加自定义小计和总计。
data = data.frame( thing1=sprintf("group %i",trunc(runif(200,0,5))),
thing2=sprintf("type %i",trunc(runif(200,0,5))),
value=rnorm(200,0,1) )
data %>%
group_by( thing1, thing2 ) %>%
summarise( sum=sum(value),
count=n() ) %>%
ungroup() %>%
bind_rows(.,
identity(.) %>%
group_by(thing1) %>%
summarise( aggregation="sub total",
sum=sum(sum),
count=sum(count) ) %>%
ungroup(),
identity(.) %>%
summarise( aggregation="total",
sum=sum(sum),
count=sum(count) ) %>%
ungroup() ) %>%
arrange( thing1, thing2, aggregation ) %>%
select( aggregation, everything() )
经过长期努力解决非常相似的问题,我发现 data.table
提供了最简单、最快速的解决方案,完全符合这个目的
data.table::cube(
data.table::as.data.table(mtcars),
.(mean_disp = mean(disp)),
by = c("cyl","carb"))
cyl carb mean_disp
1: 6 4 163.8000
2: 4 1 91.3800
3: 6 1 241.5000
4: 8 2 345.5000
5: 8 4 405.5000
6: 4 2 116.6000
7: 8 3 275.8000
8: 6 6 145.0000
9: 8 8 301.0000
10: 6 NA 183.3143
11: 4 NA 105.1364
12: 8 NA 353.1000
13: NA 4 308.8200
14: NA 1 134.2714
15: NA 2 208.1600
16: NA 3 275.8000
17: NA 6 145.0000
18: NA 8 301.0000
19: NA NA 230.7219
NA
项是您要查找的小计;例如在第 10 行,183.31
结果是所有 6 个气缸的平均值。最后一行带有双 NA
的是具有总体平均值的行。
从那里,您可以轻松地用 as_tibble()
包装结果以跳回 dplyr
语义世界。
遇到了同样的问题,我正在开发一个函数来解决这个问题(参见 https://github.com/jrf1111/TCCD/blob/dev/R/with_subtotals.R)。它仍处于开发阶段,但它可以满足您的需求。
mtcars %>%
group_by(cyl, carb) %>%
with_subtotals() %>%
summarize(mean(disp))
# A tibble: 19 x 3
# Groups: cyl [5]
cyl carb `mean(disp)`
<chr> <chr> <dbl>
1 4 1 91.4
2 4 2 117.
3 4 subtotal 105.
4 6 1 242.
5 6 4 164.
6 6 6 145
7 6 subtotal 183.
8 8 2 346.
9 8 3 276.
10 8 4 406.
11 8 8 301
12 8 subtotal 353.
13 subtotal 1 134.
14 subtotal 2 208.
15 subtotal 3 276.
16 subtotal 4 309.
17 subtotal 6 145
18 subtotal 8 301
19 total total 231.