如何使用 ddply 以编程方式汇总多列?
How to summarize over multiple columns programmatically using ddply?
是否可以在不使用 eval + parse 的情况下从函数的参数中指定要使用 ddply 聚合哪些列?这是我目前所拥有的:
# Example data: two numeric measures (x, y) and two grouping columns, six rows.
x <- c(2, 4, 3, 1, 5, 7)
y <- c(3, 2, 6, 3, 4, 6)
group1 <- c("A", "A", "A", "A", "B", "B")
group2 <- c("X", "X", "Y", "Y", "Z", "X")
# Column names are spelled out; they match the vector names used above.
data <- data.frame(group1 = group1, group2 = group2, x = x, y = y)
这是我想要的输出:
# Reference version that produces the desired result for the example data.
# The averaged columns are hard-coded as `x` and `y`, so `toAverage` is
# accepted but never consulted; only `toGroup` drives the grouping.
aggFunction <- function(dataframe, toAverage, toGroup) {
  ddply(
    dataframe,
    toGroup,
    summarise,
    x = mean(x),
    y = mean(y)
  )
}
# Desired result: the mean of x and y within each (group1, group2) combination.
aggFunction(data, c("x", "y"), c("group1", "group2"))
# group1 group2 x y
# 1 A X 3 2.5
# 2 A Y 2 4.5
# 3 B X 7 6.0
# 4 B Z 5 4.0
这是我使用 eval(parse()) 的解决方案:
# Builds the ddply() call as text, with one `<col> = mean(<col>)` argument per
# entry of `toAverage`, then parses and evaluates that text inside this
# function so that `dataframe` and `toGroup` resolve to the arguments.
aggFunction <- function(dataframe, toAverage, toGroup) {
  meanTerms <- paste0(toAverage, " = mean(", toAverage, ")", collapse = ", ")
  callText <- paste("ddply(dataframe, toGroup, here(summarize),", meanTerms, ")")
  result <- eval(parse(text = callText))
  result
}
这给了我想要的输出。
我想知道是否有更好的方法来做到这一点。我知道可以使用 do.call() 和 get(),但我用它们所做的尝试都没有成功。
这是一次尝试;
get(string) 不起作用,但 here(summarise) 可以让我拿到字符串的值。不幸的是,这意味着 ddply 把它们当作字符串处理:
# Attempt: hand the pasted "<col> = mean(<col>), ..." text to summarise as an
# ordinary extra argument. The text is passed through as a character value.
aggFunction <- function(dataframe, toAverage, toGroup) {
  meanTerms <- paste0(toAverage, " = mean(", toAverage, ")", collapse = ", ")
  ddply(dataframe, toGroup, here(summarise), meanTerms)
}
# With the string-passing attempt, the pasted text is not interpreted as code:
# it comes back verbatim in a single column (shown as `..2`) for every group.
aggFunction(data, c("x", "y"), c("group1", "group2"))
# group1 group2 ..2
# 1 A X x = mean(x), y = mean(y)
# 2 A Y x = mean(x), y = mean(y)
# 3 B X x = mean(x), y = mean(y)
# 4 B Z x = mean(x), y = mean(y)
也试过do.call,但还是被当作字符串处理:
# Attempt: assemble the ddply() arguments in a list and dispatch through
# do.call(). The pasted text is printed for inspection and then passed along
# as a plain character value.
aggFunction <- function(dataframe, toAverage, toGroup) {
  meanTerms <- paste0(toAverage, " = mean(", toAverage, ")", collapse = ", ")
  print(meanTerms)
  do.call(ddply, list(dataframe, toGroup, here(summarise), meanTerms))
}
# With the do.call() attempt the text is again returned as data; this time the
# result column is named after the string itself.
aggFunction(data, c("x", "y"), c("group1", "group2"))
# group1 group2 "x = mean(x), y = mean(y)"
# 1 A X x = mean(x), y = mean(y)
# 2 A Y x = mean(x), y = mean(y)
# 3 B X x = mean(x), y = mean(y)
# 4 B Z x = mean(x), y = mean(y)
最后我尝试把 mean() 直接硬编码进去,但这样就无法设置列名。如果我写成 get(testVar) = mean(get(testVar)),就会得到 “unexpected '='”(意外的 =)错误。
# Attempt with a single hard-coded column name looked up via get().
# NOTE(review): this snippet is an illustration of a failure and does not
# parse as written; it is kept verbatim.
aggFunction <- function(dataframe, toAverage, toGroup) {
testVar <- "x"
out <- ddply(dataframe, toGroup, here(summarise),
# An argument name must be a plain name, not a call such as get(testVar),
# so R stops at the `=` on the next line.
get(testVar) = mean(get(testVar))
## NOTE(review): the closing parenthesis of the ddply() call is missing in this excerpt.
return(out)
}
您可以考虑使用 dplyr 包——它通常比 plyr 快得多,而且语法也很简洁。
library(dplyr)
# Example inputs: two numeric measures and two grouping vectors, six values each.
x <- c(2, 4, 3, 1, 5, 7)
y <- c(3, 2, 6, 3, 4, 6)
group1 <- c("A", "A", "A", "A", "B", "B")
group2 <- c("X", "X", "Y", "Y", "Z", "X")
# Average the columns named in `toAverage` within the groups defined by the
# columns named in `toGroup`.
#
# dataframe: a data frame (or tibble).
# toAverage: character vector of column names to average.
# toGroup:   character vector of column names to group by.
#
# Returns one row per group, with the grouping columns followed by the means;
# each mean keeps the name of its source column.
#
# Uses across()/all_of() (dplyr >= 1.0.0) instead of the deprecated
# group_by_()/summarise_() verbs. Columns are selected by name rather than by
# pasting "mean(<col>)" expressions as text, so non-syntactic column names
# work and a misspelled name fails with a clear selection error.
aggFunction <- function(dataframe, toAverage, toGroup) {
  dataframe %>%
    group_by(across(all_of(toGroup))) %>%
    # "drop_last" keeps the grouping summarise_() left on the result.
    summarise(across(all_of(toAverage), mean), .groups = "drop_last")
}
# Assemble the example data frame (columns named after their source vectors)
# and run the dplyr version on it.
data <- data.frame(group1 = group1, group2 = group2, x = x, y = y)
aggFunction(data, c("x", "y"), c("group1", "group2"))
它给出:
group1 group2 x y
1 A X 3 2.5
2 A Y 2 4.5
3 B X 7 6.0
4 B Z 5 4.0
在基础 R 中,可以使用 aggregate:
# Average the columns named in `toAverage` within the groups defined by the
# columns named in `toGroup`, using base R's aggregate().
#
# dataframe: a data frame.
# toAverage: character vector of column names to average (one or more).
# toGroup:   character vector of column names to group by (one or more).
#
# Returns a data frame with the grouping columns followed by the means.
aggFunction <- function(dataframe, toAverage, toGroup) {
  # List-style `[` (no comma) always returns a data frame, even for a single
  # column. With `dataframe[, cols]` a single name collapses to a bare vector:
  # aggregate() then rejects `by` ("'by' must be a list"), and a single
  # averaged column loses its original name.
  aggregate(dataframe[toAverage], by = dataframe[toGroup], FUN = mean)
}
# Same example call; in the printed result that follows, the rows come out in
# a different order than in the ddply output.
aggFunction(data, c("x", "y"), c("group1", "group2"))
group1 group2 x y
1 A X 3 2.5
2 B X 7 6.0
3 A Y 2 4.5
4 B Z 5 4.0
如果先融化数据框,在长格式时进行计算,然后再转换回来,这会容易得多。
library(reshape2)
library(plyr)
# Average the columns named in `toAverage` within the groups defined by the
# columns named in `toGroup`, by reshaping to long format, summarising, and
# casting back to wide format.
#
# d1:        a data frame.
# toAverage: character vector of column names to average.
# toGroup:   character vector of column names to group by.
#
# Returns one row per group with one mean column per entry of `toAverage`.
#
# The grouping for the ddply() and dcast() steps is derived from `toGroup`
# instead of being hard-coded as group1 + group2, so the function works for
# any grouping columns.
aggFunction <- function(d1, toAverage, toGroup) {
  # Long format: one row per (group..., variable, value).
  d2 <- melt(d1, id.vars = toGroup, measure.vars = toAverage)
  # melt() names the column holding the measure names "variable".
  d3 <- ddply(d2, c(toGroup, "variable"), summarize, mean = mean(value))
  # Build "<g1> + <g2> + ... ~ variable" from the requested grouping columns.
  castFormula <- as.formula(paste(paste(toGroup, collapse = " + "), "~ variable"))
  dcast(d3, castFormula, value.var = "mean")
}
# Example call and its printed result.
aggFunction(data, c("x", "y"), c("group1", "group2"))
## group1 group2 x y
## 1 A X 3 2.5
## 2 A Y 2 4.5
## 3 B X 7 6.0
## 4 B Z 5 4.0
是否可以在不使用 eval + parse 的情况下从函数的参数中指定要使用 ddply 聚合哪些列?这是我目前所拥有的:
# Example data: two numeric measures (x, y) and two grouping columns, six rows.
x <- c(2, 4, 3, 1, 5, 7)
y <- c(3, 2, 6, 3, 4, 6)
group1 <- c("A", "A", "A", "A", "B", "B")
group2 <- c("X", "X", "Y", "Y", "Z", "X")
# Column names are spelled out; they match the vector names used above.
data <- data.frame(group1 = group1, group2 = group2, x = x, y = y)
这是我想要的输出:
# Reference version that produces the desired result for the example data.
# The averaged columns are hard-coded as `x` and `y`, so `toAverage` is
# accepted but never consulted; only `toGroup` drives the grouping.
aggFunction <- function(dataframe, toAverage, toGroup) {
  ddply(
    dataframe,
    toGroup,
    summarise,
    x = mean(x),
    y = mean(y)
  )
}
# Desired result: the mean of x and y within each (group1, group2) combination.
aggFunction(data, c("x", "y"), c("group1", "group2"))
# group1 group2 x y
# 1 A X 3 2.5
# 2 A Y 2 4.5
# 3 B X 7 6.0
# 4 B Z 5 4.0
这是我使用 eval(parse()) 的解决方案:
# Builds the ddply() call as text, with one `<col> = mean(<col>)` argument per
# entry of `toAverage`, then parses and evaluates that text inside this
# function so that `dataframe` and `toGroup` resolve to the arguments.
aggFunction <- function(dataframe, toAverage, toGroup) {
  meanTerms <- paste0(toAverage, " = mean(", toAverage, ")", collapse = ", ")
  callText <- paste("ddply(dataframe, toGroup, here(summarize),", meanTerms, ")")
  result <- eval(parse(text = callText))
  result
}
这给了我想要的输出。
我想知道是否有更好的方法来做到这一点。我知道可以使用 do.call() 和 get(),但我用它们所做的尝试都没有成功。
这是其中一次尝试:get(string) 不起作用,但 here(summarise) 可以让我拿到字符串的值。不幸的是,这意味着 ddply 把它们当作字符串处理:
# Attempt: hand the pasted "<col> = mean(<col>), ..." text to summarise as an
# ordinary extra argument. The text is passed through as a character value.
aggFunction <- function(dataframe, toAverage, toGroup) {
  meanTerms <- paste0(toAverage, " = mean(", toAverage, ")", collapse = ", ")
  ddply(dataframe, toGroup, here(summarise), meanTerms)
}
# With the string-passing attempt, the pasted text is not interpreted as code:
# it comes back verbatim in a single column (shown as `..2`) for every group.
aggFunction(data, c("x", "y"), c("group1", "group2"))
# group1 group2 ..2
# 1 A X x = mean(x), y = mean(y)
# 2 A Y x = mean(x), y = mean(y)
# 3 B X x = mean(x), y = mean(y)
# 4 B Z x = mean(x), y = mean(y)
也试过do.call,但还是被当作字符串处理:
# Attempt: assemble the ddply() arguments in a list and dispatch through
# do.call(). The pasted text is printed for inspection and then passed along
# as a plain character value.
aggFunction <- function(dataframe, toAverage, toGroup) {
  meanTerms <- paste0(toAverage, " = mean(", toAverage, ")", collapse = ", ")
  print(meanTerms)
  do.call(ddply, list(dataframe, toGroup, here(summarise), meanTerms))
}
# With the do.call() attempt the text is again returned as data; this time the
# result column is named after the string itself.
aggFunction(data, c("x", "y"), c("group1", "group2"))
# group1 group2 "x = mean(x), y = mean(y)"
# 1 A X x = mean(x), y = mean(y)
# 2 A Y x = mean(x), y = mean(y)
# 3 B X x = mean(x), y = mean(y)
# 4 B Z x = mean(x), y = mean(y)
最后我尝试把 mean() 直接硬编码进去,但这样就无法设置列名。如果我写成 get(testVar) = mean(get(testVar)),就会得到 “unexpected '='”(意外的 =)错误。
# Attempt with a single hard-coded column name looked up via get().
# NOTE(review): this snippet is an illustration of a failure and does not
# parse as written; it is kept verbatim.
aggFunction <- function(dataframe, toAverage, toGroup) {
testVar <- "x"
out <- ddply(dataframe, toGroup, here(summarise),
# An argument name must be a plain name, not a call such as get(testVar),
# so R stops at the `=` on the next line.
get(testVar) = mean(get(testVar))
## NOTE(review): the closing parenthesis of the ddply() call is missing in this excerpt.
return(out)
}
您可以考虑使用 dplyr 包——它通常比 plyr 快得多,而且语法也很简洁。
library(dplyr)
# Example inputs: two numeric measures and two grouping vectors, six values each.
x <- c(2, 4, 3, 1, 5, 7)
y <- c(3, 2, 6, 3, 4, 6)
group1 <- c("A", "A", "A", "A", "B", "B")
group2 <- c("X", "X", "Y", "Y", "Z", "X")
# Average the columns named in `toAverage` within the groups defined by the
# columns named in `toGroup`.
#
# dataframe: a data frame (or tibble).
# toAverage: character vector of column names to average.
# toGroup:   character vector of column names to group by.
#
# Returns one row per group, with the grouping columns followed by the means;
# each mean keeps the name of its source column.
#
# Uses across()/all_of() (dplyr >= 1.0.0) instead of the deprecated
# group_by_()/summarise_() verbs. Columns are selected by name rather than by
# pasting "mean(<col>)" expressions as text, so non-syntactic column names
# work and a misspelled name fails with a clear selection error.
aggFunction <- function(dataframe, toAverage, toGroup) {
  dataframe %>%
    group_by(across(all_of(toGroup))) %>%
    # "drop_last" keeps the grouping summarise_() left on the result.
    summarise(across(all_of(toAverage), mean), .groups = "drop_last")
}
# Assemble the example data frame (columns named after their source vectors)
# and run the dplyr version on it.
data <- data.frame(group1 = group1, group2 = group2, x = x, y = y)
aggFunction(data, c("x", "y"), c("group1", "group2"))
它给出:
group1 group2 x y
1 A X 3 2.5
2 A Y 2 4.5
3 B X 7 6.0
4 B Z 5 4.0
在基础 R 中,可以使用 aggregate:
# Average the columns named in `toAverage` within the groups defined by the
# columns named in `toGroup`, using base R's aggregate().
#
# dataframe: a data frame.
# toAverage: character vector of column names to average (one or more).
# toGroup:   character vector of column names to group by (one or more).
#
# Returns a data frame with the grouping columns followed by the means.
aggFunction <- function(dataframe, toAverage, toGroup) {
  # List-style `[` (no comma) always returns a data frame, even for a single
  # column. With `dataframe[, cols]` a single name collapses to a bare vector:
  # aggregate() then rejects `by` ("'by' must be a list"), and a single
  # averaged column loses its original name.
  aggregate(dataframe[toAverage], by = dataframe[toGroup], FUN = mean)
}
# Same example call; in the printed result that follows, the rows come out in
# a different order than in the ddply output.
aggFunction(data, c("x", "y"), c("group1", "group2"))
group1 group2 x y
1 A X 3 2.5
2 B X 7 6.0
3 A Y 2 4.5
4 B Z 5 4.0
如果先融化数据框,在长格式时进行计算,然后再转换回来,这会容易得多。
library(reshape2)
library(plyr)
# Average the columns named in `toAverage` within the groups defined by the
# columns named in `toGroup`, by reshaping to long format, summarising, and
# casting back to wide format.
#
# d1:        a data frame.
# toAverage: character vector of column names to average.
# toGroup:   character vector of column names to group by.
#
# Returns one row per group with one mean column per entry of `toAverage`.
#
# The grouping for the ddply() and dcast() steps is derived from `toGroup`
# instead of being hard-coded as group1 + group2, so the function works for
# any grouping columns.
aggFunction <- function(d1, toAverage, toGroup) {
  # Long format: one row per (group..., variable, value).
  d2 <- melt(d1, id.vars = toGroup, measure.vars = toAverage)
  # melt() names the column holding the measure names "variable".
  d3 <- ddply(d2, c(toGroup, "variable"), summarize, mean = mean(value))
  # Build "<g1> + <g2> + ... ~ variable" from the requested grouping columns.
  castFormula <- as.formula(paste(paste(toGroup, collapse = " + "), "~ variable"))
  dcast(d3, castFormula, value.var = "mean")
}
# Example call and its printed result.
aggFunction(data, c("x", "y"), c("group1", "group2"))
## group1 group2 x y
## 1 A X 3 2.5
## 2 A Y 2 4.5
## 3 B X 7 6.0
## 4 B Z 5 4.0