表示每个组内有超过 1 列的组索引
Means within each group with more than 1 column of group indices
我有一个变量,我想获得每个组内的均值,其中为列中的每个观察列出了组,我有很多这样的列。然后我想将组均值与适当的观察相关联,这样如果我从 m obs x n 不同分组的矩阵开始,我将获得 m x n 均值矩阵。例如:
> var <- round(runif(10),digits=2)
> var
[1] 0.47 0.21 0.80 0.65 0.32 0.72 0.29 0.93 0.77 0.64
> groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3,5), 10, replace=TRUE))
> groupings
[,1] [,2] [,3]
[1,] 3 1 5
[2,] 1 1 5
[3,] 2 1 5
[4,] 3 2 3
[5,] 2 3 1
[6,] 1 1 1
[7,] 2 3 1
[8,] 1 2 1
[9,] 3 1 5
[10,] 1 3 2
我可以通过以下方式分别获取每个组内的均值(例如)
> means.1 <- sapply(split(var, groupings[,1]), function(x) mean(x))
> means.2 <- sapply(split(var, groupings[,2]), function(x) mean(x))
> means.3 <- sapply(split(var, groupings[,3]), function(x) mean(x))
> means.1
1 2 3
0.625 0.470 0.630
> means.2
1 2 3
0.5940000 0.7900000 0.4166667
> means.3
1 2 3 5
0.5650 0.6400 0.6500 0.5625
但这些单独的调用不仅效率低下,而且仍然无法得到我想要的,如下
[,1] [,2] [,3]
[1,] 0.630 0.5940000 0.5625
[2,] 0.625 0.5940000 0.5625
[3,] 0.470 0.5940000 0.5625
[4,] 0.630 0.7900000 0.6500
[5,] 0.470 0.4166667 0.5650
[6,] 0.625 0.5940000 0.5650
[7,] 0.470 0.4166667 0.5650
[8,] 0.625 0.7900000 0.5650
[9,] 0.630 0.5940000 0.5625
[10,] 0.625 0.4166667 0.6400
这是使用 ave
以及 split
和 sapply
的一种方法
sapply(split(groupings, rep(seq_len(ncol(groupings)), each=nrow(groupings))),
function(x) ave(var, x, FUN=mean))
1 2 3
[1,] 0.4566667 0.5550000 0.3925000
[2,] 0.6200000 0.5550000 0.3925000
[3,] 0.4816667 0.5550000 0.3925000
[4,] 0.4566667 0.5550000 0.6200000
[5,] 0.4816667 0.5550000 0.4350000
[6,] 0.4566667 0.5133333 0.6066667
[7,] 0.4816667 0.0100000 0.4350000
[8,] 0.4816667 0.5133333 0.3925000
[9,] 0.4816667 0.5133333 0.6066667
[10,] 0.4816667 0.5550000 0.6066667
sapply
、split(groupings, rep(seq_len(ncol(groupings)), each=nrow(groupings)))
的条目生成一个列表,其中每个列表项都是矩阵分组的一列。此列表的每个元素都被馈送到 sapply
,它应用 ave
使用向量对结果进行分组。
数据
set.seed(1234)
var <- round(runif(10),digits=2)
groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3,5), 10, replace=TRUE))
library(dplyr)
set.seed(1000)
var <- round(runif(10),digits=2)
groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3,5), 10, replace=TRUE), var)
df = data.frame(groupings)
df %>%
group_by(V1)%>% mutate(x1 =mean(var))%>% ungroup(V1) %>%
group_by(V2) %>% mutate(x2=mean(var)) %>% ungroup(V2) %>%
group_by(V3) %>% mutate(x3=mean(var)) %>% ungroup(V3)
# V1 V2 V3 var x1 x2 x3
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2 1 3 0.33 0.4775000 0.322 0.250
#2 3 3 1 0.76 0.6566667 0.470 0.750
#3 1 1 3 0.11 0.1333333 0.322 0.250
#4 3 1 5 0.69 0.6566667 0.322 0.635
#5 3 2 3 0.52 0.6566667 0.630 0.250
#6 1 3 3 0.07 0.1333333 0.470 0.250
#7 2 2 1 0.74 0.4775000 0.630 0.750
#8 2 3 5 0.58 0.4775000 0.470 0.635
#9 1 1 3 0.22 0.1333333 0.322 0.250
#10 2 1 2 0.26 0.4775000 0.322 0.260
# you can simply subset the columns
另一种选择,你可以使用apply
(因为你已经有一个矩阵)循环遍历列(Margin设置为2)并将列传递给ave
函数作为组变量,您可以显式指定 FUN 参数为均值或不指定它,因为 mean 是默认函数使用:
apply(groupings, 2, ave, x = var) # pass the var as a named parameter since it is the
# parameter at the first position of ave function, if not
# ave will treat the column as the first position parameter
# which you don't want to
# [,1] [,2] [,3]
#[1,] 0.630 0.5940000 0.5625
#[2,] 0.625 0.5940000 0.5625
#[3,] 0.470 0.5940000 0.5625
#[4,] 0.630 0.7900000 0.6500
#[5,] 0.470 0.4166667 0.5650
#[6,] 0.625 0.5940000 0.5650
#[7,] 0.470 0.4166667 0.5650
#[8,] 0.625 0.7900000 0.5650
#[9,] 0.630 0.5940000 0.5625
#[10,] 0.625 0.4166667 0.6400
或者用dplyr
,可以使用mutate_all()
函数:
library(dplyr)
mutate_all(as.data.frame(groupings), funs(ave(var, .)))
# V1 V2 V3
#1 0.630 0.5940000 0.5625
#2 0.625 0.5940000 0.5625
#3 0.470 0.5940000 0.5625
#4 0.630 0.7900000 0.6500
#5 0.470 0.4166667 0.5650
#6 0.625 0.5940000 0.5650
#7 0.470 0.4166667 0.5650
#8 0.625 0.7900000 0.5650
#9 0.630 0.5940000 0.5625
#10 0.625 0.4166667 0.6400
我有一个变量,我想获得每个组内的均值,其中为列中的每个观察列出了组,我有很多这样的列。然后我想将组均值与适当的观察相关联,这样如果我从 m obs x n 不同分组的矩阵开始,我将获得 m x n 均值矩阵。例如:
> var <- round(runif(10),digits=2)
> var
[1] 0.47 0.21 0.80 0.65 0.32 0.72 0.29 0.93 0.77 0.64
> groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3,5), 10, replace=TRUE))
> groupings
[,1] [,2] [,3]
[1,] 3 1 5
[2,] 1 1 5
[3,] 2 1 5
[4,] 3 2 3
[5,] 2 3 1
[6,] 1 1 1
[7,] 2 3 1
[8,] 1 2 1
[9,] 3 1 5
[10,] 1 3 2
我可以通过以下方式分别获取每个组内的均值(例如)
> means.1 <- sapply(split(var, groupings[,1]), function(x) mean(x))
> means.2 <- sapply(split(var, groupings[,2]), function(x) mean(x))
> means.3 <- sapply(split(var, groupings[,3]), function(x) mean(x))
> means.1
1 2 3
0.625 0.470 0.630
> means.2
1 2 3
0.5940000 0.7900000 0.4166667
> means.3
1 2 3 5
0.5650 0.6400 0.6500 0.5625
但这些单独的调用不仅效率低下,而且仍然无法得到我想要的,如下
[,1] [,2] [,3]
[1,] 0.630 0.5940000 0.5625
[2,] 0.625 0.5940000 0.5625
[3,] 0.470 0.5940000 0.5625
[4,] 0.630 0.7900000 0.6500
[5,] 0.470 0.4166667 0.5650
[6,] 0.625 0.5940000 0.5650
[7,] 0.470 0.4166667 0.5650
[8,] 0.625 0.7900000 0.5650
[9,] 0.630 0.5940000 0.5625
[10,] 0.625 0.4166667 0.6400
这是使用 ave
以及 split
和 sapply
sapply(split(groupings, rep(seq_len(ncol(groupings)), each=nrow(groupings))),
function(x) ave(var, x, FUN=mean))
1 2 3
[1,] 0.4566667 0.5550000 0.3925000
[2,] 0.6200000 0.5550000 0.3925000
[3,] 0.4816667 0.5550000 0.3925000
[4,] 0.4566667 0.5550000 0.6200000
[5,] 0.4816667 0.5550000 0.4350000
[6,] 0.4566667 0.5133333 0.6066667
[7,] 0.4816667 0.0100000 0.4350000
[8,] 0.4816667 0.5133333 0.3925000
[9,] 0.4816667 0.5133333 0.6066667
[10,] 0.4816667 0.5550000 0.6066667
sapply
、split(groupings, rep(seq_len(ncol(groupings)), each=nrow(groupings)))
的条目生成一个列表,其中每个列表项都是矩阵分组的一列。此列表的每个元素都被馈送到 sapply
,它应用 ave
使用向量对结果进行分组。
数据
set.seed(1234)
var <- round(runif(10),digits=2)
groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3,5), 10, replace=TRUE))
library(dplyr)
set.seed(1000)
var <- round(runif(10),digits=2)
groupings <- cbind(sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3), 10, replace=TRUE),
sample(c(1,2,3,5), 10, replace=TRUE), var)
df = data.frame(groupings)
df %>%
group_by(V1)%>% mutate(x1 =mean(var))%>% ungroup(V1) %>%
group_by(V2) %>% mutate(x2=mean(var)) %>% ungroup(V2) %>%
group_by(V3) %>% mutate(x3=mean(var)) %>% ungroup(V3)
# V1 V2 V3 var x1 x2 x3
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2 1 3 0.33 0.4775000 0.322 0.250
#2 3 3 1 0.76 0.6566667 0.470 0.750
#3 1 1 3 0.11 0.1333333 0.322 0.250
#4 3 1 5 0.69 0.6566667 0.322 0.635
#5 3 2 3 0.52 0.6566667 0.630 0.250
#6 1 3 3 0.07 0.1333333 0.470 0.250
#7 2 2 1 0.74 0.4775000 0.630 0.750
#8 2 3 5 0.58 0.4775000 0.470 0.635
#9 1 1 3 0.22 0.1333333 0.322 0.250
#10 2 1 2 0.26 0.4775000 0.322 0.260
# you can simply subset the columns
另一种选择,你可以使用apply
(因为你已经有一个矩阵)循环遍历列(Margin设置为2)并将列传递给ave
函数作为组变量,您可以显式指定 FUN 参数为均值或不指定它,因为 mean 是默认函数使用:
apply(groupings, 2, ave, x = var) # pass the var as a named parameter since it is the
# parameter at the first position of ave function, if not
# ave will treat the column as the first position parameter
# which you don't want to
# [,1] [,2] [,3]
#[1,] 0.630 0.5940000 0.5625
#[2,] 0.625 0.5940000 0.5625
#[3,] 0.470 0.5940000 0.5625
#[4,] 0.630 0.7900000 0.6500
#[5,] 0.470 0.4166667 0.5650
#[6,] 0.625 0.5940000 0.5650
#[7,] 0.470 0.4166667 0.5650
#[8,] 0.625 0.7900000 0.5650
#[9,] 0.630 0.5940000 0.5625
#[10,] 0.625 0.4166667 0.6400
或者用dplyr
,可以使用mutate_all()
函数:
library(dplyr)
mutate_all(as.data.frame(groupings), funs(ave(var, .)))
# V1 V2 V3
#1 0.630 0.5940000 0.5625
#2 0.625 0.5940000 0.5625
#3 0.470 0.5940000 0.5625
#4 0.630 0.7900000 0.6500
#5 0.470 0.4166667 0.5650
#6 0.625 0.5940000 0.5650
#7 0.470 0.4166667 0.5650
#8 0.625 0.7900000 0.5650
#9 0.630 0.5940000 0.5625
#10 0.625 0.4166667 0.6400