按组 pmap 列以获得加权行平均
pmap columns by group to get weighted row-wise mean
我正在尝试按组获取某些列的加权行平均值。
library(tidyverse)
# Reproducible toy data: two groups (C and D) with ten rows each.
# Every runif() call draws only 10 values, so data.frame() recycles them over
# the 20 rows and groups C and D carry identical x / y / z values.
set.seed(1)
df <- data.frame(
  group = rep(c("C", "D"), each = 10),
  x = runif(10, min = 10, max = 15),
  y = runif(10, min = 100, max = 150),
  z = runif(10, min = 100, max = 150)
)
df
# group x y z
# 1 C 11.32754 110.2987 146.7353
# 2 C 11.86062 108.8278 110.6071
# 3 C 12.86427 134.3511 132.5837
# 4 C 14.54104 119.2052 106.2778
# 5 C 11.00841 138.4921 113.3610
# 6 C 14.49195 124.8850 119.3057
# 7 C 14.72338 135.8809 100.6695
# 8 C 13.30399 149.5953 119.1194
# 9 C 13.14557 119.0018 143.4845
# 10 C 10.30893 138.8723 117.0174
# 11 D 11.32754 110.2987 146.7353
# 12 D 11.86062 108.8278 110.6071
# 13 D 12.86427 134.3511 132.5837
# 14 D 14.54104 119.2052 106.2778
# 15 D 11.00841 138.4921 113.3610
# 16 D 14.49195 124.8850 119.3057
# 17 D 14.72338 135.8809 100.6695
# 18 D 13.30399 149.5953 119.1194
# 19 D 13.14557 119.0018 143.4845
# 20 D 10.30893 138.8723 117.0174
要获得 x、y、z 的简单(未加权)行平均值,我可以这样做:
# Unweighted row-wise mean: pmap_dbl() walks x, y and z in parallel and
# averages the three values found on each row.
df %>%
  mutate(
    rmean = pmap_dbl(
      list(x, y, z),
      function(...) mean(c(...))
    )
  )
但我想用下面这些权重对它们进行加权:
# Lookup table of weights: one row per (group, column) pair, where `cat`
# names the column of df (x, y or z) that the weight applies to.
dfweight <- data.frame(
  group = rep(c("C", "D"), each = 3),
  cat = rep(c("x", "y", "z"), times = 2),
  weights = c(0.2, 0.7, 0.1, 0.4, 0.1, 0.5)
)
# group cat weights
# 1 C x 0.2
# 2 C y 0.7
# 3 C z 0.1
# 4 D x 0.4
# 5 D y 0.1
# 6 D z 0.5
我想我应该先提取权重:
# Named list with one numeric vector of weights per group (elements C and D),
# kept in the row order of dfweight.
dfweight_split <- split(dfweight$weights, dfweight$group)
dfweight_split
# $C
# [1] 0.2 0.7 0.1
# $D
# [1] 0.4 0.1 0.5
但我不确定该如何用 pmap/map 来处理这些权重:
# Attempt 1 (from the question, not a working solution): per group, walk the
# rows of x, y, z with pmap_dbl() and try to weight them.
# NOTE(review): `weight.mean` is not defined anywhere in the visible source;
# the answers use `weighted.mean`, so this is presumably a typo.
# NOTE(review): `dfweight_split` (the whole list of per-group weights) is
# concatenated with the row values inside c() rather than being passed as a
# separate weights argument.
df %>%
group_by(group) %>%
mutate(wmean = pmap_dbl(list(x, y, z), ~weight.mean(c(..., dfweight_split))))
# OR
# Attempt 2 (from the question, not a working solution): pair the columns
# with the per-group weight vectors through map2().
# NOTE(review): list(x, y, z) has three elements while dfweight_split has two
# (C and D), so the two inputs do not line up element by element; the same
# undefined `weight.mean` is called here as well.
df %>%
group_by(group) %>%
mutate(wmean = map2(list(x, y, z), dfweight_split, ~weight.mean(.x, .y)))
也很乐意看到 base R 的解决方案。(原帖此处引用了一个类似的帖子,链接已丢失。)
谢谢
我认为如果将数据重塑为长格式,执行此计算会更容易。然后将数据与每个 group
和列名的 dfweight
连接起来,并找到每一行的加权平均值。
library(dplyr)
library(tidyr)
# Long-format approach: tag each original row, stack x / y / z into
# (cat, value) pairs, look up the weight of every (group, cat) pair, average
# within each original row, then spread the values back into their columns.
df %>%
  mutate(row = row_number()) %>%
  pivot_longer(cols = c(x, y, z), names_to = "cat") %>%
  left_join(dfweight, by = c("group", "cat")) %>%
  group_by(group, row) %>%
  mutate(weight_mean = weighted.mean(x = value, w = weights)) %>%
  ungroup() %>%
  # weights must go before widening, otherwise it would act as an id column.
  select(-weights) %>%
  pivot_wider(names_from = cat, values_from = value) %>%
  select(-row)
# group weight_mean x y z
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 C 94.1 11.3 110. 147.
# 2 C 89.6 11.9 109. 111.
# 3 C 110. 12.9 134. 133.
# 4 C 97.0 14.5 119. 106.
# 5 C 110. 11.0 138. 113.
# 6 C 102. 14.5 125. 119.
# 7 C 108. 14.7 136. 101.
# 8 C 119. 13.3 150. 119.
# 9 C 100. 13.1 119. 143.
#10 C 111. 10.3 139. 117.
#11 D 88.9 11.3 110. 147.
#12 D 70.9 11.9 109. 111.
#13 D 84.9 12.9 134. 133.
#14 D 70.9 14.5 119. 106.
#15 D 74.9 11.0 138. 113.
#16 D 77.9 14.5 125. 119.
#17 D 69.8 14.7 136. 101.
#18 D 79.8 13.3 150. 119.
#19 D 88.9 13.1 119. 143.
#20 D 76.5 10.3 139. 117.
我用 set.seed(1)
得到了不同的随机数。
如果我们想使用 pmap,请确保 'dfweight' 的数据列也在同一个数据集中。一种做法是用 pivot_wider 将其重塑为宽格式,然后进行连接(right_join),再用 pmap 逐行遍历:用索引前加 .. 的记法(如 ..1、..4)按相同顺序提取各列元素,把它们作为向量(vector)参数传给 weighted.mean,从而在 mutate 中创建新列。
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
# Wide-format approach: spread the weights to one row per group
# (x_weight, y_weight, z_weight), attach them to every row of df, and let
# pmap_dbl() compute the weighted mean one row at a time.
dfweight %>%
  pivot_wider(names_from = cat, values_from = weights) %>%
  rename_at(-1, ~ str_c(., "_weight")) %>%
  # Name the key explicitly: a natural join prints a "Joining, by = ..."
  # message and would silently pick up any other shared column name.
  right_join(df, by = "group") %>%
  mutate(wmean = pmap_dbl(
    # Fix the column order here so that ..1-..3 are always the weights and
    # ..4-..6 the values, whatever column order the join produced.
    select(., x_weight, y_weight, z_weight, x, y, z),
    ~ weighted.mean(c(..4, ..5, ..6), c(..1, ..2, ..3))
  )) %>%
  select(-ends_with("weight"))
# A tibble: 20 x 5
# group x y z wmean
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 C 11.3 110. 147. 94.1
# 2 C 11.9 109. 111. 89.6
# 3 C 12.9 134. 133. 110.
# 4 C 14.5 119. 106. 97.0
# 5 C 11.0 138. 113. 110.
# 6 C 14.5 125. 119. 102.
# 7 C 14.7 136. 101. 108.
# 8 C 13.3 150. 119. 119.
# 9 C 13.1 119. 143. 100.
#10 C 10.3 139. 117. 111.
#11 D 11.3 110. 147. 88.9
#12 D 11.9 109. 111. 70.9
#13 D 12.9 134. 133. 84.9
#14 D 14.5 119. 106. 70.9
#15 D 11.0 138. 113. 74.9
#16 D 14.5 125. 119. 77.9
#17 D 14.7 136. 101. 69.8
#18 D 13.3 150. 119. 79.8
#19 D 13.1 119. 143. 88.9
#20 D 10.3 139. 117. 76.5
我正在尝试按组获取某些列的加权行平均值。
library(tidyverse)
# Reproducible toy data: two groups (C and D) with ten rows each.
# Every runif() call draws only 10 values, so data.frame() recycles them over
# the 20 rows and groups C and D carry identical x / y / z values.
set.seed(1)
df <- data.frame(
  group = rep(c("C", "D"), each = 10),
  x = runif(10, min = 10, max = 15),
  y = runif(10, min = 100, max = 150),
  z = runif(10, min = 100, max = 150)
)
df
# group x y z
# 1 C 11.32754 110.2987 146.7353
# 2 C 11.86062 108.8278 110.6071
# 3 C 12.86427 134.3511 132.5837
# 4 C 14.54104 119.2052 106.2778
# 5 C 11.00841 138.4921 113.3610
# 6 C 14.49195 124.8850 119.3057
# 7 C 14.72338 135.8809 100.6695
# 8 C 13.30399 149.5953 119.1194
# 9 C 13.14557 119.0018 143.4845
# 10 C 10.30893 138.8723 117.0174
# 11 D 11.32754 110.2987 146.7353
# 12 D 11.86062 108.8278 110.6071
# 13 D 12.86427 134.3511 132.5837
# 14 D 14.54104 119.2052 106.2778
# 15 D 11.00841 138.4921 113.3610
# 16 D 14.49195 124.8850 119.3057
# 17 D 14.72338 135.8809 100.6695
# 18 D 13.30399 149.5953 119.1194
# 19 D 13.14557 119.0018 143.4845
# 20 D 10.30893 138.8723 117.0174
要获得 x、y、z 的简单(未加权)行平均值,我可以这样做:
# Unweighted row-wise mean: pmap_dbl() walks x, y and z in parallel and
# averages the three values found on each row.
df %>%
  mutate(
    rmean = pmap_dbl(
      list(x, y, z),
      function(...) mean(c(...))
    )
  )
但我想用下面这些权重对它们进行加权:
# Lookup table of weights: one row per (group, column) pair, where `cat`
# names the column of df (x, y or z) that the weight applies to.
dfweight <- data.frame(
  group = rep(c("C", "D"), each = 3),
  cat = rep(c("x", "y", "z"), times = 2),
  weights = c(0.2, 0.7, 0.1, 0.4, 0.1, 0.5)
)
# group cat weights
# 1 C x 0.2
# 2 C y 0.7
# 3 C z 0.1
# 4 D x 0.4
# 5 D y 0.1
# 6 D z 0.5
我想我应该先提取权重:
# Named list with one numeric vector of weights per group (elements C and D),
# kept in the row order of dfweight.
dfweight_split <- split(dfweight$weights, dfweight$group)
dfweight_split
# $C
# [1] 0.2 0.7 0.1
# $D
# [1] 0.4 0.1 0.5
但我不确定该如何用 pmap/map 来处理这些权重:
# Attempt 1 (from the question, not a working solution): per group, walk the
# rows of x, y, z with pmap_dbl() and try to weight them.
# NOTE(review): `weight.mean` is not defined anywhere in the visible source;
# the answers use `weighted.mean`, so this is presumably a typo.
# NOTE(review): `dfweight_split` (the whole list of per-group weights) is
# concatenated with the row values inside c() rather than being passed as a
# separate weights argument.
df %>%
group_by(group) %>%
mutate(wmean = pmap_dbl(list(x, y, z), ~weight.mean(c(..., dfweight_split))))
# OR
# Attempt 2 (from the question, not a working solution): pair the columns
# with the per-group weight vectors through map2().
# NOTE(review): list(x, y, z) has three elements while dfweight_split has two
# (C and D), so the two inputs do not line up element by element; the same
# undefined `weight.mean` is called here as well.
df %>%
group_by(group) %>%
mutate(wmean = map2(list(x, y, z), dfweight_split, ~weight.mean(.x, .y)))
也很乐意看到 base R 的解决方案。(原帖此处引用了一个类似的帖子,链接已丢失。)
谢谢
我认为如果将数据重塑为长格式,执行此计算会更容易。然后将数据与每个 group
和列名的 dfweight
连接起来,并找到每一行的加权平均值。
library(dplyr)
library(tidyr)
# Long-format approach: tag each original row, stack x / y / z into
# (cat, value) pairs, look up the weight of every (group, cat) pair, average
# within each original row, then spread the values back into their columns.
df %>%
  mutate(row = row_number()) %>%
  pivot_longer(cols = c(x, y, z), names_to = "cat") %>%
  left_join(dfweight, by = c("group", "cat")) %>%
  group_by(group, row) %>%
  mutate(weight_mean = weighted.mean(x = value, w = weights)) %>%
  ungroup() %>%
  # weights must go before widening, otherwise it would act as an id column.
  select(-weights) %>%
  pivot_wider(names_from = cat, values_from = value) %>%
  select(-row)
# group weight_mean x y z
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 C 94.1 11.3 110. 147.
# 2 C 89.6 11.9 109. 111.
# 3 C 110. 12.9 134. 133.
# 4 C 97.0 14.5 119. 106.
# 5 C 110. 11.0 138. 113.
# 6 C 102. 14.5 125. 119.
# 7 C 108. 14.7 136. 101.
# 8 C 119. 13.3 150. 119.
# 9 C 100. 13.1 119. 143.
#10 C 111. 10.3 139. 117.
#11 D 88.9 11.3 110. 147.
#12 D 70.9 11.9 109. 111.
#13 D 84.9 12.9 134. 133.
#14 D 70.9 14.5 119. 106.
#15 D 74.9 11.0 138. 113.
#16 D 77.9 14.5 125. 119.
#17 D 69.8 14.7 136. 101.
#18 D 79.8 13.3 150. 119.
#19 D 88.9 13.1 119. 143.
#20 D 76.5 10.3 139. 117.
我用 set.seed(1)
得到了不同的随机数。
如果我们想使用 pmap,请确保 'dfweight' 的数据列也在同一个数据集中。一种做法是用 pivot_wider 将其重塑为宽格式,然后进行连接(right_join),再用 pmap 逐行遍历:用索引前加 .. 的记法(如 ..1、..4)按相同顺序提取各列元素,把它们作为向量(vector)参数传给 weighted.mean,从而在 mutate 中创建新列。
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
# Wide-format approach: spread the weights to one row per group
# (x_weight, y_weight, z_weight), attach them to every row of df, and let
# pmap_dbl() compute the weighted mean one row at a time.
dfweight %>%
  pivot_wider(names_from = cat, values_from = weights) %>%
  rename_at(-1, ~ str_c(., "_weight")) %>%
  # Name the key explicitly: a natural join prints a "Joining, by = ..."
  # message and would silently pick up any other shared column name.
  right_join(df, by = "group") %>%
  mutate(wmean = pmap_dbl(
    # Fix the column order here so that ..1-..3 are always the weights and
    # ..4-..6 the values, whatever column order the join produced.
    select(., x_weight, y_weight, z_weight, x, y, z),
    ~ weighted.mean(c(..4, ..5, ..6), c(..1, ..2, ..3))
  )) %>%
  select(-ends_with("weight"))
# A tibble: 20 x 5
# group x y z wmean
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 C 11.3 110. 147. 94.1
# 2 C 11.9 109. 111. 89.6
# 3 C 12.9 134. 133. 110.
# 4 C 14.5 119. 106. 97.0
# 5 C 11.0 138. 113. 110.
# 6 C 14.5 125. 119. 102.
# 7 C 14.7 136. 101. 108.
# 8 C 13.3 150. 119. 119.
# 9 C 13.1 119. 143. 100.
#10 C 10.3 139. 117. 111.
#11 D 11.3 110. 147. 88.9
#12 D 11.9 109. 111. 70.9
#13 D 12.9 134. 133. 84.9
#14 D 14.5 119. 106. 70.9
#15 D 11.0 138. 113. 74.9
#16 D 14.5 125. 119. 77.9
#17 D 14.7 136. 101. 69.8
#18 D 13.3 150. 119. 79.8
#19 D 13.1 119. 143. 88.9
#20 D 10.3 139. 117. 76.5