按组 pmap 列以获得加权行平均

pmap columns by group to get weighted row-wise mean

我正在尝试按组获取某些列的加权行平均值。

library(tidyverse)
set.seed(1)
df <- data.frame(group = rep(LETTERS[3:4], each = 10),
           x = runif(n = 10, min = 10, max = 15),
           y = runif(n = 10, min = 100, max = 150),
           z = runif(n = 10, min = 100, max = 150))
df
#    group        x        y        z
# 1      C 11.32754 110.2987 146.7353
# 2      C 11.86062 108.8278 110.6071
# 3      C 12.86427 134.3511 132.5837
# 4      C 14.54104 119.2052 106.2778
# 5      C 11.00841 138.4921 113.3610
# 6      C 14.49195 124.8850 119.3057
# 7      C 14.72338 135.8809 100.6695
# 8      C 13.30399 149.5953 119.1194
# 9      C 13.14557 119.0018 143.4845
# 10     C 10.30893 138.8723 117.0174
# 11     D 11.32754 110.2987 146.7353
# 12     D 11.86062 108.8278 110.6071
# 13     D 12.86427 134.3511 132.5837
# 14     D 14.54104 119.2052 106.2778
# 15     D 11.00841 138.4921 113.3610
# 16     D 14.49195 124.8850 119.3057
# 17     D 14.72338 135.8809 100.6695
# 18     D 13.30399 149.5953 119.1194
# 19     D 13.14557 119.0018 143.4845
# 20     D 10.30893 138.8723 117.0174

要获得 xyz 的粗行平均值,我可以这样做:

df %>% 
  mutate(rmean = pmap_dbl(list(x, y, z), ~mean(c(...))))

但我想用这些权重来衡量它们

dfweight <- data.frame(group = c("C", "C", "C",
                                 "D", "D", "D"),
                       cat = c("x", "y", "z", 
                               "x", "y", "z"),
                       weights = c(.2, .7, .1, 
                                   .4, .1, .5))
#   group cat weights
# 1     C   x     0.2
# 2     C   y     0.7
# 3     C   z     0.1
# 4     D   x     0.4
# 5     D   y     0.1
# 6     D   z     0.5

我想我应该先提取权重:

dfweight_split <- lapply(split(dfweight, dfweight$group), function (x) x$weights)
dfweight_split
# $C
# [1] 0.2 0.7 0.1

# $D
# [1] 0.4 0.1 0.5

但我不确定如何 pmap/map 处理这些?

df %>% 
  group_by(group) %>% 
  mutate(wmean = pmap_dbl(list(x, y, z), ~weight.mean(c(..., dfweight_split))))

#OR
df %>% 
  group_by(group) %>% 
  mutate(wmean = map2(list(x, y, z), dfweight_split, ~weight.mean(.x, .y)))

也很高兴看到 base 解决方案。类似的post是.

谢谢

我认为如果将数据重塑为长格式,执行此计算会更容易。然后将数据与每个 group 和列名的 dfweight 连接起来,并找到每一行的加权平均值。

library(dplyr)
library(tidyr)

df %>%
  mutate(row = row_number()) %>%
  pivot_longer(cols = x:z, names_to = 'cat') %>%
  left_join(dfweight, by = c('group', 'cat')) %>%
  group_by(group, row) %>%
  mutate(weight_mean = weighted.mean(value, weights)) %>%
  ungroup %>%
  select(-weights) %>%
  pivot_wider(names_from = cat, values_from = value) %>%
  select(-row)

#  group weight_mean     x     y     z
#   <chr>       <dbl> <dbl> <dbl> <dbl>
# 1 C            94.1  11.3  110.  147.
# 2 C            89.6  11.9  109.  111.
# 3 C           110.   12.9  134.  133.
# 4 C            97.0  14.5  119.  106.
# 5 C           110.   11.0  138.  113.
# 6 C           102.   14.5  125.  119.
# 7 C           108.   14.7  136.  101.
# 8 C           119.   13.3  150.  119.
# 9 C           100.   13.1  119.  143.
#10 C           111.   10.3  139.  117.
#11 D            88.9  11.3  110.  147.
#12 D            70.9  11.9  109.  111.
#13 D            84.9  12.9  134.  133.
#14 D            70.9  14.5  119.  106.
#15 D            74.9  11.0  138.  113.
#16 D            77.9  14.5  125.  119.
#17 D            69.8  14.7  136.  101.
#18 D            79.8  13.3  150.  119.
#19 D            88.9  13.1  119.  143.
#20 D            76.5  10.3  139.  117.

我用 set.seed(1) 得到了不同的随机数。

如果我们想使用pmap,请确保'dfweight'数据列也在同一个数据集中。一个选项是使用 pivot_wider 重塑为宽,然后进行连接 (right_join) 并使用 pmap 遍历行,使用符号 [] 按相同顺序提取列元素=15=] 在索引之前,将它们作为 vector 参数传递给 weighted.mean 以在 mutate

中创建列
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
dfweight %>% 
   pivot_wider(names_from = cat, values_from = weights) %>% 
   rename_at(-1, ~ str_c(., '_weight')) %>%
   right_join(df) %>% 
   mutate(wmean = pmap_dbl(select(., -group), 
     ~ weighted.mean(c(..4, ..5, ..6), c(..1, ..2, ..3)))) %>% 
   select(-ends_with('weight'))
# A tibble: 20 x 5
#   group     x     y     z wmean
#   <chr> <dbl> <dbl> <dbl> <dbl>
# 1 C      11.3  110.  147.  94.1
# 2 C      11.9  109.  111.  89.6
# 3 C      12.9  134.  133. 110. 
# 4 C      14.5  119.  106.  97.0
# 5 C      11.0  138.  113. 110. 
# 6 C      14.5  125.  119. 102. 
# 7 C      14.7  136.  101. 108. 
# 8 C      13.3  150.  119. 119. 
# 9 C      13.1  119.  143. 100. 
#10 C      10.3  139.  117. 111. 
#11 D      11.3  110.  147.  88.9
#12 D      11.9  109.  111.  70.9
#13 D      12.9  134.  133.  84.9
#14 D      14.5  119.  106.  70.9
#15 D      11.0  138.  113.  74.9
#16 D      14.5  125.  119.  77.9
#17 D      14.7  136.  101.  69.8
#18 D      13.3  150.  119.  79.8
#19 D      13.1  119.  143.  88.9
#20 D      10.3  139.  117.  76.5