具有调查权重的组别比例

Proportions by group with survey weights

# Example survey data: 1000 respondents, a GROUP factor, four categorical
# answer columns (coded 1-5) and a probability weight per respondent.
#
# NOTE: sample() is called without a size, so each call draws only
# length(1:5) = 5 values; data.frame() then recycles those 5 values to the
# 1000 rows fixed by ID. Pass an explicit size (e.g. sample(1:5, 1000,
# replace = TRUE)) if independent draws per row are wanted.
data <- data.frame(
  ID = 1:1000,
  GROUP = factor(sample(1:5, replace = TRUE)),
  CAT = factor(sample(1:5, replace = TRUE)),
  DOG = factor(sample(1:5, replace = TRUE)),
  FOX = factor(sample(1:5, replace = TRUE)),
  MOUSE = factor(sample(1:5, replace = TRUE)),
  # Weights are whole numbers between 0 and 100.
  WEIGHT = round(runif(1000) * 100, 0)
)

# Template of the desired result: one row per variable and, for each of the
# five groups, an (unfilled) count column and weighted-proportion column.
want_cols <- paste0("GROUP", rep(1:5, each = 2), c("_N", "_PROP"))
data_WANT <- data.frame(VARS = c("CAT", "DOG", "FOX", "MOUSE", "WEIGHT"))
# Placeholders only; every GROUP*_N / GROUP*_PROP cell starts as NA.
data_WANT[want_cols] <- NA

我有一个名为 'data' 的数据框。我希望创建一个数据框或数据表,按 GROUP 显示每个变量的计数(_N),以及使用 'data' 中的 WEIGHT 变量计算出的每个组、每个变量的加权比例(_PROP)。WEIGHT 是提供给我的概率权重,用于获得有代表性的估计值。

也许,你正在尝试做:

library(dplyr)
library(tidyr)

# Convert the factor columns to integers, then, for each GROUP, compute the
# row count (N) and the WEIGHT-weighted mean (PROP) of every animal column.
# Note that PROP is the weighted mean of the integer category codes.
data %>%
  type.convert(as.is = TRUE) %>%
  group_by(GROUP) %>%
  summarise(
    across(
      CAT:MOUSE,
      list(
        N = ~ n(),
        PROP = ~ weighted.mean(.x, WEIGHT)
      )
    )
  ) %>%
  # Reshaping only: split "<animal>_<stat>" column names into two keys, then
  # spread GROUP x stat into columns named GROUP_<group>_<stat>.
  pivot_longer(
    cols = -GROUP,
    names_to = c("Animal", "prop"),
    names_sep = "_"
  ) %>%
  pivot_wider(
    names_from = c(GROUP, prop),
    values_from = value,
    names_prefix = "GROUP_"
  )

# A tibble: 4 x 11
#  Animal GROUP_1_N GROUP_1_PROP GROUP_2_N GROUP_2_PROP GROUP_3_N
#  <chr>      <dbl>        <dbl>     <dbl>        <dbl>     <dbl>
#1 CAT          200            5       200            1       200
#2 DOG          200            5       200            2       200
#3 FOX          200            1       200            3       200
#4 MOUSE        200            2       200            1       200
# … with 5 more variables: GROUP_3_PROP <dbl>, GROUP_4_N <dbl>,
#   GROUP_4_PROP <dbl>, GROUP_5_N <dbl>, GROUP_5_PROP <dbl>

pivot_longer 和 pivot_wider 这两个步骤只是为了得到与 data_WANT 格式相同的数据,它们不是执行计算所必需的。

我们也可以使用 data.table 的方法:

library(data.table)
# data.table version of the per-group summary.
# Reading inside-out:
#   1. type.convert() turns the factor columns into integers; setDT() is
#      applied to that converted result.
#   2. Per GROUP, compute the row count N and the WEIGHT-weighted mean of
#      each column in CAT:MOUSE (a weighted mean of the integer codes).
#   3. melt() stacks the animal columns into Animal / value pairs.
#   4. dcast() spreads the groups into columns, one pair per group:
#      value_GROUP_<g> and N_GROUP_<g>.
dcast(
  melt(
    setDT(type.convert(data, as.is = TRUE))[
      ,
      c(list(N = .N), lapply(.SD, weighted.mean, WEIGHT)),
      by = GROUP,
      .SDcols = CAT:MOUSE
    ],
    # Full argument name rather than relying on partial matching.
    id.vars = c("GROUP", "N"),
    variable.name = "Animal"
  ),
  Animal ~ paste0("GROUP_", GROUP),
  value.var = c("value", "N")
)