具有调查权重的组别比例
Proportions by group with survey weights
# Simulated survey data: 1000 respondents with a grouping factor (GROUP),
# four categorical answers coded 1-5, and a probability weight in [0, 100].
# The sample size is passed explicitly: sample(1:5, replace = TRUE) without a
# size draws only 5 values, which data.frame() would silently recycle 200
# times, making every column periodic instead of random.
data <- data.frame(
  ID = 1:1000,
  GROUP = factor(sample(1:5, 1000, replace = TRUE)),
  CAT = factor(sample(1:5, 1000, replace = TRUE)),
  DOG = factor(sample(1:5, 1000, replace = TRUE)),
  FOX = factor(sample(1:5, 1000, replace = TRUE)),
  MOUSE = factor(sample(1:5, 1000, replace = TRUE)),
  WEIGHT = round(runif(1000) * 100, 0)
)
# Empty template of the desired result: one row per variable and, for each of
# the five groups, a count column (_N) and a weighted-proportion column
# (_PROP), all initialised to NA.
data_WANT <- data.frame(
  VARS = c("CAT", "DOG", "FOX", "MOUSE", "WEIGHT"),
  setNames(
    rep(list(NA), 10),
    # Yields GROUP1_N, GROUP1_PROP, GROUP2_N, ..., GROUP5_PROP.
    paste0("GROUP", rep(1:5, each = 2), c("_N", "_PROP"))
  )
)
我有一个名为 'data' 的数据框,希望创建一个数据框或数据表,按 GROUP 显示每个变量的计数 (_N),以及利用 'data' 中的 WEIGHT 变量计算出的各组每个变量的加权比例 (_PROP)。WEIGHT 是提供给我的概率权重,用于获得具有代表性的估计值。
也许,你正在尝试做:
library(dplyr)
library(tidyr)
# Per GROUP, compute the row count (N) and the WEIGHT-weighted mean (PROP) of
# each column from CAT to MOUSE, then reshape so that every GROUP/statistic
# pair becomes its own column and every animal becomes a row.
data %>%
  # Convert the factor columns to plain numbers so they can be averaged.
  type.convert(as.is = TRUE) %>%
  group_by(GROUP) %>%
  summarise(
    across(
      CAT:MOUSE,
      list(
        N = ~ n(),
        PROP = ~ weighted.mean(.x, w = WEIGHT)
      )
    )
  ) %>%
  # Columns are now named <animal>_<statistic>; split those names on "_".
  pivot_longer(
    cols = !GROUP,
    names_to = c("Animal", "prop"),
    names_sep = "_"
  ) %>%
  # Spread out again: one GROUP_<group>_<statistic> column per combination.
  pivot_wider(
    names_from = c(GROUP, prop),
    values_from = value,
    names_prefix = "GROUP_"
  )
# A tibble: 4 x 11
# Animal GROUP_1_N GROUP_1_PROP GROUP_2_N GROUP_2_PROP GROUP_3_N
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 CAT 200 5 200 1 200
#2 DOG 200 5 200 2 200
#3 FOX 200 1 200 3 200
#4 MOUSE 200 2 200 1 200
# … with 5 more variables: GROUP_3_PROP <dbl>, GROUP_4_N <dbl>,
# GROUP_4_PROP <dbl>, GROUP_5_N <dbl>, GROUP_5_PROP <dbl>
`pivot_longer` 和 `pivot_wider` 这两个步骤只是为了得到与 `data_WANT` 格式相同的数据,并不是执行计算所必需的。
我们也可以使用 `data.table` 的方法:
library(data.table)
# data.table version: per-GROUP row count (N) and WEIGHT-weighted mean of each
# column from CAT to MOUSE, reshaped to one row per animal. Wrapped in local()
# so the intermediate tables do not leak into the workspace; the final dcast()
# result is still returned (and printed at top level).
local({
  # type.convert() returns a converted copy, so setDT() does not touch `data`.
  survey_dt <- setDT(type.convert(data, as.is = TRUE))

  # One row per GROUP: N plus the weighted mean of every selected column.
  group_stats <- survey_dt[,
    c(list(N = .N), lapply(.SD, weighted.mean, w = WEIGHT)),
    by = GROUP,
    .SDcols = CAT:MOUSE
  ]

  # Long format: one row per GROUP/animal pair. `id.vars` is spelled out in
  # full rather than relying on partial matching of `id.var`.
  long_stats <- melt(
    group_stats,
    id.vars = c("GROUP", "N"),
    variable.name = "Animal"
  )

  # Wide format: a `value_` and an `N_` column for every GROUP_<group>.
  dcast(
    long_stats,
    Animal ~ paste0("GROUP_", GROUP),
    value.var = c("value", "N")
  )
})
# Simulated survey data: 1000 respondents with a grouping factor (GROUP),
# four categorical answers coded 1-5, and a probability weight in [0, 100].
# The sample size is passed explicitly: sample(1:5, replace = TRUE) without a
# size draws only 5 values, which data.frame() would silently recycle 200
# times, making every column periodic instead of random.
data <- data.frame(
  ID = 1:1000,
  GROUP = factor(sample(1:5, 1000, replace = TRUE)),
  CAT = factor(sample(1:5, 1000, replace = TRUE)),
  DOG = factor(sample(1:5, 1000, replace = TRUE)),
  FOX = factor(sample(1:5, 1000, replace = TRUE)),
  MOUSE = factor(sample(1:5, 1000, replace = TRUE)),
  WEIGHT = round(runif(1000) * 100, 0)
)
# Empty template of the desired result: one row per variable and, for each of
# the five groups, a count column (_N) and a weighted-proportion column
# (_PROP), all initialised to NA.
data_WANT <- data.frame(
  VARS = c("CAT", "DOG", "FOX", "MOUSE", "WEIGHT"),
  setNames(
    rep(list(NA), 10),
    # Yields GROUP1_N, GROUP1_PROP, GROUP2_N, ..., GROUP5_PROP.
    paste0("GROUP", rep(1:5, each = 2), c("_N", "_PROP"))
  )
)
我有一个名为 'data' 的数据框,希望创建一个数据框或数据表,按 GROUP 显示每个变量的计数 (_N),以及利用 'data' 中的 WEIGHT 变量计算出的各组每个变量的加权比例 (_PROP)。WEIGHT 是提供给我的概率权重,用于获得具有代表性的估计值。
也许,你正在尝试做:
library(dplyr)
library(tidyr)
# Per GROUP, compute the row count (N) and the WEIGHT-weighted mean (PROP) of
# each column from CAT to MOUSE, then reshape so that every GROUP/statistic
# pair becomes its own column and every animal becomes a row.
data %>%
  # Convert the factor columns to plain numbers so they can be averaged.
  type.convert(as.is = TRUE) %>%
  group_by(GROUP) %>%
  summarise(
    across(
      CAT:MOUSE,
      list(
        N = ~ n(),
        PROP = ~ weighted.mean(.x, w = WEIGHT)
      )
    )
  ) %>%
  # Columns are now named <animal>_<statistic>; split those names on "_".
  pivot_longer(
    cols = !GROUP,
    names_to = c("Animal", "prop"),
    names_sep = "_"
  ) %>%
  # Spread out again: one GROUP_<group>_<statistic> column per combination.
  pivot_wider(
    names_from = c(GROUP, prop),
    values_from = value,
    names_prefix = "GROUP_"
  )
# A tibble: 4 x 11
# Animal GROUP_1_N GROUP_1_PROP GROUP_2_N GROUP_2_PROP GROUP_3_N
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 CAT 200 5 200 1 200
#2 DOG 200 5 200 2 200
#3 FOX 200 1 200 3 200
#4 MOUSE 200 2 200 1 200
# … with 5 more variables: GROUP_3_PROP <dbl>, GROUP_4_N <dbl>,
# GROUP_4_PROP <dbl>, GROUP_5_N <dbl>, GROUP_5_PROP <dbl>
`pivot_longer` 和 `pivot_wider` 这两个步骤只是为了得到与 `data_WANT` 格式相同的数据,并不是执行计算所必需的。
我们也可以使用 `data.table` 的方法:
library(data.table)
# data.table version: per-GROUP row count (N) and WEIGHT-weighted mean of each
# column from CAT to MOUSE, reshaped to one row per animal. Wrapped in local()
# so the intermediate tables do not leak into the workspace; the final dcast()
# result is still returned (and printed at top level).
local({
  # type.convert() returns a converted copy, so setDT() does not touch `data`.
  survey_dt <- setDT(type.convert(data, as.is = TRUE))

  # One row per GROUP: N plus the weighted mean of every selected column.
  group_stats <- survey_dt[,
    c(list(N = .N), lapply(.SD, weighted.mean, w = WEIGHT)),
    by = GROUP,
    .SDcols = CAT:MOUSE
  ]

  # Long format: one row per GROUP/animal pair. `id.vars` is spelled out in
  # full rather than relying on partial matching of `id.var`.
  long_stats <- melt(
    group_stats,
    id.vars = c("GROUP", "N"),
    variable.name = "Animal"
  )

  # Wide format: a `value_` and an `N_` column for every GROUP_<group>.
  dcast(
    long_stats,
    Animal ~ paste0("GROUP_", GROUP),
    value.var = c("value", "N")
  )
})