为什么这些方法在 dplyr mutate 与 summarize in group_by 计算时不同?
Why are these means different when computed by dplyr mutate vs summarize in group_by?
我的数据框包含:
- 一列
deceased
我稍后计算聚合平均值(死亡率,按性别)
- 一个权重变量
n.group
- 分类
sex
(1:女性,2:男性)
我不明白为什么在下面的单个 mutate
/summarize
表达式中计算均值和加权均值 m.mortf, w.mortf
是错误的。
数据框:
red11 <- structure(list(hosptg = structure(c(3L, 3L, 1L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L,
3L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L,
3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1",
"2", "3"), class = "factor"), quarter.adm = structure(c(4L, 11L,
3L, 12L, 7L, 8L, 12L, 9L, 1L, 11L, 7L, 1L, 2L, 2L, 10L, 10L,
8L, 11L, 6L, 1L, 4L, 6L, 10L, 10L, 6L, 11L, 11L, 7L, 3L, 6L,
10L, 12L, 7L, 6L, 6L, 3L, 6L, 12L, 4L, 4L, 12L, 1L, 6L, 5L, 11L,
9L, 4L, 4L, 3L, 10L, 4L, 8L, 10L, 3L, 7L, 1L, 12L, 5L, 4L, 6L,
6L, 3L, 9L, 7L, 8L, 3L, 7L, 8L, 7L, 6L, 5L, 11L, 9L, 11L, 1L,
4L, 6L, 5L, 5L, 6L, 5L, 5L, 11L, 3L, 4L, 12L, 12L, 1L, 9L, 9L,
6L, 9L, 1L, 4L, 8L, 1L, 5L, 2L, 9L, 11L), .Label = c("2011Q1",
"2011Q2", "2011Q3", "2011Q4", "2012Q1", "2012Q2", "2012Q3", "2012Q4",
"2013Q1", "2013Q2", "2013Q3", "2013Q4"), class = "factor"), g.mdc = c("08",
"05", "09", "08", "14", "15", "15", "11", "09", "01", "08", "11",
"16", "14", "08", "06", "08", "06", "06", "08", "15", "14", "14",
"08", "11", "09", "08", "08", "06", "06", "06", "08", "03", "05",
"05", "15", "02", "05", "08", "04", "04", "10", "06", "01", "08",
"05", "03", "06", "01", "01", "06", "08", "08", "04", "12", "05",
"01", "15", "08", "01", "08", "01", "05", "15", "15", "01", "06",
"15", "01", "08", "01", "05", "08", "02", "15", "03", "06", "05",
"05", "03", "09", "08", "11", "12", "06", "04", "08", "01", "06",
"01", "08", "06", "15", "05", "08", "07", "08", "13", "08", "08"
), sex = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
age = c(23L, 83L, 51L, 54L, 37L, 0L, 0L, 82L, 45L, 88L, 84L,
58L, 41L, 33L, 71L, 79L, 67L, 42L, 73L, 66L, 0L, 26L, 38L,
65L, 31L, 87L, 38L, 38L, 77L, 44L, 54L, 74L, 38L, 70L, 44L,
0L, 78L, 65L, 56L, 85L, 70L, 83L, 89L, 46L, 39L, 34L, 5L,
85L, 18L, 5L, 41L, 73L, 18L, 41L, 75L, 77L, 36L, 0L, 84L,
83L, 58L, 93L, 83L, 0L, 0L, 2L, 49L, 0L, 55L, 46L, 40L, 81L,
60L, 51L, 0L, 22L, 78L, 69L, 75L, 65L, 31L, 15L, 79L, 87L,
72L, 78L, 48L, 16L, 81L, 63L, 84L, 17L, 0L, 60L, 60L, 74L,
44L, 44L, 53L, 71L), deceased = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
n.group = c(3L, 2L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L,
2L, 1L, 3L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 3L, 1L, 2L, 1L,
1L, 2L, 2L, 2L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 3L, 3L, 2L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 2L, 2L,
2L, 1L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 2L, 1L,
1L, 1L, 1L, 3L, 1L, 2L, 1L, 3L, 1L, 2L, 1L, 2L, 2L)), class = c("tbl_df",
"tbl", "data.frame"), .Names = c("hosptg", "quarter.adm", "g.mdc",
"sex", "age", "deceased", "n.group"), row.names = c(NA, -100L
))
分组 - 使用mutate
:
red111 <- red11 %>%
group_by(hosptg, quarter.adm, g.mdc) %>%
mutate(n=n(),
female = mean(sex == '1', na.rm=T),
age = mean(age, na.rm=T),
m.mortf = mean(deceased == '1', na.rm=T),
w.mortf = weighted.mean(deceased == '1', n.group, na.rm=T))
分组 - 使用summarize
(即聚合):
red211 <- red11 %>%
group_by(hosptg, quarter.adm, g.mdc) %>%
summarize(n=n(),
female = mean(sex == '1', na.rm=T),
age = mean(age, na.rm=T),
m.mortf = mean(deceased == '1', na.rm=T),
w.mortf = weighted.mean(deceased == '1', n.group, na.rm=T))
我原以为比率是相同的,最重要的是保持初始均值。 sum(redxx$n)
也说明了聚合的作用,但我很难理解完整的背景。
初始数据帧均值:
mean(red11$deceased == 1, na.rm=T) [1] 0.02
改变均值和总和:
sum(red211$n) [1] 170
> mean(red111$female) [1] 0.52
> mean(red111$w.mortf) [1] 0.02
> mean(red111$m.mortf) [1] 0.02
汇总平均值和总和:
sum(red211$n) [1] 100
mean(red211$female) [1] 0.4977169
mean(red211$w.mortf) [1] 0.02739726
mean(red211$m.mortf) [1] 0.02739726
我想要的是一个聚合数据框(即减少行数),始终保持初始平均值。而且,为什么权重变量没有补偿它?
编辑:
我的基本意图是我正在使用一个大数据文件,其中我有一个案例可能已经死亡的单个条目。然后我计算死亡率。但这在逻辑上只能在聚合级别上完成。这就是我创建像 red211 这样的数据框的原因。此后,我将我的回归模型建立在它的基础上。但它们再次意味着基于第二个数据框而不是原始值。因此,我的结果在大小上是扭曲的。这就是为什么我 "desperately" 正在寻找一种可以让我更接近原始平均值的解决方案。希望对您有所帮助。
我使用的模型是一个直接的差异:
lm(w.mortf ~ treatment * year, data = red)
其中:治疗是治疗组/年干预年/红色聚合数据框
===========================================================
w.mortf m.mortf
-----------------------------------------------------------
(Intercept) 0.037 (0.001) *** 0.037 (0.001) ***
year 0.003 (0.001) * 0.003 (0.001) *
tg1 -0.003 (0.001) * -0.003 (0.001) *
year:tg1 -0.001 (0.002) -0.001 (0.002)
-----------------------------------------------------------
Adj. R^2 0.000 0.000
Num. obs. 126031 126031
RMSE 0.172 0.179
===========================================================
原始数据框的平均值约为。 0.018 - 因此我认为离可解释性还很远 - 或者我在哪里被误导了?
下图说明了这个问题。其中2012Q1应该是根据上述回归可以找到的参考值。
您必须在聚合后应用权重:
red311 <- red11 %>%
group_by(hosptg, quarter.adm, g.mdc) %>%
summarize(n= n()
, female = mean(sex == '1', na.rm=T)
, age = mean(age, na.rm=T)
, m.mortf = mean(deceased == '1', na.rm=T))
weighted.mean(red311$female, red311$n)
#> [1] 0.52
weighted.mean(red311$m.mortf, red311$n)
#> [1] 0.02
编辑: 如果 red311
中的(未加权)平均值对应于 red11
中的平均值,则 red311
中的值将毫无意义。人们可以通过数学计算或从一个简单的例子中看出这一点:
suppressPackageStartupMessages(library(dplyr))
df <- data.frame(key = c('a', 'b', 'b', 'b'), value = 1:4, stringsAsFactors = FALSE)
df
#> key value
#> 1 a 1
#> 2 b 2
#> 3 b 3
#> 4 b 4
mean(df$value)
#> [1] 2.5
df1 <- df %>%
group_by(key) %>%
summarize(n = n(), value = mean(value)) %>%
ungroup() %>%
mutate(weighted = value * n * n() / sum(n))
df1
#> # A tibble: 2 x 4
#> key n value weighted
#> <chr> <int> <dbl> <dbl>
#> 1 a 1 1.00 0.500
#> 2 b 3 3.00 4.50
mean(df1$value)
#> [1] 2
mean(df1$weighted)
#> [1] 2.5
weighted.mean(df1$value, df1$n)
#> [1] 2.5
因此,虽然可以引入平均值等于原始平均值的 weighted
列,但从我的角度来看,其中的值毫无意义。
编辑2:上面使用的重新加权方案是通用的,也可以应用于原始数据:
red411 <- red11 %>%
group_by(hosptg, quarter.adm, g.mdc) %>%
summarize(n= n()
, female = mean(sex == '1', na.rm=T)
, age = mean(age, na.rm=T)
, m.mortf = mean(deceased == '1', na.rm=T)) %>%
ungroup() %>%
mutate(w.mortf = m.mortf * n * n() / sum(n))
mean(red411$w.mortf)
#> [1] 0.02
但是,我不确定如何解释 w.mortf
。
我的数据框包含:
- 一列
deceased
我稍后计算聚合平均值(死亡率,按性别) - 一个权重变量
n.group
- 分类
sex
(1:女性,2:男性)
我不明白为什么在下面的单个 mutate
/summarize
表达式中计算均值和加权均值 m.mortf, w.mortf
是错误的。
数据框:
red11 <- structure(list(hosptg = structure(c(3L, 3L, 1L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 3L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L,
3L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L,
3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1",
"2", "3"), class = "factor"), quarter.adm = structure(c(4L, 11L,
3L, 12L, 7L, 8L, 12L, 9L, 1L, 11L, 7L, 1L, 2L, 2L, 10L, 10L,
8L, 11L, 6L, 1L, 4L, 6L, 10L, 10L, 6L, 11L, 11L, 7L, 3L, 6L,
10L, 12L, 7L, 6L, 6L, 3L, 6L, 12L, 4L, 4L, 12L, 1L, 6L, 5L, 11L,
9L, 4L, 4L, 3L, 10L, 4L, 8L, 10L, 3L, 7L, 1L, 12L, 5L, 4L, 6L,
6L, 3L, 9L, 7L, 8L, 3L, 7L, 8L, 7L, 6L, 5L, 11L, 9L, 11L, 1L,
4L, 6L, 5L, 5L, 6L, 5L, 5L, 11L, 3L, 4L, 12L, 12L, 1L, 9L, 9L,
6L, 9L, 1L, 4L, 8L, 1L, 5L, 2L, 9L, 11L), .Label = c("2011Q1",
"2011Q2", "2011Q3", "2011Q4", "2012Q1", "2012Q2", "2012Q3", "2012Q4",
"2013Q1", "2013Q2", "2013Q3", "2013Q4"), class = "factor"), g.mdc = c("08",
"05", "09", "08", "14", "15", "15", "11", "09", "01", "08", "11",
"16", "14", "08", "06", "08", "06", "06", "08", "15", "14", "14",
"08", "11", "09", "08", "08", "06", "06", "06", "08", "03", "05",
"05", "15", "02", "05", "08", "04", "04", "10", "06", "01", "08",
"05", "03", "06", "01", "01", "06", "08", "08", "04", "12", "05",
"01", "15", "08", "01", "08", "01", "05", "15", "15", "01", "06",
"15", "01", "08", "01", "05", "08", "02", "15", "03", "06", "05",
"05", "03", "09", "08", "11", "12", "06", "04", "08", "01", "06",
"01", "08", "06", "15", "05", "08", "07", "08", "13", "08", "08"
), sex = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
age = c(23L, 83L, 51L, 54L, 37L, 0L, 0L, 82L, 45L, 88L, 84L,
58L, 41L, 33L, 71L, 79L, 67L, 42L, 73L, 66L, 0L, 26L, 38L,
65L, 31L, 87L, 38L, 38L, 77L, 44L, 54L, 74L, 38L, 70L, 44L,
0L, 78L, 65L, 56L, 85L, 70L, 83L, 89L, 46L, 39L, 34L, 5L,
85L, 18L, 5L, 41L, 73L, 18L, 41L, 75L, 77L, 36L, 0L, 84L,
83L, 58L, 93L, 83L, 0L, 0L, 2L, 49L, 0L, 55L, 46L, 40L, 81L,
60L, 51L, 0L, 22L, 78L, 69L, 75L, 65L, 31L, 15L, 79L, 87L,
72L, 78L, 48L, 16L, 81L, 63L, 84L, 17L, 0L, 60L, 60L, 74L,
44L, 44L, 53L, 71L), deceased = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
n.group = c(3L, 2L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L,
2L, 1L, 3L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 3L, 1L, 2L, 1L,
1L, 2L, 2L, 2L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 3L, 3L, 2L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 2L, 2L,
2L, 1L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 2L, 1L,
1L, 1L, 1L, 3L, 1L, 2L, 1L, 3L, 1L, 2L, 1L, 2L, 2L)), class = c("tbl_df",
"tbl", "data.frame"), .Names = c("hosptg", "quarter.adm", "g.mdc",
"sex", "age", "deceased", "n.group"), row.names = c(NA, -100L
))
分组 - 使用mutate
:
red111 <- red11 %>%
group_by(hosptg, quarter.adm, g.mdc) %>%
mutate(n=n(),
female = mean(sex == '1', na.rm=T),
age = mean(age, na.rm=T),
m.mortf = mean(deceased == '1', na.rm=T),
w.mortf = weighted.mean(deceased == '1', n.group, na.rm=T))
分组 - 使用summarize
(即聚合):
red211 <- red11 %>%
group_by(hosptg, quarter.adm, g.mdc) %>%
summarize(n=n(),
female = mean(sex == '1', na.rm=T),
age = mean(age, na.rm=T),
m.mortf = mean(deceased == '1', na.rm=T),
w.mortf = weighted.mean(deceased == '1', n.group, na.rm=T))
我原以为比率是相同的,最重要的是保持初始均值。 sum(redxx$n)
也说明了聚合的作用,但我很难理解完整的背景。
初始数据帧均值:
mean(red11$deceased == 1, na.rm=T) [1] 0.02
改变均值和总和:
sum(red211$n) [1] 170
> mean(red111$female) [1] 0.52
> mean(red111$w.mortf) [1] 0.02
> mean(red111$m.mortf) [1] 0.02
汇总平均值和总和:
sum(red211$n) [1] 100
mean(red211$female) [1] 0.4977169
mean(red211$w.mortf) [1] 0.02739726
mean(red211$m.mortf) [1] 0.02739726
我想要的是一个聚合数据框(即减少行数),始终保持初始平均值。而且,为什么权重变量没有补偿它?
编辑: 我的基本意图是我正在使用一个大数据文件,其中我有一个案例可能已经死亡的单个条目。然后我计算死亡率。但这在逻辑上只能在聚合级别上完成。这就是我创建像 red211 这样的数据框的原因。此后,我将我的回归模型建立在它的基础上。但它们再次意味着基于第二个数据框而不是原始值。因此,我的结果在大小上是扭曲的。这就是为什么我 "desperately" 正在寻找一种可以让我更接近原始平均值的解决方案。希望对您有所帮助。
我使用的模型是一个直接的差异:
lm(w.mortf ~ treatment * year, data = red)
其中:治疗是治疗组/年干预年/红色聚合数据框
===========================================================
w.mortf m.mortf
-----------------------------------------------------------
(Intercept) 0.037 (0.001) *** 0.037 (0.001) ***
year 0.003 (0.001) * 0.003 (0.001) *
tg1 -0.003 (0.001) * -0.003 (0.001) *
year:tg1 -0.001 (0.002) -0.001 (0.002)
-----------------------------------------------------------
Adj. R^2 0.000 0.000
Num. obs. 126031 126031
RMSE 0.172 0.179
===========================================================
原始数据框的平均值约为。 0.018 - 因此我认为离可解释性还很远 - 或者我在哪里被误导了?
下图说明了这个问题。其中2012Q1应该是根据上述回归可以找到的参考值。
您必须在聚合后应用权重:
red311 <- red11 %>%
group_by(hosptg, quarter.adm, g.mdc) %>%
summarize(n= n()
, female = mean(sex == '1', na.rm=T)
, age = mean(age, na.rm=T)
, m.mortf = mean(deceased == '1', na.rm=T))
weighted.mean(red311$female, red311$n)
#> [1] 0.52
weighted.mean(red311$m.mortf, red311$n)
#> [1] 0.02
编辑: 如果 red311
中的(未加权)平均值对应于 red11
中的平均值,则 red311
中的值将毫无意义。人们可以通过数学计算或从一个简单的例子中看出这一点:
suppressPackageStartupMessages(library(dplyr))
df <- data.frame(key = c('a', 'b', 'b', 'b'), value = 1:4, stringsAsFactors = FALSE)
df
#> key value
#> 1 a 1
#> 2 b 2
#> 3 b 3
#> 4 b 4
mean(df$value)
#> [1] 2.5
df1 <- df %>%
group_by(key) %>%
summarize(n = n(), value = mean(value)) %>%
ungroup() %>%
mutate(weighted = value * n * n() / sum(n))
df1
#> # A tibble: 2 x 4
#> key n value weighted
#> <chr> <int> <dbl> <dbl>
#> 1 a 1 1.00 0.500
#> 2 b 3 3.00 4.50
mean(df1$value)
#> [1] 2
mean(df1$weighted)
#> [1] 2.5
weighted.mean(df1$value, df1$n)
#> [1] 2.5
因此,虽然可以引入平均值等于原始平均值的 weighted
列,但从我的角度来看,其中的值毫无意义。
编辑2:上面使用的重新加权方案是通用的,也可以应用于原始数据:
red411 <- red11 %>%
group_by(hosptg, quarter.adm, g.mdc) %>%
summarize(n= n()
, female = mean(sex == '1', na.rm=T)
, age = mean(age, na.rm=T)
, m.mortf = mean(deceased == '1', na.rm=T)) %>%
ungroup() %>%
mutate(w.mortf = m.mortf * n * n() / sum(n))
mean(red411$w.mortf)
#> [1] 0.02
但是,我不确定如何解释 w.mortf
。