加权比例
Weighted proportions
我想尽可能使用 prop.table()
和 xtabs()
函数计算加权比例。
示例:
library(data.table)
set.seed(1)
n <- 20
X <- data.table(color = sample(c("Blue", "Green", "Red"), n, replace = TRUE),
letter = sample(c("a", "b"), n, replace = TRUE),
number = sample(1:3, n, replace = TRUE),
weight = runif(n))
X[, weight := weight/sum(weight)]
# > X
# color letter number weight
# 1: Blue a 2 0.03300399
# 2: Red a 3 0.08170773
# 3: Blue a 2 0.03374477
# 4: Green a 1 0.03248830
# 5: Blue a 3 0.04636611
# 6: Red b 2 0.08684298
# 7: Red a 1 0.08413131
# 8: Green a 1 0.03796001
# 9: Green b 3 0.07566126
# 10: Red b 2 0.09350268
# 11: Red b 2 0.04230800
# 12: Blue a 3 0.06935330
# 13: Blue b 3 0.03893384
# 14: Blue a 2 0.03166846
# 15: Green a 2 0.07369181
# 16: Green b 2 0.01972925
# 17: Green a 2 0.06921767
# 18: Green b 1 0.01184500
# 19: Red b 2 0.02389486
# 20: Blue b 2 0.01394867
P <- X[, prop.table(xtabs(~color + letter + number), c("color", "letter"))]
P <- data.table(P)
setnames(P, "N", "percentage")
# > P
# color letter number percentage
# 1: Blue a 1 0.0000000
# 2: Green a 1 0.5000000
# 3: Red a 1 0.5000000
# 4: Blue b 1 0.0000000
# 5: Green b 1 0.3333333
# 6: Red b 1 0.0000000
# 7: Blue a 2 0.6000000
# 8: Green a 2 0.5000000
# 9: Red a 2 0.0000000
# 10: Blue b 2 0.5000000
# 11: Green b 2 0.3333333
# 12: Red b 2 1.0000000
# 13: Blue a 3 0.4000000
# 14: Green a 3 0.0000000
# 15: Red a 3 0.5000000
# 16: Blue b 3 0.5000000
# 17: Green b 3 0.3333333
# 18: Red b 3 0.0000000
P[, sum(percentage), .(color, letter)]
# > P[, sum(percentage), .(color, letter)]
# color letter V1
# 1: Blue a 1
# 2: Green a 1
# 3: Red a 1
# 4: Blue b 1
# 5: Green b 1
# 6: Red b 1
如您所见,这些是未加权的比例。我想根据我的 weight
变量对它们进行加权。最终,百分比应该仍然加到 1。我知道用 prop.table()
和 xtabs()
这两个函数可能无法实现这个结果,所以如果有其他解决方案也可以。
使用data.table
,我们通过'color'、'letter'、'number'得到'weight'的sum
,然后通过'color'、'letter' 并通过将 'n' 除以 'n'
的 sum
来创建道具
library(data.table)
X[, .(n = sum(weight)), .(color, letter, number)][,
prop := n/sum(n), .(color, letter)][order(color)]
-输出
# color letter number n prop
# 1: Blue a 2 0.09841723 0.4596001
# 2: Blue a 3 0.11571941 0.5403999
# 3: Blue b 3 0.03893384 0.7362328
# 4: Blue b 2 0.01394867 0.2637672
# 5: Green a 1 0.07044831 0.3301886
# 6: Green b 3 0.07566126 0.7055616
# 7: Green a 2 0.14290947 0.6698114
# 8: Green b 2 0.01972925 0.1839805
# 9: Green b 1 0.01184500 0.1104578
#10: Red a 3 0.08170773 0.4926930
#11: Red b 2 0.24654852 1.0000000
#12: Red a 1 0.08413131 0.5073070
或使用dplyr
library(dplyr)
X %>%
count(color, letter, number, wt = weight) %>%
group_by(color, letter) %>%
mutate(prop = n/sum(n))
# A tibble: 12 x 5
# Groups: color, letter [6]
# color letter number n prop
# <chr> <chr> <int> <dbl> <dbl>
# 1 Blue a 2 0.0984 0.460
# 2 Blue a 3 0.116 0.540
# 3 Blue b 2 0.0139 0.264
# 4 Blue b 3 0.0389 0.736
# 5 Green a 1 0.0704 0.330
# 6 Green a 2 0.143 0.670
# 7 Green b 1 0.0118 0.110
# 8 Green b 2 0.0197 0.184
# 9 Green b 3 0.0757 0.706
#10 Red a 1 0.0841 0.507
#11 Red a 3 0.0817 0.493
#12 Red b 2 0.247 1
如果我们想要所有组合,请使用 complete
library(tidyr)
X %>%
count(color, letter, number, wt = weight) %>%
group_by(color, letter) %>%
mutate(prop = n/sum(n)) %>%
ungroup %>%
complete(color, letter, number, fill = list(n = 0, prop = 0))
# A tibble: 18 x 5
# color letter number n prop
# <chr> <chr> <int> <dbl> <dbl>
# 1 Blue a 1 0 0
# 2 Blue a 2 0.0984 0.460
# 3 Blue a 3 0.116 0.540
# 4 Blue b 1 0 0
# 5 Blue b 2 0.0139 0.264
# 6 Blue b 3 0.0389 0.736
# 7 Green a 1 0.0704 0.330
# 8 Green a 2 0.143 0.670
# 9 Green a 3 0 0
#10 Green b 1 0.0118 0.110
#11 Green b 2 0.0197 0.184
#12 Green b 3 0.0757 0.706
#13 Red a 1 0.0841 0.507
#14 Red a 2 0 0
#15 Red a 3 0.0817 0.493
#16 Red b 1 0 0
#17 Red b 2 0.247 1
#18 Red b 3 0 0
或 CJ
在 data.table
X[, .(n = sum(weight)), .(color, letter, number)][,
prop := n/sum(n), .(color, letter)][order(color)][
CJ(color, letter, number, unique = TRUE), on = .(color, letter, number)]
或使用 xtabs/prop.table
setnames(as.data.table(X[, prop.table(xtabs(weight~color +
letter + number), c("color", "letter"))])[N > 0][order(color)], "N", "prop")[]
# color letter number prop
# 1: Blue a 2 0.4596001
# 2: Blue b 2 0.2637672
# 3: Blue a 3 0.5403999
# 4: Blue b 3 0.7362328
# 5: Green a 1 0.3301886
# 6: Green b 1 0.1104578
# 7: Green a 2 0.6698114
# 8: Green b 2 0.1839805
# 9: Green b 3 0.7055616
#10: Red a 1 0.5073070
#11: Red b 2 1.0000000
#12: Red a 3 0.4926930
我想尽可能使用 prop.table()
和 xtabs()
函数计算加权比例。
示例:
library(data.table)
set.seed(1)
n <- 20
X <- data.table(color = sample(c("Blue", "Green", "Red"), n, replace = TRUE),
letter = sample(c("a", "b"), n, replace = TRUE),
number = sample(1:3, n, replace = TRUE),
weight = runif(n))
X[, weight := weight/sum(weight)]
# > X
# color letter number weight
# 1: Blue a 2 0.03300399
# 2: Red a 3 0.08170773
# 3: Blue a 2 0.03374477
# 4: Green a 1 0.03248830
# 5: Blue a 3 0.04636611
# 6: Red b 2 0.08684298
# 7: Red a 1 0.08413131
# 8: Green a 1 0.03796001
# 9: Green b 3 0.07566126
# 10: Red b 2 0.09350268
# 11: Red b 2 0.04230800
# 12: Blue a 3 0.06935330
# 13: Blue b 3 0.03893384
# 14: Blue a 2 0.03166846
# 15: Green a 2 0.07369181
# 16: Green b 2 0.01972925
# 17: Green a 2 0.06921767
# 18: Green b 1 0.01184500
# 19: Red b 2 0.02389486
# 20: Blue b 2 0.01394867
P <- X[, prop.table(xtabs(~color + letter + number), c("color", "letter"))]
P <- data.table(P)
setnames(P, "N", "percentage")
# > P
# color letter number percentage
# 1: Blue a 1 0.0000000
# 2: Green a 1 0.5000000
# 3: Red a 1 0.5000000
# 4: Blue b 1 0.0000000
# 5: Green b 1 0.3333333
# 6: Red b 1 0.0000000
# 7: Blue a 2 0.6000000
# 8: Green a 2 0.5000000
# 9: Red a 2 0.0000000
# 10: Blue b 2 0.5000000
# 11: Green b 2 0.3333333
# 12: Red b 2 1.0000000
# 13: Blue a 3 0.4000000
# 14: Green a 3 0.0000000
# 15: Red a 3 0.5000000
# 16: Blue b 3 0.5000000
# 17: Green b 3 0.3333333
# 18: Red b 3 0.0000000
P[, sum(percentage), .(color, letter)]
# > P[, sum(percentage), .(color, letter)]
# color letter V1
# 1: Blue a 1
# 2: Green a 1
# 3: Red a 1
# 4: Blue b 1
# 5: Green b 1
# 6: Red b 1
如您所见,这些是未加权的比例。我想根据我的 weight
变量对它们进行加权。最终,百分比应该仍然加到 1。我知道用 prop.table()
和 xtabs()
这两个函数可能无法实现这个结果,所以如果有其他解决方案也可以。
使用data.table
,我们通过'color'、'letter'、'number'得到'weight'的sum
,然后通过'color'、'letter' 并通过将 'n' 除以 'n'
sum
来创建道具
library(data.table)
X[, .(n = sum(weight)), .(color, letter, number)][,
prop := n/sum(n), .(color, letter)][order(color)]
-输出
# color letter number n prop
# 1: Blue a 2 0.09841723 0.4596001
# 2: Blue a 3 0.11571941 0.5403999
# 3: Blue b 3 0.03893384 0.7362328
# 4: Blue b 2 0.01394867 0.2637672
# 5: Green a 1 0.07044831 0.3301886
# 6: Green b 3 0.07566126 0.7055616
# 7: Green a 2 0.14290947 0.6698114
# 8: Green b 2 0.01972925 0.1839805
# 9: Green b 1 0.01184500 0.1104578
#10: Red a 3 0.08170773 0.4926930
#11: Red b 2 0.24654852 1.0000000
#12: Red a 1 0.08413131 0.5073070
或使用dplyr
library(dplyr)
X %>%
count(color, letter, number, wt = weight) %>%
group_by(color, letter) %>%
mutate(prop = n/sum(n))
# A tibble: 12 x 5
# Groups: color, letter [6]
# color letter number n prop
# <chr> <chr> <int> <dbl> <dbl>
# 1 Blue a 2 0.0984 0.460
# 2 Blue a 3 0.116 0.540
# 3 Blue b 2 0.0139 0.264
# 4 Blue b 3 0.0389 0.736
# 5 Green a 1 0.0704 0.330
# 6 Green a 2 0.143 0.670
# 7 Green b 1 0.0118 0.110
# 8 Green b 2 0.0197 0.184
# 9 Green b 3 0.0757 0.706
#10 Red a 1 0.0841 0.507
#11 Red a 3 0.0817 0.493
#12 Red b 2 0.247 1
如果我们想要所有组合,请使用 complete
library(tidyr)
X %>%
count(color, letter, number, wt = weight) %>%
group_by(color, letter) %>%
mutate(prop = n/sum(n)) %>%
ungroup %>%
complete(color, letter, number, fill = list(n = 0, prop = 0))
# A tibble: 18 x 5
# color letter number n prop
# <chr> <chr> <int> <dbl> <dbl>
# 1 Blue a 1 0 0
# 2 Blue a 2 0.0984 0.460
# 3 Blue a 3 0.116 0.540
# 4 Blue b 1 0 0
# 5 Blue b 2 0.0139 0.264
# 6 Blue b 3 0.0389 0.736
# 7 Green a 1 0.0704 0.330
# 8 Green a 2 0.143 0.670
# 9 Green a 3 0 0
#10 Green b 1 0.0118 0.110
#11 Green b 2 0.0197 0.184
#12 Green b 3 0.0757 0.706
#13 Red a 1 0.0841 0.507
#14 Red a 2 0 0
#15 Red a 3 0.0817 0.493
#16 Red b 1 0 0
#17 Red b 2 0.247 1
#18 Red b 3 0 0
或 CJ
在 data.table
X[, .(n = sum(weight)), .(color, letter, number)][,
prop := n/sum(n), .(color, letter)][order(color)][
CJ(color, letter, number, unique = TRUE), on = .(color, letter, number)]
或使用 xtabs/prop.table
setnames(as.data.table(X[, prop.table(xtabs(weight~color +
letter + number), c("color", "letter"))])[N > 0][order(color)], "N", "prop")[]
# color letter number prop
# 1: Blue a 2 0.4596001
# 2: Blue b 2 0.2637672
# 3: Blue a 3 0.5403999
# 4: Blue b 3 0.7362328
# 5: Green a 1 0.3301886
# 6: Green b 1 0.1104578
# 7: Green a 2 0.6698114
# 8: Green b 2 0.1839805
# 9: Green b 3 0.7055616
#10: Red a 1 0.5073070
#11: Red b 2 1.0000000
#12: Red a 3 0.4926930