加权比例

Weighted proportions

我想尽可能使用 prop.table()xtabs() 函数计算加权比例。

示例:

library(data.table)
set.seed(1)
n <- 20
X <- data.table(color = sample(c("Blue", "Green", "Red"), n, replace = TRUE),
                letter = sample(c("a", "b"), n, replace = TRUE),
                number = sample(1:3, n, replace = TRUE),
                weight = runif(n))
X[, weight := weight/sum(weight)]

# > X
#      color letter number      weight
# 1:   Blue      a       2  0.03300399
# 2:    Red      a       3  0.08170773
# 3:   Blue      a       2  0.03374477
# 4:  Green      a       1  0.03248830
# 5:   Blue      a       3  0.04636611
# 6:    Red      b       2  0.08684298
# 7:    Red      a       1  0.08413131
# 8:  Green      a       1  0.03796001
# 9:  Green      b       3  0.07566126
# 10:   Red      b       2  0.09350268
# 11:   Red      b       2  0.04230800
# 12:  Blue      a       3  0.06935330
# 13:  Blue      b       3  0.03893384
# 14:  Blue      a       2  0.03166846
# 15: Green      a       2  0.07369181
# 16: Green      b       2  0.01972925
# 17: Green      a       2  0.06921767
# 18: Green      b       1  0.01184500
# 19:   Red      b       2  0.02389486
# 20:  Blue      b       2  0.01394867

P <- X[, prop.table(xtabs(~color + letter + number), c("color", "letter"))]
P <- data.table(P)
setnames(P, "N", "percentage")

# > P
#     color letter number percentage
# 1:   Blue      a      1  0.0000000
# 2:  Green      a      1  0.5000000
# 3:    Red      a      1  0.5000000
# 4:   Blue      b      1  0.0000000
# 5:  Green      b      1  0.3333333
# 6:    Red      b      1  0.0000000
# 7:   Blue      a      2  0.6000000
# 8:  Green      a      2  0.5000000
# 9:    Red      a      2  0.0000000
# 10:  Blue      b      2  0.5000000
# 11: Green      b      2  0.3333333
# 12:   Red      b      2  1.0000000
# 13:  Blue      a      3  0.4000000
# 14: Green      a      3  0.0000000
# 15:   Red      a      3  0.5000000
# 16:  Blue      b      3  0.5000000
# 17: Green      b      3  0.3333333
# 18:   Red      b      3  0.0000000

P[, sum(percentage), .(color, letter)]

# > P[, sum(percentage), .(color, letter)]
#     color letter V1
# 1:  Blue       a  1
# 2: Green       a  1
# 3:   Red       a  1
# 4:  Blue       b  1
# 5: Green       b  1
# 6:   Red       b  1

如您所见,这些是未加权的比例。我想根据我的 weight 变量对它们进行加权。最终,百分比应该仍然加到 1。我知道用 prop.table()xtabs() 这两个函数可能无法实现这个结果,所以如果有其他解决方案也可以。

使用data.table,我们通过'color'、'letter'、'number'得到'weight'的sum,然后通过'color'、'letter' 并通过将 'n' 除以 'n'

sum 来创建道具
library(data.table)
X[, .(n = sum(weight)), .(color, letter, number)][,
        prop  := n/sum(n), .(color, letter)][order(color)]

-输出

#  color letter number          n      prop
# 1:  Blue      a      2 0.09841723 0.4596001
# 2:  Blue      a      3 0.11571941 0.5403999
# 3:  Blue      b      3 0.03893384 0.7362328
# 4:  Blue      b      2 0.01394867 0.2637672
# 5: Green      a      1 0.07044831 0.3301886
# 6: Green      b      3 0.07566126 0.7055616
# 7: Green      a      2 0.14290947 0.6698114
# 8: Green      b      2 0.01972925 0.1839805
# 9: Green      b      1 0.01184500 0.1104578
#10:   Red      a      3 0.08170773 0.4926930
#11:   Red      b      2 0.24654852 1.0000000
#12:   Red      a      1 0.08413131 0.5073070

或使用dplyr

library(dplyr)
X %>% 
     count(color, letter, number, wt = weight) %>%
     group_by(color, letter) %>% 
     mutate(prop = n/sum(n))
# A tibble: 12 x 5
# Groups:   color, letter [6]
#   color letter number      n  prop
#   <chr> <chr>   <int>  <dbl> <dbl>
# 1 Blue  a           2 0.0984 0.460
# 2 Blue  a           3 0.116  0.540
# 3 Blue  b           2 0.0139 0.264
# 4 Blue  b           3 0.0389 0.736
# 5 Green a           1 0.0704 0.330
# 6 Green a           2 0.143  0.670
# 7 Green b           1 0.0118 0.110
# 8 Green b           2 0.0197 0.184
# 9 Green b           3 0.0757 0.706
#10 Red   a           1 0.0841 0.507
#11 Red   a           3 0.0817 0.493
#12 Red   b           2 0.247  1    

如果我们想要所有组合,请使用 complete

library(tidyr)
X %>% 
     count(color, letter, number, wt = weight) %>%
     group_by(color, letter) %>% 
     mutate(prop = n/sum(n)) %>% 
     ungroup %>% 
     complete(color, letter, number, fill = list(n = 0, prop = 0))
# A tibble: 18 x 5
#   color letter number      n  prop
#   <chr> <chr>   <int>  <dbl> <dbl>
# 1 Blue  a           1 0      0    
# 2 Blue  a           2 0.0984 0.460
# 3 Blue  a           3 0.116  0.540
# 4 Blue  b           1 0      0    
# 5 Blue  b           2 0.0139 0.264
# 6 Blue  b           3 0.0389 0.736
# 7 Green a           1 0.0704 0.330
# 8 Green a           2 0.143  0.670
# 9 Green a           3 0      0    
#10 Green b           1 0.0118 0.110
#11 Green b           2 0.0197 0.184
#12 Green b           3 0.0757 0.706
#13 Red   a           1 0.0841 0.507
#14 Red   a           2 0      0    
#15 Red   a           3 0.0817 0.493
#16 Red   b           1 0      0    
#17 Red   b           2 0.247  1    
#18 Red   b           3 0      0    

CJdata.table

X[, .(n = sum(weight)), .(color, letter, number)][,
    prop  := n/sum(n), .(color, letter)][order(color)][
   CJ(color, letter, number, unique = TRUE), on = .(color, letter, number)]

或使用 xtabs/prop.table

setnames(as.data.table(X[, prop.table(xtabs(weight~color + 
   letter + number), c("color", "letter"))])[N > 0][order(color)], "N", "prop")[]
#    color letter number      prop
# 1:  Blue      a      2 0.4596001
# 2:  Blue      b      2 0.2637672
# 3:  Blue      a      3 0.5403999
# 4:  Blue      b      3 0.7362328
# 5: Green      a      1 0.3301886
# 6: Green      b      1 0.1104578
# 7: Green      a      2 0.6698114
# 8: Green      b      2 0.1839805
# 9: Green      b      3 0.7055616
#10:   Red      a      1 0.5073070
#11:   Red      b      2 1.0000000
#12:   Red      a      3 0.4926930