为多元分类数据创建比例矩阵

Create proportion matrix for multivariate categorical data

假设我从下面的 R 代码中模拟了这个数据:

library(RNGforGPD)

# Fix the RNG state so the simulated sample is reproducible.
set.seed(1)

# Simulation settings: number of observations and number of variables.
sample.size <- 10
no.gpois <- 3

# One lambda and one theta per variable; both vectors are handed to the
# RNGforGPD calls below (presumably the generalized Poisson parameters --
# confirm against the package documentation).
lambda.vec <- c(-0.2, 0.2, -0.3)
theta.vec <- c(1, 3, 4)

# Build the symmetric 3 x 3 matrix TV: the three values in M fill the lower
# triangle of N, adding the transpose mirrors them into the upper triangle,
# and the diagonal is then reset to 1.
M <- c(0.352, 0.265, 0.342)
N <- diag(3)
N[lower.tri(N)] <- M
TV <- N + t(N)
diag(TV) <- 1

# Intermediate matrix derived from TV, then the simulated data set itself.
cstar <- CmatStarGpois(TV, theta.vec, lambda.vec, verbose = TRUE)
data <- GenMVGpois(
  sample.size, no.gpois, cstar, theta.vec, lambda.vec,
  details = FALSE
)

> prop.table(table(data[,1]))

  0   1   2 
0.3 0.4 0.3 
> prop.table(table(data[,2]))

  2   3   6   8  10 
0.2 0.4 0.1 0.2 0.1 
> prop.table(table(data[,3]))

  2   3   4   5   6 
0.2 0.3 0.1 0.3 0.1 
> table(data)
data
 0  1  2  3  4  5  6  8 10 
 3  4  7  7  1  3  2  2  1 

我想创建一个比例矩阵,其中每一列对应三个分类变量中的一个,每一行对应一个类别。如果某一列中没有出现某个类别,则该类别在这一列中的比例记为 0。期望的结果如下:

Cat X1 X2 X3
0 0.3 0.0 0.0
1 0.4 0.0 0.0
2 0.3 0.2 0.2
3 0.0 0.4 0.3
4 0.0 0.0 0.1
5 0.0 0.0 0.3
6 0.0 0.1 0.1
8 0.0 0.2 0.0
10 0.0 0.1 0.0

这是 data-对象:

dput(data)
structure(c(1, 0, 2, 1, 0, 0, 1, 2, 2, 1, 3, 8, 3, 3, 2, 2, 6, 
3, 10, 8, 2, 5, 2, 6, 3, 3, 4, 3, 5, 5), .Dim = c(10L, 3L), .Dimnames = list(
    NULL, NULL))
library(dplyr)
library(tidyr)

# Name the columns so they can be referred to after reshaping.
colnames(data) <- c("X1", "X2", "X3")

# Proportion of each category within each variable: one row per category,
# one column per variable, 0 where a category never occurs in a variable.
as_tibble(data) %>%
  # Long format: one row per (variable name, observed value).
  pivot_longer(cols = X1:X3, values_to = "Cat") %>%
  # Number of occurrences of each value within each variable.
  count(name, Cat) %>%
  # Turn counts into within-variable proportions. A grouped mutate() is used
  # because returning several rows per group from summarise() is deprecated
  # since dplyr 1.1.0.
  group_by(name) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  select(-n) %>%
  # Back to wide format; categories absent from a variable are filled with 0
  # directly instead of patching NAs afterwards.
  pivot_wider(
    names_from = name,
    values_from = proportion,
    values_fill = 0
  ) %>%
  arrange(Cat)

# A tibble: 9 × 4
    Cat    X1    X2    X3
  <dbl> <dbl> <dbl> <dbl>
1     0   0.3   0     0  
2     1   0.4   0     0  
3     2   0.3   0.2   0.2
4     3   0     0.4   0.3
5     4   0     0     0.1
6     5   0     0     0.3
7     6   0     0.1   0.1
8     8   0     0.2   0  
9    10   0     0.1   0 

如果需要矩阵形式的结果,可以对上面的结果再调用 `as.matrix()`。

也可以用基础 R 分步完成:先找出所有出现过的类别,再逐列统计计数,最后把计数转换为比例。

# Start with just the Cat column: every distinct value observed anywhere in
# `data`, in ascending order. c(data) flattens the matrix into one vector.
props <- data.frame(Cat = sort(unique(c(data))))

# The entries are obtained by running table() on each column individually.
# lapply() over the column indices always returns a list, whereas
# apply(data, 2, table) silently simplifies to a matrix whenever every column
# happens to have the same number of distinct values.
lapply(seq_len(ncol(data)), function(j) table(data[, j]))
[[1]]

0 1 2   # these are actually character valued names
3 4 3   # while these are the count values

[[2]]

 2  3  6  8 10 
 2  4  1  2  1 

[[3]]

2 3 4 5 6 
2 3 1 3 1 

现在遍历该列表以填充与 Cat 列匹配的值:

 # Count table: the Cat column followed by one count column per column of
 # `data`. Passing the data frame as the first argument makes cbind() return
 # a data frame.
 props2 <- cbind(
   props,
   lapply(
     # Iterate over column indices (keeping any column names) so the result
     # is always a list; apply(data, 2, table) would collapse to a matrix if
     # all columns had the same number of distinct values.
     setNames(seq_len(ncol(data)), colnames(data)),
     function(j) {
       counts <- table(data[, j])
       # Named vector of zeros, one slot per category in props$Cat ...
       filled <- setNames(numeric(nrow(props)), props$Cat)
       # ... then overwrite the slots whose names occur in this column.
       filled[names(counts)] <- counts
       filled
     }
   )
 )
props2
#-------------
   Cat V1 V2 V3
0    0  3  0  0
1    1  4  0  0
2    2  3  2  2
3    3  0  4  3
4    4  0  0  1
5    5  0  0  3
6    6  0  1  1
8    8  0  2  0
10  10  0  1  0
#---
# Now just "proportionalize" those counts: divide every count column by its
# column total (margin = 2). Selecting with -1 covers all columns except Cat,
# so this works for any number of variables rather than exactly three.
props2[-1] <- prop.table(data.matrix(props2[-1]), margin = 2)
props2
 #-------------
   Cat  V1  V2  V3
0    0 0.3 0.0 0.0
1    1 0.4 0.0 0.0
2    2 0.3 0.2 0.2
3    3 0.0 0.4 0.3
4    4 0.0 0.0 0.1
5    5 0.0 0.0 0.3
6    6 0.0 0.1 0.1
8    8 0.0 0.2 0.0
10  10 0.0 0.1 0.0