为多元分类数据创建比例矩阵
Create proportion matrix for multivariate categorical data
假设我从下面的 R 代码中模拟了这个数据:
library(RNGforGPD)
set.seed(1)
sample.size = 10; no.gpois = 3
lambda.vec = c(-0.2, 0.2, -0.3); theta.vec = c(1, 3, 4)
M = c(0.352, 0.265, 0.342); N = diag(3); N[lower.tri(N)] = M
TV = N + t(N); diag(TV) = 1
cstar = CmatStarGpois(TV, theta.vec, lambda.vec, verbose = TRUE)
data = GenMVGpois(sample.size, no.gpois, cstar, theta.vec, lambda.vec, details = FALSE)
> prop.table(table(data[,1]))
0 1 2
0.3 0.4 0.3
> prop.table(table(data[,2]))
2 3 6 8 10
0.2 0.4 0.1 0.2 0.1
> prop.table(table(data[,3]))
2 3 4 5 6
0.2 0.3 0.1 0.3 0.1
> table(data)
data
0 1 2 3 4 5 6 8 10
3 4 7 7 1 3 2 2 1
我想为三个分类变量中的每一个创建一个比例矩阵。如果特定列缺少类别,它将被标识为 0。
Cat
X1
X2
X3
0
0.3
0.0
0.0
1
0.4
0.0
0.0
2
0.3
0.2
0.2
3
0.0
0.4
0.3
4
0.0
0.0
0.1
5
0.0
0.0
0.3
6
0.0
0.1
0.1
8
0.0
0.2
0.0
10
0.0
0.1
0.0
这是 data
-对象:
dput(data)
structure(c(1, 0, 2, 1, 0, 0, 1, 2, 2, 1, 3, 8, 3, 3, 2, 2, 6,
3, 10, 8, 2, 5, 2, 6, 3, 3, 4, 3, 5, 5), .Dim = c(10L, 3L), .Dimnames = list(
NULL, NULL))
colnames(data) <- c("X1", "X2", "X3")
as_tibble(data) %>%
pivot_longer(cols = "X1":"X3", values_to = "Cat") %>%
group_by(name, Cat) %>%
count() %>%
ungroup(Cat) %>%
summarize(name, Cat, proportion = n / sum(n)) %>%
pivot_wider(names_from = name, values_from = proportion) %>%
arrange(Cat) %>%
replace(is.na(.), 0)
# A tibble: 9 × 4
Cat X1 X2 X3
<dbl> <dbl> <dbl> <dbl>
1 0 0.3 0 0
2 1 0.4 0 0
3 2 0.3 0.2 0.2
4 3 0 0.4 0.3
5 4 0 0 0.1
6 5 0 0 0.3
7 6 0 0.1 0.1
8 8 0 0.2 0
9 10 0 0.1 0
如果你想要它作为矩阵,你可以使用as.matrix()
尝试将逻辑放在代码序列中的适当位置。
props <- data.frame(Cat = sort(unique(c(data))) ) # Just the Cat column
#Now fill in the entries
# the entries will be obtained with table function
apply(data, 2, table) # run `table(.)` over the columns individually
[[1]]
0 1 2 # these are actually character valued names
3 4 3 # while these are the count values
[[2]]
2 3 6 8 10
2 4 1 2 1
[[3]]
2 3 4 5 6
2 3 1 3 1
现在遍历该列表以填充与 Cat 列匹配的值:
props2 <- cbind(props, # using dfrm first argument returns dataframe object
lapply( apply(data, 2, table) , # irregular results are a list
function(col) { # first make a named vector of zeros
x <- setNames(rep(0,length(props$Cat)), props$Cat)
# could have skipped that step by using `tabulate`
# then fill with values using names as indices
x[names(col)] <- col # values to matching names
x}) )
props2
#-------------
Cat V1 V2 V3
0 0 3 0 0
1 1 4 0 0
2 2 3 2 2
3 3 0 4 3
4 4 0 0 1
5 5 0 0 3
6 6 0 1 1
8 8 0 2 0
10 10 0 1 0
#---
# now just "proportionalize" those counts
props2[2:4] <- prop.table(data.matrix(props2[2:4]), margin=2)
props2
#-------------
Cat V1 V2 V3
0 0 0.3 0.0 0.0
1 1 0.4 0.0 0.0
2 2 0.3 0.2 0.2
3 3 0.0 0.4 0.3
4 4 0.0 0.0 0.1
5 5 0.0 0.0 0.3
6 6 0.0 0.1 0.1
8 8 0.0 0.2 0.0
10 10 0.0 0.1 0.0
假设我从下面的 R 代码中模拟了这个数据:
library(RNGforGPD)
set.seed(1)
sample.size = 10; no.gpois = 3
lambda.vec = c(-0.2, 0.2, -0.3); theta.vec = c(1, 3, 4)
M = c(0.352, 0.265, 0.342); N = diag(3); N[lower.tri(N)] = M
TV = N + t(N); diag(TV) = 1
cstar = CmatStarGpois(TV, theta.vec, lambda.vec, verbose = TRUE)
data = GenMVGpois(sample.size, no.gpois, cstar, theta.vec, lambda.vec, details = FALSE)
> prop.table(table(data[,1]))
0 1 2
0.3 0.4 0.3
> prop.table(table(data[,2]))
2 3 6 8 10
0.2 0.4 0.1 0.2 0.1
> prop.table(table(data[,3]))
2 3 4 5 6
0.2 0.3 0.1 0.3 0.1
> table(data)
data
0 1 2 3 4 5 6 8 10
3 4 7 7 1 3 2 2 1
我想为三个分类变量中的每一个创建一个比例矩阵。如果特定列缺少类别,它将被标识为 0。
Cat | X1 | X2 | X3 |
---|---|---|---|
0 | 0.3 | 0.0 | 0.0 |
1 | 0.4 | 0.0 | 0.0 |
2 | 0.3 | 0.2 | 0.2 |
3 | 0.0 | 0.4 | 0.3 |
4 | 0.0 | 0.0 | 0.1 |
5 | 0.0 | 0.0 | 0.3 |
6 | 0.0 | 0.1 | 0.1 |
8 | 0.0 | 0.2 | 0.0 |
10 | 0.0 | 0.1 | 0.0 |
这是 data
-对象:
dput(data)
structure(c(1, 0, 2, 1, 0, 0, 1, 2, 2, 1, 3, 8, 3, 3, 2, 2, 6,
3, 10, 8, 2, 5, 2, 6, 3, 3, 4, 3, 5, 5), .Dim = c(10L, 3L), .Dimnames = list(
NULL, NULL))
colnames(data) <- c("X1", "X2", "X3")
as_tibble(data) %>%
pivot_longer(cols = "X1":"X3", values_to = "Cat") %>%
group_by(name, Cat) %>%
count() %>%
ungroup(Cat) %>%
summarize(name, Cat, proportion = n / sum(n)) %>%
pivot_wider(names_from = name, values_from = proportion) %>%
arrange(Cat) %>%
replace(is.na(.), 0)
# A tibble: 9 × 4
Cat X1 X2 X3
<dbl> <dbl> <dbl> <dbl>
1 0 0.3 0 0
2 1 0.4 0 0
3 2 0.3 0.2 0.2
4 3 0 0.4 0.3
5 4 0 0 0.1
6 5 0 0 0.3
7 6 0 0.1 0.1
8 8 0 0.2 0
9 10 0 0.1 0
如果你想要它作为矩阵,你可以使用as.matrix()
尝试将逻辑放在代码序列中的适当位置。
props <- data.frame(Cat = sort(unique(c(data))) ) # Just the Cat column
#Now fill in the entries
# the entries will be obtained with table function
apply(data, 2, table) # run `table(.)` over the columns individually
[[1]]
0 1 2 # these are actually character valued names
3 4 3 # while these are the count values
[[2]]
2 3 6 8 10
2 4 1 2 1
[[3]]
2 3 4 5 6
2 3 1 3 1
现在遍历该列表以填充与 Cat 列匹配的值:
props2 <- cbind(props, # using dfrm first argument returns dataframe object
lapply( apply(data, 2, table) , # irregular results are a list
function(col) { # first make a named vector of zeros
x <- setNames(rep(0,length(props$Cat)), props$Cat)
# could have skipped that step by using `tabulate`
# then fill with values using names as indices
x[names(col)] <- col # values to matching names
x}) )
props2
#-------------
Cat V1 V2 V3
0 0 3 0 0
1 1 4 0 0
2 2 3 2 2
3 3 0 4 3
4 4 0 0 1
5 5 0 0 3
6 6 0 1 1
8 8 0 2 0
10 10 0 1 0
#---
# now just "proportionalize" those counts
props2[2:4] <- prop.table(data.matrix(props2[2:4]), margin=2)
props2
#-------------
Cat V1 V2 V3
0 0 0.3 0.0 0.0
1 1 0.4 0.0 0.0
2 2 0.3 0.2 0.2
3 3 0.0 0.4 0.3
4 4 0.0 0.0 0.1
5 5 0.0 0.0 0.3
6 6 0.0 0.1 0.1
8 8 0.0 0.2 0.0
10 10 0.0 0.1 0.0