从模型输出中收集信息的简单方法

Simple way to gather information from model output

考虑这个数据框:

set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
                   State = rep(c("NY","MA","FL","GA"), each = 10),
                   Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
                   ID = rep(c(1:10), each = 2),
                   var1 = rnorm(200),
                   var2 = rnorm(200),
                   var3 = rnorm(200),
                   var4 = rnorm(200),
                   var5 = rnorm(200),
                   var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)

我正在使用 mclust 包来拟合混合模型并获得自举标准误差,如下所示:

library(tidyverse)
library(mclust)
set.seed(123)
mod <- Mclust(dat1[,5:10], G=5, modelNames = "VEI", initialization = list(hcPairs = randomPairs(dat1[,5:10], seed = 123)))
plot(mod, what = "classification")
plot(mod, what = "density")
boot <- MclustBootstrap(mod, nboot = 25, type = "bs")#25 for now to speed up

接下来我想创建一个新的数据框来显示每个观察的混合概率,所以我将原始的 dat1mod$z 结合起来。

probs <- mod$z
colnames(probs) <- paste0("Prob", 1:mod$G)
probs <- cbind(dat1, probs)
probs <- cbind(probs, cluster = mod$classification)

现在我要收集: 1) 每个集群中每个变量的平均值,以及 2) 每个平均值的标准误差(来自 summary(boot, what = "SE")$mean)。我将使用这些来创建下面的图,return 一个 table 显示平均值和 SE 值。

newdat <- cbind(dat1, cluster = mod$classification)
a <-
  newdat%>%
  dplyr::select_if(is.numeric)%>%
  dplyr::group_by(cluster)%>%
  summarise_all(mean)%>%
  pivot_longer(-c("cluster"), names_to = "Vars", values_to = "mean")
b<-
as.data.frame(cbind(cluster=1:mod$G, t(summary(boot, what = "se")$mean)))%>%
   pivot_longer(., -c("cluster"), names_to = "Vars", values_to = "SE")
a<-dplyr::mutate(a, "SE" = b$SE)

ggplot(a, aes(x=Vars, y=mean, group=factor(cluster))) + 
  geom_line(aes(colour=factor(cluster)))+
  geom_point()+
  geom_errorbar(aes(ymin=mean-SE, ymax=mean+SE), width = .1)+
  ggtitle("Mean by cluster") +
    expand_limits(y=0) +                        
    scale_y_continuous(breaks=0:20*4) +         
    labs(x = "Var", y = "Cluster Average")+
    theme_bw() +
    theme(legend.justification=c(1,0),
          legend.position=c(1,0))
library(knitr)
kable(a)             

最终,我想编写一个函数来执行这些步骤中的每一个,并立即 return 输出。它将应用于结构类似于 dat1 的数据帧,只是它们具有不同数量的 var 列。我还需要指定要使用的集群数量以及创建 mod 时要使用的模型。什么是我收集存储在 a 中的信息的更好方法,以便可以将相同的过程应用于变量(var. 列)和混合成分(G 的任意组合在mod) 中从一个函数中指定?

也许你可以按照这些思路做一些事情:

set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
                   State = rep(c("NY","MA","FL","GA"), each = 10),
                   Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
                   ID = rep(c(1:10), each = 2),
                   var1 = rnorm(200),
                   var2 = rnorm(200),
                   var3 = rnorm(200),
                   var4 = rnorm(200),
                   var5 = rnorm(200),
                   var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)
library(ggplot2)
library(data.table)
library(mclust)
#> Package 'mclust' version 5.4.6
#> Type 'citation("mclust")' for citing this R package in publications.

clusterVars <- function(dat, col_idx, G=5, seed=123){
    mod <- Mclust(dat[, col_idx], G=G, modelNames = "VEI", 
        initialization = list(hcPairs = randomPairs(dat[, col_idx], seed = seed)))
    plot(mod, what = "classification")
    plot(mod, what = "density")
    boot <- MclustBootstrap(mod, nboot = 25, type = "bs")#25 for now to speed up
    mydat <- data.table(dat, mod$z, cluster = mod$classification)
    setnames(mydat, old= paste0("V", seq_len(G)), new=paste0("Prob", seq_len(G)))
    a <- mydat[,(mean = lapply(.SD, mean)), by = c("cluster"), 
                .SDcols=colnames(dat)[col_idx]]
    a <- setkey(melt(
        a, id.vars = "cluster", variable.name = "Vars", value.name = "mean"), 
        cluster, Vars)
    b <- setkey(melt(
        data.table(cluster = seq_len(G), t(summary(boot, what = "se")$mean)), 
        id.vars = "cluster", variable.name = "Vars", value.name = "SE"),
        cluster, Vars)
    out <- a[b]
    p <- ggplot(out, aes(x=Vars, y=mean, group=factor(cluster))) + 
        geom_line(aes(colour=factor(cluster)))+
        geom_point()+
        geom_errorbar(aes(ymin=mean-SE, ymax=mean+SE), width = .1)+
        ggtitle("Mean by cluster") +
        expand_limits(y=0) +                        
        scale_y_continuous(breaks=0:20*4) +         
        labs(x = "Var", y = "Cluster Average")+
        theme_bw() +
        theme(legend.justification=c(1,0),
              legend.position=c(1,0))
    list(plot = p, table = out)       
}

out <- clusterVars(dat1, col_idx = 5:10, G = 5)

plot(out$plot)

out$table
#>     cluster Vars         mean        SE
#>  1:       1 var1 -0.239846636 0.2286331
#>  2:       1 var2  0.496303497 0.4237945
#>  3:       1 var3  0.045489269 0.2392820
#>  4:       1 var4  0.035899698 0.3157938
#>  5:       1 var5  0.852212286 0.4683157
#>  6:       1 var6 -0.652884807 0.3000536
#>  7:       2 var1 -0.177353942 0.4626206
#>  8:       2 var2 -0.155370426 0.4879140
#>  9:       2 var3  0.760466010 0.4773900
#> 10:       2 var4  0.000245426 0.5210745
#> 11:       2 var5 -0.534355806 0.3960204
#> 12:       2 var6  0.023759375 0.3364228
#> 13:       3 var1  2.308773892 0.7389746
#> 14:       3 var2 -0.095760375 0.6652415
#> 15:       3 var3  1.538477219 0.6248455
#> 16:       3 var4  0.423748712 0.9034914
#> 17:       3 var5  0.526404192 0.4475889
#> 18:       3 var6  0.147645599 0.4308516
#> 19:       4 var1  0.223028905 0.5078972
#> 20:       4 var2 -0.028866278 0.2597171
#> 21:       4 var3 -1.005246422 0.5143203
#> 22:       4 var4 -0.084080497 0.5511872
#> 23:       4 var5 -0.194968718 0.3312450
#> 24:       4 var6  0.255504356 0.3616244
#> 25:       5 var1  0.023540106 0.4559344
#> 26:       5 var2 -0.572302120 0.5058384
#> 27:       5 var3  0.782263291 0.3809646
#> 28:       5 var4 -0.202702860 0.3913615
#> 29:       5 var5  0.061927990 1.0652221
#> 30:       5 var6  2.019626078 0.5999117
#>     cluster Vars         mean        SE

reprex package (v0.3.0)

于 2020-06-15 创建