从模型输出中收集信息的简单方法
Simple way to gather information from model output
考虑这个数据框:
set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
State = rep(c("NY","MA","FL","GA"), each = 10),
Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
ID = rep(c(1:10), each = 2),
var1 = rnorm(200),
var2 = rnorm(200),
var3 = rnorm(200),
var4 = rnorm(200),
var5 = rnorm(200),
var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)
我正在使用 mclust 包来拟合混合模型并获得自举标准误差,如下所示:
library(tidyverse)
library(mclust)
set.seed(123)
mod <- Mclust(dat1[,5:10], G=5, modelNames = "VEI", initialization = list(hcPairs = randomPairs(dat1[,5:10], seed = 123)))
plot(mod, what = "classification")
plot(mod, what = "density")
boot <- MclustBootstrap(mod, nboot = 25, type = "bs")#25 for now to speed up
接下来我想创建一个新的数据框来显示每个观察的混合概率,所以我将原始的 dat1
与 mod$z
结合起来。
probs <- mod$z
colnames(probs) <- paste0("Prob", 1:mod$G)
probs <- cbind(dat1, probs)
probs <- cbind(probs, cluster = mod$classification)
现在我要收集:
1) 每个集群中每个变量的平均值,以及 2) 每个平均值的标准误差(来自 summary(boot, what = "SE")$mean
)。我将使用这些来创建下面的图,return 一个 table 显示平均值和 SE 值。
newdat <- cbind(dat1, cluster = mod$classification)
a <-
newdat%>%
dplyr::select_if(is.numeric)%>%
dplyr::group_by(cluster)%>%
summarise_all(mean)%>%
pivot_longer(-c("cluster"), names_to = "Vars", values_to = "mean")
b<-
as.data.frame(cbind(cluster=1:mod$G, t(summary(boot, what = "se")$mean)))%>%
pivot_longer(., -c("cluster"), names_to = "Vars", values_to = "SE")
a<-dplyr::mutate(a, "SE" = b$SE)
ggplot(a, aes(x=Vars, y=mean, group=factor(cluster))) +
geom_line(aes(colour=factor(cluster)))+
geom_point()+
geom_errorbar(aes(ymin=mean-SE, ymax=mean+SE), width = .1)+
ggtitle("Mean by cluster") +
expand_limits(y=0) +
scale_y_continuous(breaks=0:20*4) +
labs(x = "Var", y = "Cluster Average")+
theme_bw() +
theme(legend.justification=c(1,0),
legend.position=c(1,0))
library(knitr)
kable(a)
最终,我想编写一个函数来执行这些步骤中的每一个,并立即 return 输出。它将应用于结构类似于 dat1
的数据帧,只是它们具有不同数量的 var
列。我还需要指定要使用的集群数量以及创建 mod
时要使用的模型。什么是我收集存储在 a
中的信息的更好方法,以便可以将相同的过程应用于变量(var.
列)和混合成分(G
的任意组合在mod
) 中从一个函数中指定?
也许你可以按照这些思路做一些事情:
set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
State = rep(c("NY","MA","FL","GA"), each = 10),
Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
ID = rep(c(1:10), each = 2),
var1 = rnorm(200),
var2 = rnorm(200),
var3 = rnorm(200),
var4 = rnorm(200),
var5 = rnorm(200),
var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)
library(ggplot2)
library(data.table)
library(mclust)
#> Package 'mclust' version 5.4.6
#> Type 'citation("mclust")' for citing this R package in publications.
clusterVars <- function(dat, col_idx, G=5, seed=123){
mod <- Mclust(dat[, col_idx], G=G, modelNames = "VEI",
initialization = list(hcPairs = randomPairs(dat[, col_idx], seed = seed)))
plot(mod, what = "classification")
plot(mod, what = "density")
boot <- MclustBootstrap(mod, nboot = 25, type = "bs")#25 for now to speed up
mydat <- data.table(dat, mod$z, cluster = mod$classification)
setnames(mydat, old= paste0("V", seq_len(G)), new=paste0("Prob", seq_len(G)))
a <- mydat[,(mean = lapply(.SD, mean)), by = c("cluster"),
.SDcols=colnames(dat)[col_idx]]
a <- setkey(melt(
a, id.vars = "cluster", variable.name = "Vars", value.name = "mean"),
cluster, Vars)
b <- setkey(melt(
data.table(cluster = seq_len(G), t(summary(boot, what = "se")$mean)),
id.vars = "cluster", variable.name = "Vars", value.name = "SE"),
cluster, Vars)
out <- a[b]
p <- ggplot(out, aes(x=Vars, y=mean, group=factor(cluster))) +
geom_line(aes(colour=factor(cluster)))+
geom_point()+
geom_errorbar(aes(ymin=mean-SE, ymax=mean+SE), width = .1)+
ggtitle("Mean by cluster") +
expand_limits(y=0) +
scale_y_continuous(breaks=0:20*4) +
labs(x = "Var", y = "Cluster Average")+
theme_bw() +
theme(legend.justification=c(1,0),
legend.position=c(1,0))
list(plot = p, table = out)
}
out <- clusterVars(dat1, col_idx = 5:10, G = 5)
plot(out$plot)
out$table
#> cluster Vars mean SE
#> 1: 1 var1 -0.239846636 0.2286331
#> 2: 1 var2 0.496303497 0.4237945
#> 3: 1 var3 0.045489269 0.2392820
#> 4: 1 var4 0.035899698 0.3157938
#> 5: 1 var5 0.852212286 0.4683157
#> 6: 1 var6 -0.652884807 0.3000536
#> 7: 2 var1 -0.177353942 0.4626206
#> 8: 2 var2 -0.155370426 0.4879140
#> 9: 2 var3 0.760466010 0.4773900
#> 10: 2 var4 0.000245426 0.5210745
#> 11: 2 var5 -0.534355806 0.3960204
#> 12: 2 var6 0.023759375 0.3364228
#> 13: 3 var1 2.308773892 0.7389746
#> 14: 3 var2 -0.095760375 0.6652415
#> 15: 3 var3 1.538477219 0.6248455
#> 16: 3 var4 0.423748712 0.9034914
#> 17: 3 var5 0.526404192 0.4475889
#> 18: 3 var6 0.147645599 0.4308516
#> 19: 4 var1 0.223028905 0.5078972
#> 20: 4 var2 -0.028866278 0.2597171
#> 21: 4 var3 -1.005246422 0.5143203
#> 22: 4 var4 -0.084080497 0.5511872
#> 23: 4 var5 -0.194968718 0.3312450
#> 24: 4 var6 0.255504356 0.3616244
#> 25: 5 var1 0.023540106 0.4559344
#> 26: 5 var2 -0.572302120 0.5058384
#> 27: 5 var3 0.782263291 0.3809646
#> 28: 5 var4 -0.202702860 0.3913615
#> 29: 5 var5 0.061927990 1.0652221
#> 30: 5 var6 2.019626078 0.5999117
#> cluster Vars mean SE
由 reprex package (v0.3.0)
于 2020-06-15 创建
考虑这个数据框:
set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
State = rep(c("NY","MA","FL","GA"), each = 10),
Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
ID = rep(c(1:10), each = 2),
var1 = rnorm(200),
var2 = rnorm(200),
var3 = rnorm(200),
var4 = rnorm(200),
var5 = rnorm(200),
var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)
我正在使用 mclust 包来拟合混合模型并获得自举标准误差,如下所示:
library(tidyverse)
library(mclust)
set.seed(123)
mod <- Mclust(dat1[,5:10], G=5, modelNames = "VEI", initialization = list(hcPairs = randomPairs(dat1[,5:10], seed = 123)))
plot(mod, what = "classification")
plot(mod, what = "density")
boot <- MclustBootstrap(mod, nboot = 25, type = "bs")#25 for now to speed up
接下来我想创建一个新的数据框来显示每个观察的混合概率,所以我将原始的 dat1
与 mod$z
结合起来。
probs <- mod$z
colnames(probs) <- paste0("Prob", 1:mod$G)
probs <- cbind(dat1, probs)
probs <- cbind(probs, cluster = mod$classification)
现在我要收集:
1) 每个集群中每个变量的平均值,以及 2) 每个平均值的标准误差(来自 summary(boot, what = "SE")$mean
)。我将使用这些来创建下面的图,return 一个 table 显示平均值和 SE 值。
newdat <- cbind(dat1, cluster = mod$classification)
a <-
newdat%>%
dplyr::select_if(is.numeric)%>%
dplyr::group_by(cluster)%>%
summarise_all(mean)%>%
pivot_longer(-c("cluster"), names_to = "Vars", values_to = "mean")
b<-
as.data.frame(cbind(cluster=1:mod$G, t(summary(boot, what = "se")$mean)))%>%
pivot_longer(., -c("cluster"), names_to = "Vars", values_to = "SE")
a<-dplyr::mutate(a, "SE" = b$SE)
ggplot(a, aes(x=Vars, y=mean, group=factor(cluster))) +
geom_line(aes(colour=factor(cluster)))+
geom_point()+
geom_errorbar(aes(ymin=mean-SE, ymax=mean+SE), width = .1)+
ggtitle("Mean by cluster") +
expand_limits(y=0) +
scale_y_continuous(breaks=0:20*4) +
labs(x = "Var", y = "Cluster Average")+
theme_bw() +
theme(legend.justification=c(1,0),
legend.position=c(1,0))
library(knitr)
kable(a)
最终,我想编写一个函数来执行这些步骤中的每一个,并立即 return 输出。它将应用于结构类似于 dat1
的数据帧,只是它们具有不同数量的 var
列。我还需要指定要使用的集群数量以及创建 mod
时要使用的模型。什么是我收集存储在 a
中的信息的更好方法,以便可以将相同的过程应用于变量(var.
列)和混合成分(G
的任意组合在mod
) 中从一个函数中指定?
也许你可以按照这些思路做一些事情:
set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
State = rep(c("NY","MA","FL","GA"), each = 10),
Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
ID = rep(c(1:10), each = 2),
var1 = rnorm(200),
var2 = rnorm(200),
var3 = rnorm(200),
var4 = rnorm(200),
var5 = rnorm(200),
var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)
library(ggplot2)
library(data.table)
library(mclust)
#> Package 'mclust' version 5.4.6
#> Type 'citation("mclust")' for citing this R package in publications.
clusterVars <- function(dat, col_idx, G=5, seed=123){
mod <- Mclust(dat[, col_idx], G=G, modelNames = "VEI",
initialization = list(hcPairs = randomPairs(dat[, col_idx], seed = seed)))
plot(mod, what = "classification")
plot(mod, what = "density")
boot <- MclustBootstrap(mod, nboot = 25, type = "bs")#25 for now to speed up
mydat <- data.table(dat, mod$z, cluster = mod$classification)
setnames(mydat, old= paste0("V", seq_len(G)), new=paste0("Prob", seq_len(G)))
a <- mydat[,(mean = lapply(.SD, mean)), by = c("cluster"),
.SDcols=colnames(dat)[col_idx]]
a <- setkey(melt(
a, id.vars = "cluster", variable.name = "Vars", value.name = "mean"),
cluster, Vars)
b <- setkey(melt(
data.table(cluster = seq_len(G), t(summary(boot, what = "se")$mean)),
id.vars = "cluster", variable.name = "Vars", value.name = "SE"),
cluster, Vars)
out <- a[b]
p <- ggplot(out, aes(x=Vars, y=mean, group=factor(cluster))) +
geom_line(aes(colour=factor(cluster)))+
geom_point()+
geom_errorbar(aes(ymin=mean-SE, ymax=mean+SE), width = .1)+
ggtitle("Mean by cluster") +
expand_limits(y=0) +
scale_y_continuous(breaks=0:20*4) +
labs(x = "Var", y = "Cluster Average")+
theme_bw() +
theme(legend.justification=c(1,0),
legend.position=c(1,0))
list(plot = p, table = out)
}
out <- clusterVars(dat1, col_idx = 5:10, G = 5)
plot(out$plot)
out$table
#> cluster Vars mean SE
#> 1: 1 var1 -0.239846636 0.2286331
#> 2: 1 var2 0.496303497 0.4237945
#> 3: 1 var3 0.045489269 0.2392820
#> 4: 1 var4 0.035899698 0.3157938
#> 5: 1 var5 0.852212286 0.4683157
#> 6: 1 var6 -0.652884807 0.3000536
#> 7: 2 var1 -0.177353942 0.4626206
#> 8: 2 var2 -0.155370426 0.4879140
#> 9: 2 var3 0.760466010 0.4773900
#> 10: 2 var4 0.000245426 0.5210745
#> 11: 2 var5 -0.534355806 0.3960204
#> 12: 2 var6 0.023759375 0.3364228
#> 13: 3 var1 2.308773892 0.7389746
#> 14: 3 var2 -0.095760375 0.6652415
#> 15: 3 var3 1.538477219 0.6248455
#> 16: 3 var4 0.423748712 0.9034914
#> 17: 3 var5 0.526404192 0.4475889
#> 18: 3 var6 0.147645599 0.4308516
#> 19: 4 var1 0.223028905 0.5078972
#> 20: 4 var2 -0.028866278 0.2597171
#> 21: 4 var3 -1.005246422 0.5143203
#> 22: 4 var4 -0.084080497 0.5511872
#> 23: 4 var5 -0.194968718 0.3312450
#> 24: 4 var6 0.255504356 0.3616244
#> 25: 5 var1 0.023540106 0.4559344
#> 26: 5 var2 -0.572302120 0.5058384
#> 27: 5 var3 0.782263291 0.3809646
#> 28: 5 var4 -0.202702860 0.3913615
#> 29: 5 var5 0.061927990 1.0652221
#> 30: 5 var6 2.019626078 0.5999117
#> cluster Vars mean SE
由 reprex package (v0.3.0)
于 2020-06-15 创建