在二项式 GLMM 上使用预测函数:生成数值而非二项式值
Using predict function on binomial GLMM: producing a numerical not binomial value
我一直在 运行在大型数据集上使用 glmmTMB 包构建 GLMM。下面的示例数据
structure(list(code = structure(c(1L, 1L, 1L, 1L), .Label = c("2388",
"2390", "12950", "12952", "12954", "12956", "12958", "12960",
"12962", "12964", "12966", "12968", "13573", "13574", "13575",
"13576", "13577", "14203", "19318", "19319", "19320", "19321",
"19322", "19515", "19517", "19523", "19524", "25534", "25535",
"25536", "25537", "25539", "25540", "25541", "25542", "25543",
"25544", "25545", "25546", "25547", "25548", "25549", "25550",
"25552", "25553", "27583", "27584", "27585", "27586", "27588",
"27589", "27590", "27591", "27592", "27593", "27594", "27595",
"27596", "27597", "27598", "27599", "27600", "27601", "27602",
"27603", "27604", "27605", "27606", "27607", "27608", "27609",
"27610", "27611", "27613", "27614", "27615", "27616", "27617",
"27618", "27619", "27620", "27621", "27622", "27624", "27625",
"27626", "27627", "27629", "27630", "27631", "27632", "34176",
"34177", "34178", "34179", "52975", "52977", "52978", "54814",
"54815", "54816", "54817", "54821", "54822", "54823", "54824",
"54825", "54835", "54837", "54838", "54839", "54840", "54841",
"54842", "54843", "54844", "54845", "54846", "54847", "54848",
"54849", "54851", "54852", "54853", "54856", "54858", "54859",
"54860", "54863", "54864", "54866", "54869", "54872", "54873",
"54874", "54875", "54876", "54877", "54878", "54880", "54882",
"54883", "54884", "54886", "54887", "54889", "54890", "54892",
"54893", "54895", "54896", "54898", "54899", "54900", "54901",
"54903", "54904", "54905", "54906", "54911", "54912", "54914",
"54915", "54931", "54933", "54934", "54935", "54937", "54939",
"54940", "54941", "54942", "54943", "54944", "54945", "54946",
"54947", "54948", "54950", "54952", "54954", "54955", "54957",
"54958", "54959", "54961", "54962"), class = "factor"), station =
c("PB14",
"PB14", "PB16", "PB16"), species = c("Silvertip Shark", "Silvertip Shark",
"Silvertip Shark", "Silvertip Shark"), sex = c("F", "F", "F",
"F"), size = c(112, 112, 112, 112), datetime = c("1466247120",
"1466247420", "1467026100", "1469621400"), year = c("2016", "2016",
"2016", "2016"), month = c(6, 6, 6, 7), hour = c("11", "11",
"12", "13"), season = c("dry season", "dry season", "dry season",
"dry season"), daynight = c("day", "day", "day", "day"), time_diff = c(4,
5, 5821, 43255), offshore = structure(c(2L, 2L, 1L, 1L), .Label =
c("offshore",
"onshore"), class = "factor"), rowN = 1:4), row.names = c(NA,
4L), class = "data.frame")
我正在寻找 运行 我 80% 数据上的模型,然后使用 predict() 函数对其进行验证。使用下面的代码
Off_80 <- Off %>% sample_frac(.80)
Off_20 <- anti_join(Off, Off_80, by = 'rowN')
OffMod_80 <- glmmTMB(offshore ~ sex + log(size) + species*daynight + species*season + (1|code), family=binomial(), data=Off_80)
pred_Off20 <- as.data.frame(predict(OffMod_80, newdata=Off_20, type="response", allow.new.levels=TRUE))
然后我会将预测结果与观察结果进行比较,以验证模型的强度。
但是使用这个而不是得到 'offshore' 或 'onshore' 响应,我得到了一个数值。
predict()
1 0.2807461
2 0.2631816
3 0.2631816
4 0.2807461
有没有办法让预测函数吐出二项式响应?或者我如何将这些值解释为二项式?
最初我将响应变量设置为 1 或 0,但随后 post 我将值更改为因子。但是predict()还是会吐出一个数值。
感谢任何帮助!
根据您在模型中为测试数据集中表达的值提供的预测变量,预测函数为您提供离岸变量 "Offshore" 的概率。您可能想查看此问题的答案:Type parameter of the predict() function. Where they highlight the difference between ?predict.glm and ?predict.rpart. I can not replicate your analysis from the data you provided but look at the simple example from: https://www.theanalysisfactor.com/r-tutorial-glm1/。这显示了 glm 的工作原理(通常)以及 predict.glm 函数为您提供的功能。我希望这有帮助。
predict()
对于二项式模型 return 是成功或失败的 概率 ,它不会 return 1 或 0(因为您只能以一定的 概率 预测该结果)。所以如果你想检查你的模型性能,你可以尝试计算曲线下的面积:
library(glmmTMB)
library(pROC)
data(mtcars)
n <- nrow(mtcars)
train <- sample(1:n, n * .8, TRUE)
mtcars_train <- mtcars[train, ]
mtcars_test <- mtcars[-train, ]
m <- glmmTMB(formula = vs ~ hp + wt + (1 | gear), family = binomial, data = mtcars_train)
p <- predict(m, newdata = mtcars_test)
auc(roc(response = mtcars_test$vs, predictor = p))
#> Area under the curve: 0.9107
由 reprex package (v0.3.0)
于 2019-05-29 创建
另一种方法是将完整模型的正确预测百分比 (PCP) 与空模型进行比较(参见 Herron, M. (1999)。有限因变量模型中的后估计不确定性。政治分析,8, 83–98)。在这里,完整模型的 PCP 应该明显更高:
library(glmmTMB)
library(insight)
data(mtcars)
m <- glmmTMB(formula = vs ~ hp + wt + (1 | gear), family = binomial, data = mtcars)
m0 <- glmmTMB(formula = vs ~ 1 + (1 | gear), family = binomial, data = mtcars)
y <- insight::get_response(m)
y0 <- insight::get_response(m0)
n <- nobs(m)
n0 <- nobs(m0)
p <- predict(m, type = "response")
p0 <- predict(m0, type = "response")
pcp_full <- (sum(1 - p[y == 0]) + sum(p[y == 1])) / n
pcp_null <- (sum(1 - p0[y0 == 0]) + sum(p0[y0 == 1])) / n0
# percentage correct predictions full model
pcp_full
#> [1] 0.8374718
# percentage correct predictions null model
pcp_null
#> [1] 0.6614221
由 reprex package (v0.3.0)
于 2019-05-29 创建
最后,如果您真的想要 0
和 1
之类的东西,您可以使用 simulate()
,它的 returns 值与原始响应的规模相同.你可以比较有多少模拟响应与实际响应相匹配:
library(glmmTMB)
data(mtcars)
m <- glmmTMB(formula = vs ~ hp + wt + (1 | gear), family = binomial, data = mtcars)
# simulate response, first column = successes
s <- simulate(m)$sim_1[, 1]
# proportion of response values that equal simulated responses
mean(mtcars$vs == s)
#> [1] 0.875
由 reprex package (v0.3.0)
于 2019-05-29 创建
我一直在 运行在大型数据集上使用 glmmTMB 包构建 GLMM。下面的示例数据
structure(list(code = structure(c(1L, 1L, 1L, 1L), .Label = c("2388",
"2390", "12950", "12952", "12954", "12956", "12958", "12960",
"12962", "12964", "12966", "12968", "13573", "13574", "13575",
"13576", "13577", "14203", "19318", "19319", "19320", "19321",
"19322", "19515", "19517", "19523", "19524", "25534", "25535",
"25536", "25537", "25539", "25540", "25541", "25542", "25543",
"25544", "25545", "25546", "25547", "25548", "25549", "25550",
"25552", "25553", "27583", "27584", "27585", "27586", "27588",
"27589", "27590", "27591", "27592", "27593", "27594", "27595",
"27596", "27597", "27598", "27599", "27600", "27601", "27602",
"27603", "27604", "27605", "27606", "27607", "27608", "27609",
"27610", "27611", "27613", "27614", "27615", "27616", "27617",
"27618", "27619", "27620", "27621", "27622", "27624", "27625",
"27626", "27627", "27629", "27630", "27631", "27632", "34176",
"34177", "34178", "34179", "52975", "52977", "52978", "54814",
"54815", "54816", "54817", "54821", "54822", "54823", "54824",
"54825", "54835", "54837", "54838", "54839", "54840", "54841",
"54842", "54843", "54844", "54845", "54846", "54847", "54848",
"54849", "54851", "54852", "54853", "54856", "54858", "54859",
"54860", "54863", "54864", "54866", "54869", "54872", "54873",
"54874", "54875", "54876", "54877", "54878", "54880", "54882",
"54883", "54884", "54886", "54887", "54889", "54890", "54892",
"54893", "54895", "54896", "54898", "54899", "54900", "54901",
"54903", "54904", "54905", "54906", "54911", "54912", "54914",
"54915", "54931", "54933", "54934", "54935", "54937", "54939",
"54940", "54941", "54942", "54943", "54944", "54945", "54946",
"54947", "54948", "54950", "54952", "54954", "54955", "54957",
"54958", "54959", "54961", "54962"), class = "factor"), station =
c("PB14",
"PB14", "PB16", "PB16"), species = c("Silvertip Shark", "Silvertip Shark",
"Silvertip Shark", "Silvertip Shark"), sex = c("F", "F", "F",
"F"), size = c(112, 112, 112, 112), datetime = c("1466247120",
"1466247420", "1467026100", "1469621400"), year = c("2016", "2016",
"2016", "2016"), month = c(6, 6, 6, 7), hour = c("11", "11",
"12", "13"), season = c("dry season", "dry season", "dry season",
"dry season"), daynight = c("day", "day", "day", "day"), time_diff = c(4,
5, 5821, 43255), offshore = structure(c(2L, 2L, 1L, 1L), .Label =
c("offshore",
"onshore"), class = "factor"), rowN = 1:4), row.names = c(NA,
4L), class = "data.frame")
我正在寻找 运行 我 80% 数据上的模型,然后使用 predict() 函数对其进行验证。使用下面的代码
Off_80 <- Off %>% sample_frac(.80)
Off_20 <- anti_join(Off, Off_80, by = 'rowN')
OffMod_80 <- glmmTMB(offshore ~ sex + log(size) + species*daynight + species*season + (1|code), family=binomial(), data=Off_80)
pred_Off20 <- as.data.frame(predict(OffMod_80, newdata=Off_20, type="response", allow.new.levels=TRUE))
然后我会将预测结果与观察结果进行比较,以验证模型的强度。
但是使用这个而不是得到 'offshore' 或 'onshore' 响应,我得到了一个数值。
predict()
1 0.2807461
2 0.2631816
3 0.2631816
4 0.2807461
有没有办法让预测函数吐出二项式响应?或者我如何将这些值解释为二项式?
最初我将响应变量设置为 1 或 0,但随后 post 我将值更改为因子。但是predict()还是会吐出一个数值。
感谢任何帮助!
根据您在模型中为测试数据集中表达的值提供的预测变量,预测函数为您提供离岸变量 "Offshore" 的概率。您可能想查看此问题的答案:Type parameter of the predict() function. Where they highlight the difference between ?predict.glm and ?predict.rpart. I can not replicate your analysis from the data you provided but look at the simple example from: https://www.theanalysisfactor.com/r-tutorial-glm1/。这显示了 glm 的工作原理(通常)以及 predict.glm 函数为您提供的功能。我希望这有帮助。
predict()
对于二项式模型 return 是成功或失败的 概率 ,它不会 return 1 或 0(因为您只能以一定的 概率 预测该结果)。所以如果你想检查你的模型性能,你可以尝试计算曲线下的面积:
library(glmmTMB)
library(pROC)
data(mtcars)
n <- nrow(mtcars)
train <- sample(1:n, n * .8, TRUE)
mtcars_train <- mtcars[train, ]
mtcars_test <- mtcars[-train, ]
m <- glmmTMB(formula = vs ~ hp + wt + (1 | gear), family = binomial, data = mtcars_train)
p <- predict(m, newdata = mtcars_test)
auc(roc(response = mtcars_test$vs, predictor = p))
#> Area under the curve: 0.9107
由 reprex package (v0.3.0)
于 2019-05-29 创建另一种方法是将完整模型的正确预测百分比 (PCP) 与空模型进行比较(参见 Herron, M. (1999)。有限因变量模型中的后估计不确定性。政治分析,8, 83–98)。在这里,完整模型的 PCP 应该明显更高:
library(glmmTMB)
library(insight)
data(mtcars)
m <- glmmTMB(formula = vs ~ hp + wt + (1 | gear), family = binomial, data = mtcars)
m0 <- glmmTMB(formula = vs ~ 1 + (1 | gear), family = binomial, data = mtcars)
y <- insight::get_response(m)
y0 <- insight::get_response(m0)
n <- nobs(m)
n0 <- nobs(m0)
p <- predict(m, type = "response")
p0 <- predict(m0, type = "response")
pcp_full <- (sum(1 - p[y == 0]) + sum(p[y == 1])) / n
pcp_null <- (sum(1 - p0[y0 == 0]) + sum(p0[y0 == 1])) / n0
# percentage correct predictions full model
pcp_full
#> [1] 0.8374718
# percentage correct predictions null model
pcp_null
#> [1] 0.6614221
由 reprex package (v0.3.0)
于 2019-05-29 创建最后,如果您真的想要 0
和 1
之类的东西,您可以使用 simulate()
,它的 returns 值与原始响应的规模相同.你可以比较有多少模拟响应与实际响应相匹配:
library(glmmTMB)
data(mtcars)
m <- glmmTMB(formula = vs ~ hp + wt + (1 | gear), family = binomial, data = mtcars)
# simulate response, first column = successes
s <- simulate(m)$sim_1[, 1]
# proportion of response values that equal simulated responses
mean(mtcars$vs == s)
#> [1] 0.875
由 reprex package (v0.3.0)
于 2019-05-29 创建