向插入符号中的自定义 VGAM::vglm 模型添加交互时出错
Error adding interactions to custom VGAM::vglm model in caret
我使用 VGAM
中的 vglm()
使用 caret
构建了自定义模型。它可以很好地处理简单的效果,但是当我尝试添加交互时它会失败并显示 object 'x1:x2' not found
错误消息,其中 x1
和 x2
是我输入到模型中的预测变量相互作用。问题与预测有关,除非我弄错了,否则它似乎会发生,因为 predict.train
或 predictvglm
试图使用 x1:x2
来预测 类.
我在下面提供了一个工作示例。
# Set up data
set.seed(123)
n <- 100
x1 <- rnorm(n, 175, 7)
x2 <- rnorm(n, 30, 8)
cont <- 0.5 * x1 - 0.3 * x2 + 10 + rnorm(n, 0, 6)
y <- cut(cont, breaks = quantile(cont), include.lowest = TRUE,
labels = c("A", "B", "C", "D"), ordered = TRUE)
d <- data.frame(x1, x2, y)
# My custom caret function
vglmTrain <- list(
label = "VGAM prop odds",
library = "VGAM",
loop = NULL,
type = "Classification",
parameters = data.frame(parameter = "parameter",
class = "character",
label = "parameter"),
grid = function(x, y,
len = NULL, search = "grid") data.frame(parameter = "none"),
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
dat <- if(is.data.frame(x)) x else as.data.frame(x)
dat$.outcome <- y
if(!is.null(wts))
{
out <- vglm(.outcome ~ ., propodds, data = dat, weights = wts, ...)
} else {
out <- vglm(.outcome ~ ., propodds, data = dat, ...)
}
out
},
predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
probs <- predict(modelFit, data.frame(newdata), type = "response")
predClass <- function (x) {
n <- colnames(x)
factor(as.vector(apply(x, 1, which.max)),
levels = 1:length(n),
labels = n)
}
predClass(probs)
},
prob = function(modelFit, newdata, preProc = NULL, submodels = NULL)
predict(modelFit, data.frame(newdata), type = "response"),
predictors = function(x, ...) names(attributes(terms(x))$dataClasses[-1]),
levels = function(x) x@misc$ynames,
sort = function(x) x)
现在,如果我尝试使用该函数,如果我为公式提供交互,它会退出并出现错误。
# Load caret
library(caret)
ctrl <- trainControl(method = "cv", number = 2, verboseIter = T)
# A model with no interactions - works
f1 <- train(y ~ x1 + x2, data = d,
method = vglmTrain,
trControl = ctrl)
# A model with interactions - fails
f2 <- train(y ~ x1*x2, data = d,
method = vglmTrain,
trControl = ctrl)
Error in train.default(x, y, weights = w, ...) : Stopping
In addition: Warning messages:
1: In eval(expr, envir, enclos) :
predictions failed for Fold1: parameter=none Error in eval(expr, envir, enclos) : object 'x1:x2' not found
2: In eval(expr, envir, enclos) :
predictions failed for Fold2: parameter=none Error in eval(expr, envir, enclos) : object 'x1:x2' not found
3: In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.
这是我的 sessionInfo():
> sessionInfo()
R version 3.2.4 (2016-03-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] splines stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] VGAM_1.0-0 caret_6.0-64 ggplot2_2.1.0 lattice_0.20-33
loaded via a namespace (and not attached):
[1] Rcpp_0.12.3 magrittr_1.5 MASS_7.3-45 munsell_0.4.3 colorspace_1.2-6 foreach_1.4.3 minqa_1.2.4 stringr_1.0.0 car_2.1-1
[10] plyr_1.8.3 tools_3.2.4 nnet_7.3-12 pbkrtest_0.4-6 parallel_3.2.4 grid_3.2.4 gtable_0.2.0 nlme_3.1-125 mgcv_1.8-12
[19] quantreg_5.21 e1071_1.6-7 class_7.3-14 MatrixModels_0.4-1 iterators_1.0.8 lme4_1.1-11 Matrix_1.2-3 nloptr_1.0.4 reshape2_1.4.1
[28] codetools_0.2-14 stringi_1.0-1 compiler_3.2.4 scales_0.4.0 SparseM_1.7
有人知道如何解决这个问题吗?
Caret 确实处理交互。但是我找到了解决方法。您可以先调用 model.matrix 来创建具有交互作用的矩阵。您还需要删除拦截。
以你的f2为例,我们指定的数据不是公式,而是x和y。 x 包含带有交互的 model.matrix 规范,-1 删除拦截。这已转换为 data.frame,您可以开始了。
f2 <- train(y = y, x = data.frame(model.matrix(y ~ x1*x2 - 1, data = d)),
method = vglmTrain,
trControl = ctrl)
编辑:
在调试 train.default 并检查您的模型类型规范和其他一些之后,我发现了一项检查是在插入符号模型中完成的,而不是在您的模型中完成的。检查与 predict 和 probs 函数有关。这些都对 Dataframe 进行了检查。如果您将此检查添加到这两个函数,它会按预期工作。
if (!is.data.frame(newdata))
newdata <- as.data.frame(newdata)
整个函数将是:
vglmTrain <- list(
label = "VGAM prop odds",
library = "VGAM",
loop = NULL,
type = "Classification",
parameters = data.frame(parameter = "parameter",
class = "character",
label = "parameter"),
grid = function(x, y,
len = NULL, search = "grid") data.frame(parameter = "none"),
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
dat <- if(is.data.frame(x)) x else as.data.frame(x)
dat$.outcome <- y
if(!is.null(wts))
{
out <- vglm(.outcome ~ ., propodds, data = dat, weights = wts, ...)
} else {
out <- vglm(.outcome ~ ., propodds, data = dat, ...)
}
out
},
predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
if (!is.data.frame(newdata))
newdata <- as.data.frame(newdata)
probs <- predict(modelFit, newdata, type = "response")
predClass <- function (x) {
n <- colnames(x)
factor(as.vector(apply(x, 1, which.max)),
levels = 1:length(n),
labels = n)
}
predClass(probs)
},
prob = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
if (!is.data.frame(newdata))
newdata <- as.data.frame(newdata)
predict(modelFit, newdata, type = "response")
},
levels = function(x) x@misc$ynames,
tags = c("Cumulative Link", "Logistic Regression", "Accepts Case Weights",
"Probit", "Logit"),
sort = function(x) x)
Phiver 的解决方案在此示例中运行良好,但当我添加虚拟编码变量时,模型再次失败。
我做了更多的挖掘,问题似乎确实发生了,因为 data.frame
更改了要预测的数据集中列的名称。在我的代码中对 predict
的两次调用中,我现在添加了 data.frame(newdata, check.names = F)
,这似乎可以解决问题。
现在可以使用公式界面
f2 <- train(y ~ x1 * x2, data = d,
method = vglmTrain,
trControl = ctrl)
和模型矩阵法
f2 <- train(y = y, x = data.frame(model.matrix(y ~ x1*x2 - 1, data = d)),
method = vglmTrain,
trControl = ctrl)
这是新代码:
vglmTrain <- list(
label = "VGAM prop odds",
library = "VGAM",
loop = NULL,
type = "Classification",
parameters = data.frame(parameter = "parameter",
class = "character",
label = "parameter"),
grid = function(x, y,
len = NULL, search = "grid") data.frame(parameter = "none"),
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
dat <- if(is.data.frame(x)) x else as.data.frame(x)
dat$.outcome <- y
if(!is.null(wts))
{
out <- vglm(.outcome ~ ., propodds, data = dat, weights = wts, ...)
} else {
out <- vglm(.outcome ~ ., propodds, data = dat, ...)
}
out
},
predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
probs <- predict(modelFit, data.frame(newdata, check.names = F), type = "response")
predClass <- function (x) {
n <- colnames(x)
factor(as.vector(apply(x, 1, which.max)),
levels = 1:length(n),
labels = n)
}
predClass(probs)
},
prob = function(modelFit, newdata, preProc = NULL, submodels = NULL)
predict(modelFit, data.frame(newdata, check.names = F), type = "response"),
levels = function(x) x@misc$ynames,
tags = c("Cumulative Link", "Logistic Regression", "Accepts Case Weights",
"Probit", "Logit"),
sort = function(x) x)
我使用 VGAM
中的 vglm()
使用 caret
构建了自定义模型。它可以很好地处理简单的效果,但是当我尝试添加交互时它会失败并显示 object 'x1:x2' not found
错误消息,其中 x1
和 x2
是我输入到模型中的预测变量相互作用。问题与预测有关,除非我弄错了,否则它似乎会发生,因为 predict.train
或 predictvglm
试图使用 x1:x2
来预测 类.
我在下面提供了一个工作示例。
# Set up data
set.seed(123)
n <- 100
x1 <- rnorm(n, 175, 7)
x2 <- rnorm(n, 30, 8)
cont <- 0.5 * x1 - 0.3 * x2 + 10 + rnorm(n, 0, 6)
y <- cut(cont, breaks = quantile(cont), include.lowest = TRUE,
labels = c("A", "B", "C", "D"), ordered = TRUE)
d <- data.frame(x1, x2, y)
# My custom caret function
vglmTrain <- list(
label = "VGAM prop odds",
library = "VGAM",
loop = NULL,
type = "Classification",
parameters = data.frame(parameter = "parameter",
class = "character",
label = "parameter"),
grid = function(x, y,
len = NULL, search = "grid") data.frame(parameter = "none"),
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
dat <- if(is.data.frame(x)) x else as.data.frame(x)
dat$.outcome <- y
if(!is.null(wts))
{
out <- vglm(.outcome ~ ., propodds, data = dat, weights = wts, ...)
} else {
out <- vglm(.outcome ~ ., propodds, data = dat, ...)
}
out
},
predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
probs <- predict(modelFit, data.frame(newdata), type = "response")
predClass <- function (x) {
n <- colnames(x)
factor(as.vector(apply(x, 1, which.max)),
levels = 1:length(n),
labels = n)
}
predClass(probs)
},
prob = function(modelFit, newdata, preProc = NULL, submodels = NULL)
predict(modelFit, data.frame(newdata), type = "response"),
predictors = function(x, ...) names(attributes(terms(x))$dataClasses[-1]),
levels = function(x) x@misc$ynames,
sort = function(x) x)
现在,如果我尝试使用该函数,如果我为公式提供交互,它会退出并出现错误。
# Load caret
library(caret)
ctrl <- trainControl(method = "cv", number = 2, verboseIter = T)
# A model with no interactions - works
f1 <- train(y ~ x1 + x2, data = d,
method = vglmTrain,
trControl = ctrl)
# A model with interactions - fails
f2 <- train(y ~ x1*x2, data = d,
method = vglmTrain,
trControl = ctrl)
Error in train.default(x, y, weights = w, ...) : Stopping
In addition: Warning messages:
1: In eval(expr, envir, enclos) :
predictions failed for Fold1: parameter=none Error in eval(expr, envir, enclos) : object 'x1:x2' not found
2: In eval(expr, envir, enclos) :
predictions failed for Fold2: parameter=none Error in eval(expr, envir, enclos) : object 'x1:x2' not found
3: In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.
这是我的 sessionInfo():
> sessionInfo()
R version 3.2.4 (2016-03-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] splines stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] VGAM_1.0-0 caret_6.0-64 ggplot2_2.1.0 lattice_0.20-33
loaded via a namespace (and not attached):
[1] Rcpp_0.12.3 magrittr_1.5 MASS_7.3-45 munsell_0.4.3 colorspace_1.2-6 foreach_1.4.3 minqa_1.2.4 stringr_1.0.0 car_2.1-1
[10] plyr_1.8.3 tools_3.2.4 nnet_7.3-12 pbkrtest_0.4-6 parallel_3.2.4 grid_3.2.4 gtable_0.2.0 nlme_3.1-125 mgcv_1.8-12
[19] quantreg_5.21 e1071_1.6-7 class_7.3-14 MatrixModels_0.4-1 iterators_1.0.8 lme4_1.1-11 Matrix_1.2-3 nloptr_1.0.4 reshape2_1.4.1
[28] codetools_0.2-14 stringi_1.0-1 compiler_3.2.4 scales_0.4.0 SparseM_1.7
有人知道如何解决这个问题吗?
Caret 确实处理交互。但是我找到了解决方法。您可以先调用 model.matrix 来创建具有交互作用的矩阵。您还需要删除拦截。
以你的f2为例,我们指定的数据不是公式,而是x和y。 x 包含带有交互的 model.matrix 规范,-1 删除拦截。这已转换为 data.frame,您可以开始了。
f2 <- train(y = y, x = data.frame(model.matrix(y ~ x1*x2 - 1, data = d)),
method = vglmTrain,
trControl = ctrl)
编辑:
在调试 train.default 并检查您的模型类型规范和其他一些之后,我发现了一项检查是在插入符号模型中完成的,而不是在您的模型中完成的。检查与 predict 和 probs 函数有关。这些都对 Dataframe 进行了检查。如果您将此检查添加到这两个函数,它会按预期工作。
if (!is.data.frame(newdata))
newdata <- as.data.frame(newdata)
整个函数将是:
vglmTrain <- list(
label = "VGAM prop odds",
library = "VGAM",
loop = NULL,
type = "Classification",
parameters = data.frame(parameter = "parameter",
class = "character",
label = "parameter"),
grid = function(x, y,
len = NULL, search = "grid") data.frame(parameter = "none"),
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
dat <- if(is.data.frame(x)) x else as.data.frame(x)
dat$.outcome <- y
if(!is.null(wts))
{
out <- vglm(.outcome ~ ., propodds, data = dat, weights = wts, ...)
} else {
out <- vglm(.outcome ~ ., propodds, data = dat, ...)
}
out
},
predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
if (!is.data.frame(newdata))
newdata <- as.data.frame(newdata)
probs <- predict(modelFit, newdata, type = "response")
predClass <- function (x) {
n <- colnames(x)
factor(as.vector(apply(x, 1, which.max)),
levels = 1:length(n),
labels = n)
}
predClass(probs)
},
prob = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
if (!is.data.frame(newdata))
newdata <- as.data.frame(newdata)
predict(modelFit, newdata, type = "response")
},
levels = function(x) x@misc$ynames,
tags = c("Cumulative Link", "Logistic Regression", "Accepts Case Weights",
"Probit", "Logit"),
sort = function(x) x)
Phiver 的解决方案在此示例中运行良好,但当我添加虚拟编码变量时,模型再次失败。
我做了更多的挖掘,问题似乎确实发生了,因为 data.frame
更改了要预测的数据集中列的名称。在我的代码中对 predict
的两次调用中,我现在添加了 data.frame(newdata, check.names = F)
,这似乎可以解决问题。
现在可以使用公式界面
f2 <- train(y ~ x1 * x2, data = d,
method = vglmTrain,
trControl = ctrl)
和模型矩阵法
f2 <- train(y = y, x = data.frame(model.matrix(y ~ x1*x2 - 1, data = d)),
method = vglmTrain,
trControl = ctrl)
这是新代码:
vglmTrain <- list(
label = "VGAM prop odds",
library = "VGAM",
loop = NULL,
type = "Classification",
parameters = data.frame(parameter = "parameter",
class = "character",
label = "parameter"),
grid = function(x, y,
len = NULL, search = "grid") data.frame(parameter = "none"),
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
dat <- if(is.data.frame(x)) x else as.data.frame(x)
dat$.outcome <- y
if(!is.null(wts))
{
out <- vglm(.outcome ~ ., propodds, data = dat, weights = wts, ...)
} else {
out <- vglm(.outcome ~ ., propodds, data = dat, ...)
}
out
},
predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
probs <- predict(modelFit, data.frame(newdata, check.names = F), type = "response")
predClass <- function (x) {
n <- colnames(x)
factor(as.vector(apply(x, 1, which.max)),
levels = 1:length(n),
labels = n)
}
predClass(probs)
},
prob = function(modelFit, newdata, preProc = NULL, submodels = NULL)
predict(modelFit, data.frame(newdata, check.names = F), type = "response"),
levels = function(x) x@misc$ynames,
tags = c("Cumulative Link", "Logistic Regression", "Accepts Case Weights",
"Probit", "Logit"),
sort = function(x) x)