predict.cv.glmnet() 如何计算二项式模型的 link 值?

How does predict.cv.glmnet() calculate the link value for binomial models?

我认为这是我的编码错误,但我不知道自己做错了什么。当 x+x^2 在二项式 glmnet 模型中时,我正在尝试为 x 创建边际效应图。当我在新数据集上使用 predict.cv.glmnet() 进行预测时,模型系数的符号似乎被切换了。所以我一直在比较硬编码模型公式和预测函数时的预测。我对新数据而不是训练数据得到不同的预测,我不确定为什么。

#############example############
library(tidyverse)
library(glmnet)
data(BinomialExample)
#head(x)
#head(y)


#training data
squareit<-function(x){x^2}
x%>%
  as.data.frame()%>%
  dplyr::select(V11, V23, V30)%>%#select three variables
  mutate_all(list(SQ = squareit))%>%#square them
  as.matrix()%>%{.->>x1}#back to matrix
# x1[1:5,]
#             V11         V23        V30    V11_SQ       V23_SQ    V30_SQ
# [1,]  0.5706039  0.01473546  0.9080937 0.3255888 0.0002171337 0.8246342
# [2,]  0.4138668 -0.81898245 -2.0492817 0.1712857 0.6707322586 4.1995554
# [3,] -0.4876853  0.03927982 -1.2089006 0.2378369 0.0015429039 1.4614407
# [4,]  0.4644753 -0.98728378  1.7796744 0.2157373 0.9747292699 3.1672408
# [5,] -0.5239595 -0.13288456 -1.1970731 0.2745336 0.0176583076 1.4329841

#example model + coefficients
set.seed(4484)
final.glmnet<-cv.glmnet(x1, y, type.measure="auc",alpha=1,family="binomial")
coef(final.glmnet,s="lambda.min")#print model coefficients
# (Intercept)  0.4097381
# V11         -0.4487949
# V23          0.3322128
# V30         -0.2924600
# V11_SQ      -0.3334376
# V23_SQ      -0.2867963
# V30_SQ       0.3864748

#get model formula - I'm sure there is a better way
mc<-data.frame(as.matrix(coef(final.glmnet,s="lambda.min")))%>%
  rownames_to_column(.,var="rowname")%>%mutate(rowname=recode(rowname,`(Intercept)`="1"))
paste("(",mc$rowname,"*",mc$X1,")",sep="",collapse = "+")
#model formula
# (1*0.409738094215867)+(V11*-0.44879489356345)+(V23*0.332212780157358)+
# (V30*-0.292459974168585)+(V11_SQ*-0.333437624504966)+
# (V23_SQ*-0.286796292157334)+(V30_SQ*0.386474813542322)

#####predict on training data -- same predictions!
#1 predict using predict.cv.glmnet()
pred_cv.glmnet<-predict(final.glmnet, s=final.glmnet$lambda.min, newx=x1[1:5,], type='link')
round(pred_cv.glmnet,3)
#2 predict using model formula
pred.model.formula<-data.frame(x1[1:5,])%>%
  mutate(link=(1*0.409738094215867)+(V11*-0.44879489356345)+
           (V23*0.332212780157358)+(V30*-0.292459974168585)+
           (V11_SQ*-0.333437624504966)+(V23_SQ*-0.286796292157334)+
           (V30_SQ*0.386474813542322))%>%
  dplyr::select(link)
round(pred.model.formula,3)
# 1         0.103
# 2         1.925
# 3         1.480
# 4         0.225
# 5         1.408

#new data - set up for maringal effects plot
colnames(x1)
test<-data.frame(V23=seq(min(x1[,2]),max(x1[,2]),0.1))%>%#let V23 vary from min-max by .1
  mutate(V23_SQ=V23^2, V11=0, V11_SQ=0, V30=0, V30_SQ=0)%>%#square V23 and set others to 0
as.matrix()
test[1:5,]
# > test[1:5,]
#           V23   V23_SQ V11 V11_SQ V30 V30_SQ
# [1,] -2.071573 4.291414   0      0   0      0
# [2,] -1.971573 3.887100   0      0   0      0
# [3,] -1.871573 3.502785   0      0   0      0
# [4,] -1.771573 3.138470   0      0   0      0
# [5,] -1.671573 2.794156   0      0   0      0

#####predict on new data -- different predictions!
#1 predict using predict.cv.glmnet()
pred_cv.glmnet<-predict(final.glmnet, s=final.glmnet$lambda.min, newx=test[1:5,], type='link')
round(pred_cv.glmnet,3)
# [1,] 2.765
# [2,] 2.586
# [3,] 2.413
# [4,] 2.247
# [5,] 2.088

#2 predict using model formula
pred.model.formula<-data.frame(test[1:5,])%>%
  mutate(link.longhand=(1*0.409738094215867)+(V11*-0.44879489356345)+
           (V23*0.332212780157358)+(V30*-0.292459974168585)+
           (V11_SQ*-0.333437624504966)+(V23_SQ*-0.286796292157334)+
           (V30_SQ*0.386474813542322))%>%
  dplyr::select(link.longhand)
round(pred.model.formula,3)
# 1        -1.509
# 2        -1.360
# 3        -1.217
# 4        -1.079
# 5        -0.947

您对计算原理的理解似乎是正确的。但是,predict 函数要求 newx 矩阵具有与原始矩阵相同的维度(即列需要以相同的顺序排列)。当列混淆时,predict 函数结果将变得混乱,如您在示例中所见。

如果您添加一个快速调整,将 test 矩阵的顺序与 x1 矩阵的顺序相同(方便的快捷方式是仅使用原始矩阵中的 dimnames dplyr::select 就像下面的第 3 行)...

test <- data.frame(V23 = seq(min(x1[, 2]), max(x1[, 2]), 0.1)) %>% #let V23 vary from min-max by .1
    mutate(V23_SQ = V23^2, V11 = 0, V11_SQ = 0, V30 = 0, V30_SQ = 0) %>% #square V23 and set others to 0
    select(dimnames(x1)[[2]]) %>% #use the dimnames from x1 to select test in proper order!
    as.matrix()

...您会发现您得到了符合预期的正确结果。

#1 predict using predict.cv.glmnet()
pred_cv.glmnet <- predict(final.glmnet, s = 'lambda.min', newx = test[1:5,], type = 'link')
round(pred_cv.glmnet,3)
# [1,] -1.509
# [2,] -1.360
# [3,] -1.217
# [4,] -1.079
# [5,] -0.947

#2 predict using model formula
pred.model.formula <- data.frame(test[1:5,]) %>%
    mutate(link.longhand = ((1 * 0.409738094215867) +
               (V11 * -0.44879489356345) +
               (V23 * 0.332212780157358) +
               (V30 * -0.292459974168585) +
               (V11_SQ * -0.333437624504966) +
               (V23_SQ * -0.286796292157334) +
               (V30_SQ * 0.386474813542322))) %>%
    dplyr::select(link.longhand)
round(pred.model.formula, 3)
# link.longhand
# 1        -1.509
# 2        -1.360
# 3        -1.217
# 4        -1.079
# 5        -0.947