检查逻辑回归中的线性

Check linearity in logistic regression

为了检查逻辑回归中的线性度 ->

independent1independent2 变量是否与 depdendent 的对数几率线性相关?

我想优化这个(工作)计算:

这是代码:

# Check Linearity  ---------------------------------------------------------

# quartiles of independent1
quantile(df$independent1, probs=c(0, 0.25, 0.5, 0.75, 1))


table(df$dependent[df$independent1<52])
table(df$dependent[df$independent1>=52 & df$independent1 < 60])
table(df$dependent[df$independent1>=60 & df$independent1 < 73])
table(df$dependent[df$independent1>=73 & df$independent1 < 91])

p1 <- mean(df$dependent[df$independent1<52])
p2 <- mean(df$dependent[df$independent1>=52 & df$independent1 < 60])
p3 <- mean(df$dependent[df$independent1>=60 & df$independent1 < 73])
p4 <- mean(df$dependent[df$independent1>=73 & df$independent1 < 91])
probs <- c(p1, p2, p3, p4)

# calculate the log-odds
logits <- log(probs/(1-probs))

# quartiles of independent1
q <- quantile(df$independent1, probs=seq(0,1,0.25))

# calculate median independent1 for each of the 4 groups
meds <- c( median(df$independent1[ df$independent1<q[2]]),
           median(df$independent1[ df$independent1>=q[2] & df$independent1<q[3]]),
           median(df$independent1[ df$independent1>=q[3] & df$independent1<q[4]]),
           median(df$independent1[ df$independent1>=q[4]])
)

plot(meds, logits, main="xxx",
     xlab = "independent1",
     ylab = "log-odds(dependent|independent1)", las=1)

对于一个变量,这可能没问题。但是我有更多的自变量。那么我如何为每个独立变量优化这段代码(检查和绘图)(在这个例子中 independent1independent2

我的数据框:

df <- structure(list(dependent = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), independent1 = c(84, 
49, 54, 75, 49, 70, 75, 42, 60, 72, 80, 73, 51, 61, 59, 78, 45, 
38, 78, 65, 91, 60, 39, 31, 42, 72, 41, 77, 73, 74, 39, 86, 71, 
55, 43, 75, 80, 75, 67, 74, 46, 70, 57, 66, 57, 72, 46, 52, 53, 
76, 57, 86, 67, 71, 57, 50, 76, 61, 41, 57, 62, 41, 64, 82, 53, 
75, 59, 38, 54, 56, 68, 63, 73, 26, 75, 76, 81, 46, 77, 53, 59, 
66, 51, 72, 80, 70, 39, 57, 62, 85, 84, 57, 73, 55, 70, 78, 66, 
69, 60, 51, 72, 68, 60, 62, 64, 44, 50, 59, 45, 81, 54, 68, 75, 
66, 54, 45, 52, 87, 44, 77, 49, 84, 68, 76, 82, 44, 58, 55, 69, 
33, 48, 62, 60, 76, 56, 73, 55, 58, 53, 53, 60, 52, 60, 41, 39, 
36, 38, 59, 54, 64), independent2 = c(23, 25, 34, 25, 31, 25, 
32, 19, 25, 28, 22, 18, 30, 26, 25, 25, 25, 19, 24, 27, 23, 28, 
39, 27, 30, 28, 22, 28, 25, 23, 18, 27, 27, 19, 25, 27, 26, 26, 
21, 26, 23, 28, 37, 32, 24, 32, 26, 23, 24, 27, 28, 25, 24, 22, 
34, 23, 35, 20, 29, 29, 21, 29, 25, 26, 23, 33, 25, 26, 29, 27, 
26, 28, 19, 22, 29, 22, 26, 35, 32, 29, 26, 23, 31, 30, 27, 28, 
23, 27, 34, 22, 24, 28, 21, 25, 18, 32, 21, 24, 31, 31, 24, 30, 
27, 23, 16, 26, 26, 19, 38, 21, 32, 34, 28, 19, 30, 24, 26, 24, 
40, 26, 15, 26, 28, 22, 25, 26, 31, 24, 26, 42, 26, 30, 28, 21, 
21, 19, 22, 20, 26, 31, 22, 25, 21, 20, 27, 27, 26, 29, 22, 24
)), row.names = c(NA, -150L), class = c("tbl_df", "tbl", "data.frame"
))

我将展示一种稍微不同但显然更有效的方法来拆分要在逻辑回归模型中使用的变量:

df$q41 <- with(df, cut(independent1, quantile(independent1), include = TRUE))
 # creates 4 level factor into roughly equally sized groups
 table(df$q41)
#--------------------
#[26,52] (52,60] (60,73] (73,91] 
#     39      37      39      35 

#Examine for "eyeball" trends in the log-odds of dependent 
fit1.q41 <- glm(dependent~q41+0, data=df, fam="binomial")
fit1.q41
#---------------------------
Call:  glm(formula = dependent ~ q41 + 0, family = "binomial", data = df)

Coefficients:
q41[26,52]  q41(52,60]  q41(60,73]  q41(73,91]  
    -3.638      -2.862      -2.918      -2.048  

Degrees of Freedom: 150 Total (i.e. Null);  146 Residual
Null Deviance:      207.9 
Residual Deviance: 65.52    AIC: 73.52

我选择删除截距项,因为它的存在阻止了在与上面 3 个相同的尺度上查看最低组的系数。系数只是我创建的分组的对数。比较:

> logits
[1] -3.555348 -2.740840 -2.970414 -2.169054
> coef(fit1.q41)
q41[26,52] q41(52,60] q41(60,73] q41(73,91] 
 -3.637586  -2.862201  -2.917771  -2.047693 

然后我尝试使该过程自动化,但 运行 遇到了一些问题,因为其中一个四分位数组中的事件数量很少,independent2 中最低四分位数的系数低得离谱,来自该类别中缺少任何事件或“1”。 (-19.566069 的估计对数几率确实指向 0 的比例。)

 lapply( df[-1], function(x){cat(str(x)); IVq <- cut(x, quantile(x), include = TRUE); logits<-coef( summary(glm(df$dependent~IVq+0, fam="binomial"))); logits})
 num [1:150] 84 49 54 75 49 70 75 42 60 72 ...
 num [1:150] 23 25 34 25 31 25 32 19 25 28 ...
$independent1
            Estimate Std. Error   z value     Pr(>|z|)
IVq[26,52] -3.637586  1.0130639 -3.590678 3.298191e-04
IVq(52,60] -2.862201  0.7270292 -3.936845 8.256004e-05
IVq(60,73] -2.917771  0.7259663 -4.019155 5.840732e-05
IVq(73,91] -2.047693  0.5312796 -3.854266 1.160776e-04

$independent2
             Estimate   Std. Error     z value     Pr(>|z|)
IVq[15,23] -19.566069 1639.9716035 -0.01193074 9.904809e-01
IVq(23,26]  -3.091042    0.7229988 -4.27530783 1.908734e-05
IVq(26,28]  -2.397895    0.7385489 -3.24676555 1.167245e-03
IVq(28,42]  -1.856298    0.4808846 -3.86017349 1.133066e-04


> lapply( df[-1], function(x){ IVq <- cut(x, quantile(x), include = TRUE); table(IVq, df$dependent) })
$independent1
         
IVq        0  1
  [26,52] 38  1
  (52,60] 35  2
  (60,73] 37  2
  (73,91] 31  4

$independent2
         
IVq        0  1
  [15,23] 43  0
  (23,26] 44  2
  (26,28] 22  2
  (28,42] 32  5

无论如何,我认为我已经展示了一种更接近 R 的方法来计算四分位数内的对数。它还为您提供了一种模型比较方法,用于检查线性偏差以及演示可能的陷阱。如果您有更多事件,您可能会考虑通过在简单线性模型之上添加四分位数因子来查看空模型的偏差变化……或者更强大地使用 poly 来创建比较型号。

过去,在处理具有足够数量事件的数据集时,我选择根据从 event==1 子集计算的分位数进行拆分,而不是让拆分基于整个数据集。