检查逻辑回归中的线性
Check linearity in logistic regression
为了检查逻辑回归中的线性度 ->
independent1
和 independent2
变量是否与 depdendent
的对数几率线性相关?
我想优化这个(工作)计算:
这是代码:
# Check Linearity ---------------------------------------------------------
# quartiles of independent1
quantile(df$independent1, probs=c(0, 0.25, 0.5, 0.75, 1))
table(df$dependent[df$independent1<52])
table(df$dependent[df$independent1>=52 & df$independent1 < 60])
table(df$dependent[df$independent1>=60 & df$independent1 < 73])
table(df$dependent[df$independent1>=73 & df$independent1 < 91])
p1 <- mean(df$dependent[df$independent1<52])
p2 <- mean(df$dependent[df$independent1>=52 & df$independent1 < 60])
p3 <- mean(df$dependent[df$independent1>=60 & df$independent1 < 73])
p4 <- mean(df$dependent[df$independent1>=73 & df$independent1 < 91])
probs <- c(p1, p2, p3, p4)
# calculate the log-odds
logits <- log(probs/(1-probs))
# quartiles of independent1
q <- quantile(df$independent1, probs=seq(0,1,0.25))
# calculate median independent1 for each of the 4 groups
meds <- c( median(df$independent1[ df$independent1<q[2]]),
median(df$independent1[ df$independent1>=q[2] & df$independent1<q[3]]),
median(df$independent1[ df$independent1>=q[3] & df$independent1<q[4]]),
median(df$independent1[ df$independent1>=q[4]])
)
plot(meds, logits, main="xxx",
xlab = "independent1",
ylab = "log-odds(dependent|independent1)", las=1)
对于一个变量,这可能没问题。但是我有更多的自变量。那么我如何为每个独立变量优化这段代码(检查和绘图)(在这个例子中 independent1
和 independent2
)
我的数据框:
df <- structure(list(dependent = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), independent1 = c(84,
49, 54, 75, 49, 70, 75, 42, 60, 72, 80, 73, 51, 61, 59, 78, 45,
38, 78, 65, 91, 60, 39, 31, 42, 72, 41, 77, 73, 74, 39, 86, 71,
55, 43, 75, 80, 75, 67, 74, 46, 70, 57, 66, 57, 72, 46, 52, 53,
76, 57, 86, 67, 71, 57, 50, 76, 61, 41, 57, 62, 41, 64, 82, 53,
75, 59, 38, 54, 56, 68, 63, 73, 26, 75, 76, 81, 46, 77, 53, 59,
66, 51, 72, 80, 70, 39, 57, 62, 85, 84, 57, 73, 55, 70, 78, 66,
69, 60, 51, 72, 68, 60, 62, 64, 44, 50, 59, 45, 81, 54, 68, 75,
66, 54, 45, 52, 87, 44, 77, 49, 84, 68, 76, 82, 44, 58, 55, 69,
33, 48, 62, 60, 76, 56, 73, 55, 58, 53, 53, 60, 52, 60, 41, 39,
36, 38, 59, 54, 64), independent2 = c(23, 25, 34, 25, 31, 25,
32, 19, 25, 28, 22, 18, 30, 26, 25, 25, 25, 19, 24, 27, 23, 28,
39, 27, 30, 28, 22, 28, 25, 23, 18, 27, 27, 19, 25, 27, 26, 26,
21, 26, 23, 28, 37, 32, 24, 32, 26, 23, 24, 27, 28, 25, 24, 22,
34, 23, 35, 20, 29, 29, 21, 29, 25, 26, 23, 33, 25, 26, 29, 27,
26, 28, 19, 22, 29, 22, 26, 35, 32, 29, 26, 23, 31, 30, 27, 28,
23, 27, 34, 22, 24, 28, 21, 25, 18, 32, 21, 24, 31, 31, 24, 30,
27, 23, 16, 26, 26, 19, 38, 21, 32, 34, 28, 19, 30, 24, 26, 24,
40, 26, 15, 26, 28, 22, 25, 26, 31, 24, 26, 42, 26, 30, 28, 21,
21, 19, 22, 20, 26, 31, 22, 25, 21, 20, 27, 27, 26, 29, 22, 24
)), row.names = c(NA, -150L), class = c("tbl_df", "tbl", "data.frame"
))
我将展示一种稍微不同但显然更有效的方法来拆分要在逻辑回归模型中使用的变量:
df$q41 <- with(df, cut(independent1, quantile(independent1), include = TRUE))
# creates 4 level factor into roughly equally sized groups
table(df$q41)
#--------------------
#[26,52] (52,60] (60,73] (73,91]
# 39 37 39 35
#Examine for "eyeball" trends in the log-odds of dependent
fit1.q41 <- glm(dependent~q41+0, data=df, fam="binomial")
fit1.q41
#---------------------------
Call: glm(formula = dependent ~ q41 + 0, family = "binomial", data = df)
Coefficients:
q41[26,52] q41(52,60] q41(60,73] q41(73,91]
-3.638 -2.862 -2.918 -2.048
Degrees of Freedom: 150 Total (i.e. Null); 146 Residual
Null Deviance: 207.9
Residual Deviance: 65.52 AIC: 73.52
我选择删除截距项,因为它的存在阻止了在与上面 3 个相同的尺度上查看最低组的系数。系数只是我创建的分组的对数。比较:
> logits
[1] -3.555348 -2.740840 -2.970414 -2.169054
> coef(fit1.q41)
q41[26,52] q41(52,60] q41(60,73] q41(73,91]
-3.637586 -2.862201 -2.917771 -2.047693
然后我尝试使该过程自动化,但 运行 遇到了一些问题,因为其中一个四分位数组中的事件数量很少,independent2 中最低四分位数的系数低得离谱,来自该类别中缺少任何事件或“1”。 (-19.566069 的估计对数几率确实指向 0 的比例。)
lapply( df[-1], function(x){cat(str(x)); IVq <- cut(x, quantile(x), include = TRUE); logits<-coef( summary(glm(df$dependent~IVq+0, fam="binomial"))); logits})
num [1:150] 84 49 54 75 49 70 75 42 60 72 ...
num [1:150] 23 25 34 25 31 25 32 19 25 28 ...
$independent1
Estimate Std. Error z value Pr(>|z|)
IVq[26,52] -3.637586 1.0130639 -3.590678 3.298191e-04
IVq(52,60] -2.862201 0.7270292 -3.936845 8.256004e-05
IVq(60,73] -2.917771 0.7259663 -4.019155 5.840732e-05
IVq(73,91] -2.047693 0.5312796 -3.854266 1.160776e-04
$independent2
Estimate Std. Error z value Pr(>|z|)
IVq[15,23] -19.566069 1639.9716035 -0.01193074 9.904809e-01
IVq(23,26] -3.091042 0.7229988 -4.27530783 1.908734e-05
IVq(26,28] -2.397895 0.7385489 -3.24676555 1.167245e-03
IVq(28,42] -1.856298 0.4808846 -3.86017349 1.133066e-04
> lapply( df[-1], function(x){ IVq <- cut(x, quantile(x), include = TRUE); table(IVq, df$dependent) })
$independent1
IVq 0 1
[26,52] 38 1
(52,60] 35 2
(60,73] 37 2
(73,91] 31 4
$independent2
IVq 0 1
[15,23] 43 0
(23,26] 44 2
(26,28] 22 2
(28,42] 32 5
无论如何,我认为我已经展示了一种更接近 R 的方法来计算四分位数内的对数。它还为您提供了一种模型比较方法,用于检查线性偏差以及演示可能的陷阱。如果您有更多事件,您可能会考虑通过在简单线性模型之上添加四分位数因子来查看空模型的偏差变化……或者更强大地使用 poly
来创建比较型号。
过去,在处理具有足够数量事件的数据集时,我选择根据从 event==1
子集计算的分位数进行拆分,而不是让拆分基于整个数据集。
为了检查逻辑回归中的线性度 ->
independent1
和 independent2
变量是否与 depdendent
的对数几率线性相关?
我想优化这个(工作)计算:
这是代码:
# Check Linearity ---------------------------------------------------------
# quartiles of independent1
quantile(df$independent1, probs=c(0, 0.25, 0.5, 0.75, 1))
table(df$dependent[df$independent1<52])
table(df$dependent[df$independent1>=52 & df$independent1 < 60])
table(df$dependent[df$independent1>=60 & df$independent1 < 73])
table(df$dependent[df$independent1>=73 & df$independent1 < 91])
p1 <- mean(df$dependent[df$independent1<52])
p2 <- mean(df$dependent[df$independent1>=52 & df$independent1 < 60])
p3 <- mean(df$dependent[df$independent1>=60 & df$independent1 < 73])
p4 <- mean(df$dependent[df$independent1>=73 & df$independent1 < 91])
probs <- c(p1, p2, p3, p4)
# calculate the log-odds
logits <- log(probs/(1-probs))
# quartiles of independent1
q <- quantile(df$independent1, probs=seq(0,1,0.25))
# calculate median independent1 for each of the 4 groups
meds <- c( median(df$independent1[ df$independent1<q[2]]),
median(df$independent1[ df$independent1>=q[2] & df$independent1<q[3]]),
median(df$independent1[ df$independent1>=q[3] & df$independent1<q[4]]),
median(df$independent1[ df$independent1>=q[4]])
)
plot(meds, logits, main="xxx",
xlab = "independent1",
ylab = "log-odds(dependent|independent1)", las=1)
对于一个变量,这可能没问题。但是我有更多的自变量。那么我如何为每个独立变量优化这段代码(检查和绘图)(在这个例子中 independent1
和 independent2
)
我的数据框:
df <- structure(list(dependent = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), independent1 = c(84,
49, 54, 75, 49, 70, 75, 42, 60, 72, 80, 73, 51, 61, 59, 78, 45,
38, 78, 65, 91, 60, 39, 31, 42, 72, 41, 77, 73, 74, 39, 86, 71,
55, 43, 75, 80, 75, 67, 74, 46, 70, 57, 66, 57, 72, 46, 52, 53,
76, 57, 86, 67, 71, 57, 50, 76, 61, 41, 57, 62, 41, 64, 82, 53,
75, 59, 38, 54, 56, 68, 63, 73, 26, 75, 76, 81, 46, 77, 53, 59,
66, 51, 72, 80, 70, 39, 57, 62, 85, 84, 57, 73, 55, 70, 78, 66,
69, 60, 51, 72, 68, 60, 62, 64, 44, 50, 59, 45, 81, 54, 68, 75,
66, 54, 45, 52, 87, 44, 77, 49, 84, 68, 76, 82, 44, 58, 55, 69,
33, 48, 62, 60, 76, 56, 73, 55, 58, 53, 53, 60, 52, 60, 41, 39,
36, 38, 59, 54, 64), independent2 = c(23, 25, 34, 25, 31, 25,
32, 19, 25, 28, 22, 18, 30, 26, 25, 25, 25, 19, 24, 27, 23, 28,
39, 27, 30, 28, 22, 28, 25, 23, 18, 27, 27, 19, 25, 27, 26, 26,
21, 26, 23, 28, 37, 32, 24, 32, 26, 23, 24, 27, 28, 25, 24, 22,
34, 23, 35, 20, 29, 29, 21, 29, 25, 26, 23, 33, 25, 26, 29, 27,
26, 28, 19, 22, 29, 22, 26, 35, 32, 29, 26, 23, 31, 30, 27, 28,
23, 27, 34, 22, 24, 28, 21, 25, 18, 32, 21, 24, 31, 31, 24, 30,
27, 23, 16, 26, 26, 19, 38, 21, 32, 34, 28, 19, 30, 24, 26, 24,
40, 26, 15, 26, 28, 22, 25, 26, 31, 24, 26, 42, 26, 30, 28, 21,
21, 19, 22, 20, 26, 31, 22, 25, 21, 20, 27, 27, 26, 29, 22, 24
)), row.names = c(NA, -150L), class = c("tbl_df", "tbl", "data.frame"
))
我将展示一种稍微不同但显然更有效的方法来拆分要在逻辑回归模型中使用的变量:
df$q41 <- with(df, cut(independent1, quantile(independent1), include = TRUE))
# creates 4 level factor into roughly equally sized groups
table(df$q41)
#--------------------
#[26,52] (52,60] (60,73] (73,91]
# 39 37 39 35
#Examine for "eyeball" trends in the log-odds of dependent
fit1.q41 <- glm(dependent~q41+0, data=df, fam="binomial")
fit1.q41
#---------------------------
Call: glm(formula = dependent ~ q41 + 0, family = "binomial", data = df)
Coefficients:
q41[26,52] q41(52,60] q41(60,73] q41(73,91]
-3.638 -2.862 -2.918 -2.048
Degrees of Freedom: 150 Total (i.e. Null); 146 Residual
Null Deviance: 207.9
Residual Deviance: 65.52 AIC: 73.52
我选择删除截距项,因为它的存在阻止了在与上面 3 个相同的尺度上查看最低组的系数。系数只是我创建的分组的对数。比较:
> logits
[1] -3.555348 -2.740840 -2.970414 -2.169054
> coef(fit1.q41)
q41[26,52] q41(52,60] q41(60,73] q41(73,91]
-3.637586 -2.862201 -2.917771 -2.047693
然后我尝试使该过程自动化,但 运行 遇到了一些问题,因为其中一个四分位数组中的事件数量很少,independent2 中最低四分位数的系数低得离谱,来自该类别中缺少任何事件或“1”。 (-19.566069 的估计对数几率确实指向 0 的比例。)
lapply( df[-1], function(x){cat(str(x)); IVq <- cut(x, quantile(x), include = TRUE); logits<-coef( summary(glm(df$dependent~IVq+0, fam="binomial"))); logits})
num [1:150] 84 49 54 75 49 70 75 42 60 72 ...
num [1:150] 23 25 34 25 31 25 32 19 25 28 ...
$independent1
Estimate Std. Error z value Pr(>|z|)
IVq[26,52] -3.637586 1.0130639 -3.590678 3.298191e-04
IVq(52,60] -2.862201 0.7270292 -3.936845 8.256004e-05
IVq(60,73] -2.917771 0.7259663 -4.019155 5.840732e-05
IVq(73,91] -2.047693 0.5312796 -3.854266 1.160776e-04
$independent2
Estimate Std. Error z value Pr(>|z|)
IVq[15,23] -19.566069 1639.9716035 -0.01193074 9.904809e-01
IVq(23,26] -3.091042 0.7229988 -4.27530783 1.908734e-05
IVq(26,28] -2.397895 0.7385489 -3.24676555 1.167245e-03
IVq(28,42] -1.856298 0.4808846 -3.86017349 1.133066e-04
> lapply( df[-1], function(x){ IVq <- cut(x, quantile(x), include = TRUE); table(IVq, df$dependent) })
$independent1
IVq 0 1
[26,52] 38 1
(52,60] 35 2
(60,73] 37 2
(73,91] 31 4
$independent2
IVq 0 1
[15,23] 43 0
(23,26] 44 2
(26,28] 22 2
(28,42] 32 5
无论如何,我认为我已经展示了一种更接近 R 的方法来计算四分位数内的对数。它还为您提供了一种模型比较方法,用于检查线性偏差以及演示可能的陷阱。如果您有更多事件,您可能会考虑通过在简单线性模型之上添加四分位数因子来查看空模型的偏差变化……或者更强大地使用 poly
来创建比较型号。
过去,在处理具有足够数量事件的数据集时,我选择根据从 event==1
子集计算的分位数进行拆分,而不是让拆分基于整个数据集。