具有连续变量的有序逻辑回归 - 缩放
ordinal logistic regression with continuous variables - scaling
我一直在学习逻辑回归,并通过一个 R 数据分析示例找到了这篇很棒的文章。我已经根据自己的分析需要调整了其中的代码,到目前为止一切正常。
我的模型中有一个连续型预测变量。我用下面的命令得到了一张表,表中给出了将因变量逐个对每个预测变量单独做回归时所得到的(线性)预测值。但是,该命令似乎把连续变量转换成了分类变量。
> ## Ordinal logistic regression (OLR) ##
> # https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/
> mod_OLRfull <- polr(Percentage_f ~ Gender + SE_track + Total_testscore, data = mydata, Hess=TRUE)
> # calculate essential metrics
> ctable <- coef(summary(mod_OLRfull))
> p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
> ctable <- cbind(ctable, "p value" = p)
> # check if assumption holds: proportional odds
> sf <- function(y) {
+ c('Y>=1' = qlogis(mean(y >= 1)),
+ 'Y>=2' = qlogis(mean(y >= 2)),
+ 'Y>=3' = qlogis(mean(y >= 3)))#,
+ # 'Y>=4' = qlogis(mean(y >= 4)))
+ }
> s <- with(mydata, summary(as.numeric(Percentage_f) ~ Gender + SE_track + Total_testscore, fun=sf))
> s
as.numeric(Percentage_f) N= 286
+---------------+-------+---+----+---------+----------+
| | |N |Y>=1|Y>=2 |Y>=3 |
+---------------+-------+---+----+---------+----------+
|Gender |male | 97|Inf |1.2862109|-1.1685709|
| |female |189|Inf |1.5170646|-0.8397507|
+---------------+-------+---+----+---------+----------+
|SE_track |KSO | 39|Inf |1.0647107|-1.3545457|
| |TSO | 40|Inf |0.7308875|-1.7346011|
| |ASO |207|Inf |1.6990501|-0.7591051|
+---------------+-------+---+----+---------+----------+
|Total_testscore|[ 2, 8)| 74|Inf |0.8602013|-1.6422277|
| |[ 8,11)|104|Inf |1.6326948|-1.3156768|
| |[11,13)| 58|Inf |1.3437347|-0.5663955|
| |[13,16]| 50|Inf |2.4423470| 0.0000000|
+---------------+-------+---+----+---------+----------+
|Overall | |286|Inf |1.4350845|-0.9458495|
+---------------+-------+---+----+---------+----------+
> glm(I(as.numeric(Percentage_f) >= 2) ~ Gender + SE_track + Total_testscore, family = "binomial", data = mydata)
Call: glm(formula = I(as.numeric(Percentage_f) >= 2) ~ Gender + SE_track +
Total_testscore, family = "binomial", data = mydata)
> glm(I(as.numeric(Percentage_f) >= 3) ~ Gender + SE_track + Total_testscore, family = "binomial", data = mydata)
> glm(I(as.numeric(Percentage_f) >= 4) ~ Gender + SE_track + Total_testscore, family = "binomial", data = mydata)
> s[, 4] <- s[, 4] - s[, 3]
> s[, 3] <- s[, 3] - s[, 3]
> s
as.numeric(Percentage_f) N= 286
+---------------+-------+---+----+----+---------+
| | |N |Y>=1|Y>=2|Y>=3 |
+---------------+-------+---+----+----+---------+
|Gender |male | 97|Inf |0 |-2.454782|
| |female |189|Inf |0 |-2.356815|
+---------------+-------+---+----+----+---------+
|SE_track |KSO | 39|Inf |0 |-2.419256|
| |TSO | 40|Inf |0 |-2.465489|
| |ASO |207|Inf |0 |-2.458155|
+---------------+-------+---+----+----+---------+
|Total_testscore|[ 2, 8)| 74|Inf |0 |-2.502429|
| |[ 8,11)|104|Inf |0 |-2.948372|
| |[11,13)| 58|Inf |0 |-1.910130|
| |[13,16]| 50|Inf |0 |-2.442347|
+---------------+-------+---+----+----+---------+
|Overall | |286|Inf |0 |-2.380934|
+---------------+-------+---+----+----+---------+
问题:
我的变量 Total_testscore 目前被拆分为区间 [ 2, 8), [ 8,11), [11,13), [13,16],如何更改这种拆分方式?
我想将这些区间改为 [ 0, 5), [ 5,10), [10,13), [13,16]
您可以在运行模型之前,先在数据框中创建所需的区间。可能有更好的方法,但在看不到数据的情况下,下面这种方法应该可行。
# Build one 0/1 indicator column per requested score interval, then drop the
# original continuous column so that only the indicators remain in `mydata`.
library(dplyr)
mydata <- mydata %>%
  mutate(
    `Total_testscore_[0,5)` =
      as.numeric(Total_testscore >= 0 & Total_testscore < 5),
    `Total_testscore_[5,10)` =
      as.numeric(Total_testscore >= 5 & Total_testscore < 10),
    `Total_testscore_[10,13)` =
      as.numeric(Total_testscore >= 10 & Total_testscore < 13),
    # The last interval is closed on the right ([13,16]) so that a score of
    # exactly 16 is not left with all four indicators equal to 0.
    `Total_testscore_[13,16]` =
      as.numeric(Total_testscore >= 13 & Total_testscore <= 16)
  ) %>%
  select(-Total_testscore)
解决方案是在把连续变量用于回归之前,先用 cut() 按所需的分界点将其重新分段(分箱):
# Bin the continuous score at the requested cut points before using it in the
# proportional-odds check. The factor is stored in `mydata` because every
# later call in this snippet reads its variables from `mydata`.
# Note: cut() builds right-closed intervals by default, i.e. (0,5], (5,10],
# (10,13], (13,16]; pass `right = FALSE, include.lowest = TRUE` to obtain
# [0,5), [5,10), [10,13), [13,16] instead.
mydata$Total_testscore_f <- cut(
  mydata$Total_testscore,
  breaks = c(0, 5, 10, 13, 16)
)

# Apply `sf` (the helper from the question's code) to the numeric response
# within each level of each predictor.
# NOTE(review): the formula method of summary() is presumably Hmisc's, as in
# the UCLA example -- confirm that Hmisc is attached.
s <- with(
  mydata,
  summary(
    as.numeric(Percentage_f) ~ Gender + SE_track + Total_testscore_f,
    fun = sf
  )
)

# Binary logistic fits at each cut point of the response, for comparison.
glm(
  I(as.numeric(Percentage_f) >= 2) ~ Gender + SE_track + Total_testscore_f,
  family = "binomial", data = mydata
)
glm(
  I(as.numeric(Percentage_f) >= 3) ~ Gender + SE_track + Total_testscore_f,
  family = "binomial", data = mydata
)
glm(
  I(as.numeric(Percentage_f) >= 4) ~ Gender + SE_track + Total_testscore_f,
  family = "binomial", data = mydata
)

# Express column 4 relative to column 3, then zero column 3 itself. The order
# matters: column 4 must be updated before column 3 is overwritten.
s[, 4] <- s[, 4] - s[, 3]
s[, 3] <- s[, 3] - s[, 3]
s

# plot
par(mfrow = c(1, 1))
# Alternative x-axis limits: xlim = range(s[, 3:4])
plot(s, which = 1:3, pch = 1:3, xlab = "logit", main = " ", xlim = c(-3, 0))
# The Y>=3 values in the output below differ noticeably across groups,
# suggesting that the proportional odds assumption may not hold
as.numeric(Percentage_f) N= 286 , 2 Missing
+-----------------+-------+---+----+----+---------+
| | |N |Y>=1|Y>=2|Y>=3 |
+-----------------+-------+---+----+----+---------+
|Gender |male | 97|Inf |0 |-2.454782|
| |female |189|Inf |0 |-2.356815|
+-----------------+-------+---+----+----+---------+
|SE_track |KSO | 39|Inf |0 |-2.419256|
| |TSO | 40|Inf |0 |-2.465489|
| |ASO |207|Inf |0 |-2.458155|
+-----------------+-------+---+----+----+---------+
|Total_testscore_f|(0,5] | 25|Inf |0 |-1.912387|
| |(5,10] |153|Inf |0 |-2.956124|
| |(10,13]| 81|Inf |0 |-2.096264|
| |(13,16]| 27|Inf |0 |-2.151035|
+-----------------+-------+---+----+----+---------+
|Overall | |286|Inf |0 |-2.380934|
+-----------------+-------+---+----+----+---------+
我一直在学习逻辑回归,并通过一个 R 数据分析示例找到了这篇很棒的文章。我已经根据自己的分析需要调整了其中的代码,到目前为止一切正常。
我的模型中有一个连续型预测变量。我用下面的命令得到了一张表,表中给出了将因变量逐个对每个预测变量单独做回归时所得到的(线性)预测值。但是,该命令似乎把连续变量转换成了分类变量。
> ## Ordinal logistic regression (OLR) ##
> # https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/
> mod_OLRfull <- polr(Percentage_f ~ Gender + SE_track + Total_testscore, data = mydata, Hess=TRUE)
> # calculate essential metrics
> ctable <- coef(summary(mod_OLRfull))
> p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
> ctable <- cbind(ctable, "p value" = p)
> # check if assumption holds: proportional odds
> sf <- function(y) {
+ c('Y>=1' = qlogis(mean(y >= 1)),
+ 'Y>=2' = qlogis(mean(y >= 2)),
+ 'Y>=3' = qlogis(mean(y >= 3)))#,
+ # 'Y>=4' = qlogis(mean(y >= 4)))
+ }
> s <- with(mydata, summary(as.numeric(Percentage_f) ~ Gender + SE_track + Total_testscore, fun=sf))
> s
as.numeric(Percentage_f) N= 286
+---------------+-------+---+----+---------+----------+
| | |N |Y>=1|Y>=2 |Y>=3 |
+---------------+-------+---+----+---------+----------+
|Gender |male | 97|Inf |1.2862109|-1.1685709|
| |female |189|Inf |1.5170646|-0.8397507|
+---------------+-------+---+----+---------+----------+
|SE_track |KSO | 39|Inf |1.0647107|-1.3545457|
| |TSO | 40|Inf |0.7308875|-1.7346011|
| |ASO |207|Inf |1.6990501|-0.7591051|
+---------------+-------+---+----+---------+----------+
|Total_testscore|[ 2, 8)| 74|Inf |0.8602013|-1.6422277|
| |[ 8,11)|104|Inf |1.6326948|-1.3156768|
| |[11,13)| 58|Inf |1.3437347|-0.5663955|
| |[13,16]| 50|Inf |2.4423470| 0.0000000|
+---------------+-------+---+----+---------+----------+
|Overall | |286|Inf |1.4350845|-0.9458495|
+---------------+-------+---+----+---------+----------+
> glm(I(as.numeric(Percentage_f) >= 2) ~ Gender + SE_track + Total_testscore, family = "binomial", data = mydata)
Call: glm(formula = I(as.numeric(Percentage_f) >= 2) ~ Gender + SE_track +
Total_testscore, family = "binomial", data = mydata)
> glm(I(as.numeric(Percentage_f) >= 3) ~ Gender + SE_track + Total_testscore, family = "binomial", data = mydata)
> glm(I(as.numeric(Percentage_f) >= 4) ~ Gender + SE_track + Total_testscore, family = "binomial", data = mydata)
> s[, 4] <- s[, 4] - s[, 3]
> s[, 3] <- s[, 3] - s[, 3]
> s
as.numeric(Percentage_f) N= 286
+---------------+-------+---+----+----+---------+
| | |N |Y>=1|Y>=2|Y>=3 |
+---------------+-------+---+----+----+---------+
|Gender |male | 97|Inf |0 |-2.454782|
| |female |189|Inf |0 |-2.356815|
+---------------+-------+---+----+----+---------+
|SE_track |KSO | 39|Inf |0 |-2.419256|
| |TSO | 40|Inf |0 |-2.465489|
| |ASO |207|Inf |0 |-2.458155|
+---------------+-------+---+----+----+---------+
|Total_testscore|[ 2, 8)| 74|Inf |0 |-2.502429|
| |[ 8,11)|104|Inf |0 |-2.948372|
| |[11,13)| 58|Inf |0 |-1.910130|
| |[13,16]| 50|Inf |0 |-2.442347|
+---------------+-------+---+----+----+---------+
|Overall | |286|Inf |0 |-2.380934|
+---------------+-------+---+----+----+---------+
问题:
我的变量 Total_testscore 目前被拆分为区间 [ 2, 8), [ 8,11), [11,13), [13,16],如何更改这种拆分方式?
我想将这些区间改为 [ 0, 5), [ 5,10), [10,13), [13,16]
您可以在运行模型之前,先在数据框中创建所需的区间。可能有更好的方法,但在看不到数据的情况下,下面这种方法应该可行。
# Build one 0/1 indicator column per requested score interval, then drop the
# original continuous column so that only the indicators remain in `mydata`.
library(dplyr)
mydata <- mydata %>%
  mutate(
    `Total_testscore_[0,5)` =
      as.numeric(Total_testscore >= 0 & Total_testscore < 5),
    `Total_testscore_[5,10)` =
      as.numeric(Total_testscore >= 5 & Total_testscore < 10),
    `Total_testscore_[10,13)` =
      as.numeric(Total_testscore >= 10 & Total_testscore < 13),
    # The last interval is closed on the right ([13,16]) so that a score of
    # exactly 16 is not left with all four indicators equal to 0.
    `Total_testscore_[13,16]` =
      as.numeric(Total_testscore >= 13 & Total_testscore <= 16)
  ) %>%
  select(-Total_testscore)
解决方案是在把连续变量用于回归之前,先用 cut() 按所需的分界点将其重新分段(分箱):
# Bin the continuous score at the requested cut points before using it in the
# proportional-odds check. The factor is stored in `mydata` because every
# later call in this snippet reads its variables from `mydata`.
# Note: cut() builds right-closed intervals by default, i.e. (0,5], (5,10],
# (10,13], (13,16]; pass `right = FALSE, include.lowest = TRUE` to obtain
# [0,5), [5,10), [10,13), [13,16] instead.
mydata$Total_testscore_f <- cut(
  mydata$Total_testscore,
  breaks = c(0, 5, 10, 13, 16)
)

# Apply `sf` (the helper from the question's code) to the numeric response
# within each level of each predictor.
# NOTE(review): the formula method of summary() is presumably Hmisc's, as in
# the UCLA example -- confirm that Hmisc is attached.
s <- with(
  mydata,
  summary(
    as.numeric(Percentage_f) ~ Gender + SE_track + Total_testscore_f,
    fun = sf
  )
)

# Binary logistic fits at each cut point of the response, for comparison.
glm(
  I(as.numeric(Percentage_f) >= 2) ~ Gender + SE_track + Total_testscore_f,
  family = "binomial", data = mydata
)
glm(
  I(as.numeric(Percentage_f) >= 3) ~ Gender + SE_track + Total_testscore_f,
  family = "binomial", data = mydata
)
glm(
  I(as.numeric(Percentage_f) >= 4) ~ Gender + SE_track + Total_testscore_f,
  family = "binomial", data = mydata
)

# Express column 4 relative to column 3, then zero column 3 itself. The order
# matters: column 4 must be updated before column 3 is overwritten.
s[, 4] <- s[, 4] - s[, 3]
s[, 3] <- s[, 3] - s[, 3]
s

# plot
par(mfrow = c(1, 1))
# Alternative x-axis limits: xlim = range(s[, 3:4])
plot(s, which = 1:3, pch = 1:3, xlab = "logit", main = " ", xlim = c(-3, 0))
# The Y>=3 values in the output below differ noticeably across groups,
# suggesting that the proportional odds assumption may not hold
as.numeric(Percentage_f) N= 286 , 2 Missing
+-----------------+-------+---+----+----+---------+
| | |N |Y>=1|Y>=2|Y>=3 |
+-----------------+-------+---+----+----+---------+
|Gender |male | 97|Inf |0 |-2.454782|
| |female |189|Inf |0 |-2.356815|
+-----------------+-------+---+----+----+---------+
|SE_track |KSO | 39|Inf |0 |-2.419256|
| |TSO | 40|Inf |0 |-2.465489|
| |ASO |207|Inf |0 |-2.458155|
+-----------------+-------+---+----+----+---------+
|Total_testscore_f|(0,5] | 25|Inf |0 |-1.912387|
| |(5,10] |153|Inf |0 |-2.956124|
| |(10,13]| 81|Inf |0 |-2.096264|
| |(13,16]| 27|Inf |0 |-2.151035|
+-----------------+-------+---+----+----+---------+
|Overall | |286|Inf |0 |-2.380934|
+-----------------+-------+---+----+----+---------+