caret::calibration 在公式中包含多个模型列时会重复计数数据
Caret::calibration duplicating data when including multiple model columns in the formula
不确定这是一个错误还是我的理解有缺陷,但是当我运行下面的例子时:
library(caret)
data(mdrr)
mdrrDescr <- mdrrDescr[, -nearZeroVar(mdrrDescr)]
mdrrDescr <- mdrrDescr[, -findCorrelation(cor(mdrrDescr), .5)]
preProc <- preProcess(mdrrDescr, c("center", "scale"))
mdrrDescr <- predict(preProc, mdrrDescr)
inTrain <- createDataPartition(mdrrClass)
trainX <- mdrrDescr[inTrain[[1]], ]
trainY <- mdrrClass[inTrain[[1]]]
testX <- mdrrDescr[-inTrain[[1]], ]
testY <- mdrrClass[-inTrain[[1]]]
library(MASS)
ldaFit <- lda(trainX, trainY)
qdaFit <- qda(trainX, trainY)
testProbs <- data.frame(obs = testY,
lda = predict(ldaFit, testX)$posterior[,1],
qda = predict(qdaFit, testX)$posterior[,1])
calPlotData <- caret::calibration(obs ~ lda + qda, data = testProbs, cuts = 5)
> calPlotData$data
我得到这个结果:
# out:
calibModelVar bin Percent Lower Upper Count midpoint
1 lda [0,0.2] 6.521739 2.430775 13.65621 6 10
2 lda (0.2,0.4] 30.232558 20.789989 41.08301 26 30
3 lda (0.4,0.6] 59.375000 46.367688 71.48530 38 50
4 lda (0.6,0.8] 70.909091 61.481025 79.17690 78 70
5 lda (0.8,1] 85.227273 79.108431 90.11742 150 90
6 qda [0,0.2] 28.099174 22.529270 34.21445 68 10
7 qda (0.2,0.4] 40.000000 12.155226 73.76219 4 30
8 qda (0.4,0.6] 33.333333 9.924609 65.11245 4 50
9 qda (0.6,0.8] 80.000000 56.338600 94.26660 16 70
10 qda (0.8,1] 84.426230 79.256188 88.73729 206 90
但是,当我进一步检查时,发现这些结果中的数据被重复计数了:上表中的 Count 恰好是实际值的两倍。例如,对 [0,0.2] 这一区间直接统计,lda 和 qda 得到的分别是 3 和 34,而不是上表中的 6 和 68:
>table(testProbs$obs == "Active" & testProbs$lda <= 0.2)
# out:
FALSE TRUE
261 3
>table(testProbs$obs == "Active" & testProbs$qda <= 0.2)
# out:
FALSE TRUE
230 34
这也会影响误差估计(表中的 Lower 和 Upper 两列):由于计数翻倍,上下限区间被错误地缩窄了。例如,当我只用一个模型列运行 calibration() 函数时,Count 是正确的,区间也更宽:
> calPlotData <- caret::calibration(obs ~ lda, data = testProbs, cuts = 5)
> calPlotData$data
# out:
calibModelVar bin Percent Lower Upper Count midpoint
1 lda [0,0.2] 6.521739 1.365677 17.89644 3 10
2 lda (0.2,0.4] 30.232558 17.182499 46.12533 13 30
3 lda (0.4,0.6] 59.375000 40.644925 76.30159 19 50
4 lda (0.6,0.8] 70.909091 57.101742 82.37003 39 70
5 lda (0.8,1] 85.227273 76.063784 91.89296 75 90
这确实是一个错误,已在此 PR 中修复。
不确定这是一个错误还是我的理解有缺陷,但是当我运行下面的例子时:
library(caret)
data(mdrr)
mdrrDescr <- mdrrDescr[, -nearZeroVar(mdrrDescr)]
mdrrDescr <- mdrrDescr[, -findCorrelation(cor(mdrrDescr), .5)]
preProc <- preProcess(mdrrDescr, c("center", "scale"))
mdrrDescr <- predict(preProc, mdrrDescr)
inTrain <- createDataPartition(mdrrClass)
trainX <- mdrrDescr[inTrain[[1]], ]
trainY <- mdrrClass[inTrain[[1]]]
testX <- mdrrDescr[-inTrain[[1]], ]
testY <- mdrrClass[-inTrain[[1]]]
library(MASS)
ldaFit <- lda(trainX, trainY)
qdaFit <- qda(trainX, trainY)
testProbs <- data.frame(obs = testY,
lda = predict(ldaFit, testX)$posterior[,1],
qda = predict(qdaFit, testX)$posterior[,1])
calPlotData <- caret::calibration(obs ~ lda + qda, data = testProbs, cuts = 5)
> calPlotData$data
我得到这个结果:
# out:
calibModelVar bin Percent Lower Upper Count midpoint
1 lda [0,0.2] 6.521739 2.430775 13.65621 6 10
2 lda (0.2,0.4] 30.232558 20.789989 41.08301 26 30
3 lda (0.4,0.6] 59.375000 46.367688 71.48530 38 50
4 lda (0.6,0.8] 70.909091 61.481025 79.17690 78 70
5 lda (0.8,1] 85.227273 79.108431 90.11742 150 90
6 qda [0,0.2] 28.099174 22.529270 34.21445 68 10
7 qda (0.2,0.4] 40.000000 12.155226 73.76219 4 30
8 qda (0.4,0.6] 33.333333 9.924609 65.11245 4 50
9 qda (0.6,0.8] 80.000000 56.338600 94.26660 16 70
10 qda (0.8,1] 84.426230 79.256188 88.73729 206 90
但是,当我进一步检查时,发现这些结果中的数据被重复计数了:上表中的 Count 恰好是实际值的两倍。例如,对 [0,0.2] 这一区间直接统计,lda 和 qda 得到的分别是 3 和 34,而不是上表中的 6 和 68:
>table(testProbs$obs == "Active" & testProbs$lda <= 0.2)
# out:
FALSE TRUE
261 3
>table(testProbs$obs == "Active" & testProbs$qda <= 0.2)
# out:
FALSE TRUE
230 34
这也会影响误差估计(表中的 Lower 和 Upper 两列):由于计数翻倍,上下限区间被错误地缩窄了。例如,当我只用一个模型列运行 calibration() 函数时,Count 是正确的,区间也更宽:
> calPlotData <- caret::calibration(obs ~ lda, data = testProbs, cuts = 5)
> calPlotData$data
# out:
calibModelVar bin Percent Lower Upper Count midpoint
1 lda [0,0.2] 6.521739 1.365677 17.89644 3 10
2 lda (0.2,0.4] 30.232558 17.182499 46.12533 13 30
3 lda (0.4,0.6] 59.375000 40.644925 76.30159 19 50
4 lda (0.6,0.8] 70.909091 57.101742 82.37003 39 70
5 lda (0.8,1] 85.227273 76.063784 91.89296 75 90
这确实是一个错误,已在此 PR 中修复。