如何使用插入符号和 C5.0Cost 更改 R 中的成本矩阵?
How to change the cost matrix in R with caret and C5.0Cost?
我目前正在 R 中试验插入符和 C5.0Cost。到目前为止,我有一个运行良好的基本模型。但是调整参数让我有些头疼。
我似乎无法更改误报的成本。
library(mlbench)
data(Sonar)
library(caret)
set.seed(990)
inTraining <- createDataPartition(Sonar$Class, p = .5, list = FALSE)
inTraining
training <- Sonar[inTraining,]
test <- Sonar[-inTraining,]
set.seed(990)
fitControl <- trainControl(method="repeatedcv", number=10, repeats=5)
statGrid <- expand.grid(trials = 1,
model = "tree",
winnow = FALSE,
cost = matrix(c(
0, 2,
1, 0
), 2, 2, byrow=TRUE))
set.seed(825)
statFit <- train(Class~., data=training, method="C5.0Cost", trControl=fitControl, tuneGrid = statGrid, metric = "Accuracy")
statFit["finalModel"]
write(capture.output(summary(statFit)), "c50model.txt")
R version 3.2.1 (2015-06-18)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 8 x64 (build 9200)
locale:
[1] LC_COLLATE=German_Germany.1252 LC_CTYPE=German_Germany.1252 LC_MONETARY=German_Germany.1252
[4] LC_NUMERIC=C LC_TIME=German_Germany.1252
attached base packages:
[1] grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] DMwR_0.4.1 plyr_1.8.3 C50_0.1.0-24 caret_6.0-52 ggplot2_1.0.1 lattice_0.20-31
[7] mlbench_2.1-1
loaded via a namespace (and not attached):
[1] Rcpp_0.11.6 compiler_3.2.1 nloptr_1.0.4 bitops_1.0-6
[5] xts_0.9-7 class_7.3-12 iterators_1.0.7 tools_3.2.1
[9] rpart_4.1-9 partykit_1.0-3 digest_0.6.8 lme4_1.1-8
[13] nlme_3.1-120 gtable_0.1.2 mgcv_1.8-6 Matrix_1.2-1
[17] foreach_1.4.2 parallel_3.2.1 brglm_0.5-9 SparseM_1.6
[21] proto_0.3-10 e1071_1.6-7 BradleyTerry2_1.0-6 stringr_1.0.0
[25] caTools_1.17.1 gtools_3.5.0 stats4_3.2.1 nnet_7.3-9
[29] survival_2.38-1 gdata_2.17.0 minqa_1.2.4 ROCR_1.0-7
[33] TTR_0.23-0 reshape2_1.4.1 car_2.0-26 magrittr_1.5
[37] gplots_2.17.0 scales_0.2.5 codetools_0.2-11 MASS_7.3-40
[41] splines_3.2.1 quantmod_0.4-5 abind_1.4-3 pbkrtest_0.4-2
[45] colorspace_1.2-6 quantreg_5.11 KernSmooth_2.23-14 stringi_0.5-5
[49] munsell_0.4.2 zoo_1.7-12
插入符号 (?) 接受的唯一更改是对漏报的更改(上例中设置为两个的漏报)。不幸的是,所有其他更改都被忽略了。可以通过在 R 控制台中键入 statFit["finalModel"]
轻松确认这一点。
C5.0 documentation 的第 3 页提供了有关实施成本矩阵的详细信息。从文档中,您可以看到 "diagonal elements [of the cost matrix] are ignored"
@JimBoy 我 运行 遇到了和你一样的问题。我查看了 github 上 "C5.0Cost" 的插入符号包装器的源代码,您可以看到矩阵的左上角在代码中设置为 1(请参阅 cmat 对象)。
我修改了 modelInfo 中的成本输入,以便您可以将成本添加到 flase positives 和 negatives。您现在不再包含一个成本参数,而是在 grid.expand 误报 (costFP) 和漏报 (costFN) 中指定两个,它们是您要评估的成本向量。
modelInfo <- list(label = "Cost-Sensitive C5.0",
library = c("C50", "plyr"),
loop = function(grid) {
loop <- ddply(grid, c("model", "winnow", "costFP","costFN"),
function(x) c(trials = max(x$trials)))
submodels <- vector(mode = "list", length = nrow(loop))
for(i in seq(along = loop$trials))
{
index <- which(grid$model == loop$model[i] &
grid$winnow == loop$winnow[i],
grid$costFP[i] == loop$costFP[i],
grid$costFN[i] == loop$costFN[i])
trials <- grid[index, "trials"]
submodels[[i]] <- data.frame(trials = trials[trials != loop$trials[i]])
}
list(loop = loop, submodels = submodels)
},
type = "Classification",
parameters = data.frame(parameter = c('trials', 'model', 'winnow', "costFP","costFN"),
class = c("numeric", "character", "logical", "numeric","numeric"),
label = c('# Boosting Iterations', 'Model Type', 'Winnow', "CostFP","CostFN")),
grid = function(x, y, len = NULL, search = "grid") {
c5seq <- if(len == 1) 1 else c(1, 10*((2:min(len, 11)) - 1))
expand.grid(trials = c5seq, model = c("tree", "rules"),
winnow = c(TRUE, FALSE),
costFP = 1:2,
costFN = 1:2)
if(search == "grid") {
c5seq <- if(len == 1) 1 else c(1, 10*((2:min(len, 11)) - 1))
out <- expand.grid(trials = c5seq, model = c("tree", "rules"),
winnow = c(TRUE, FALSE), costFP = 1:2, costFN = 1:2)
} else {
out <- data.frame(trials = sample(1:100, replace = TRUE, size = len),
model = sample(c("tree", "rules"), replace = TRUE, size = len),
winnow = sample(c(TRUE, FALSE), replace = TRUE, size = len),
costFP = runif(len, min = 1, max = 20),
costFN = runif(len, min = 1, max = 20))
}
out
},
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
theDots <- list(...)
if(any(names(theDots) == "control"))
{
theDots$control$winnow <- param$winnow
} else theDots$control <- C5.0Control(winnow = param$winnow)
argList <- list(x = x, y = y, weights = wts, trials = param$trials,
rules = param$model == "rules")
cmat <-matrix(c(0, param$costFP, param$costFN, 0), ncol = 2)
rownames(cmat) <- colnames(cmat) <- levels(y)
if(any(names(theDots) == "costFP")){
warning("For 'C5.0Cost', the costs are a tuning parameter")
theDots$costs <- cmat
} else argList$costs <- cmat
argList <- c(argList, theDots)
do.call("C5.0.default", argList)
},
predict = function(modelFit, newdata, submodels = NULL) {
out <- predict(modelFit, newdata)
if(!is.null(submodels))
{
tmp <- out
out <- vector(mode = "list", length = nrow(submodels) + 1)
out[[1]] <- tmp
for(j in seq(along = submodels$trials))
out[[j+1]] <- as.character(predict(modelFit, newdata, trial = submodels$trials[j]))
}
out
},
prob = NULL,
predictors = function(x, ...) {
vars <- C5imp(x, metric = "splits")
rownames(vars)[vars$Overall > 0]
},
levels = function(x) x$obsLevels,
varImp = function(object, ...) C5imp(object, ...),
tags = c("Tree-Based Model", "Rule-Based Model", "Implicit Feature Selection",
"Boosting", "Ensemble Model", "Cost Sensitive Learning", "Two Class Only",
"Handle Missing Predictor Data", "Accepts Case Weights"),
sort = function(x){
x$model <- factor(as.character(x$model), levels = c("rules", "tree"))
x[order(x$trials, x$model, !x$winnow, x$costFP,x$costFN),]
},
trim = function(x) {
x$boostResults <- NULL
x$size <- NULL
x$call <- NULL
x$output <- NULL
x
})
您上面提供的例子可以运行如下,
## Example provided
library(mlbench)
data(Sonar)
library(caret)
set.seed(990)
inTraining <- createDataPartition(Sonar$Class, p = .5, list = FALSE)
inTraining
training <- Sonar[inTraining,]
test <- Sonar[-inTraining,]
set.seed(990)
fitControl <- trainControl(method="repeatedcv", number=10, repeats=5)
statGrid <- expand.grid(trials = 3,
model = "tree",
winnow = FALSE,
cost = 2)
set.seed(825)
statFit <- train(Class~., data=training, method="C5.0Cost", trControl=fitControl, tuneGrid = statGrid, metric = "Accuracy")
## Example modified to include costs for both false positives and negatives
set.seed(825)
statGridMod <- expand.grid(trials = 3,
model = "tree",
winnow = FALSE,
costFP = c(1,2,3), #new cost parameters
costFN = c(3,2,1)) #new cost parameters
statFit <- train(Class~., data=training, method=modelInfo, trControl=fitControl, tuneGrid = statGridMod, metric = "Accuracy")
statFit
我目前正在 R 中试验插入符和 C5.0Cost。到目前为止,我有一个运行良好的基本模型。但是调整参数让我有些头疼。
我似乎无法更改误报的成本。
library(mlbench)
data(Sonar)
library(caret)
set.seed(990)
inTraining <- createDataPartition(Sonar$Class, p = .5, list = FALSE)
inTraining
training <- Sonar[inTraining,]
test <- Sonar[-inTraining,]
set.seed(990)
fitControl <- trainControl(method="repeatedcv", number=10, repeats=5)
statGrid <- expand.grid(trials = 1,
model = "tree",
winnow = FALSE,
cost = matrix(c(
0, 2,
1, 0
), 2, 2, byrow=TRUE))
set.seed(825)
statFit <- train(Class~., data=training, method="C5.0Cost", trControl=fitControl, tuneGrid = statGrid, metric = "Accuracy")
statFit["finalModel"]
write(capture.output(summary(statFit)), "c50model.txt")
R version 3.2.1 (2015-06-18) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows 8 x64 (build 9200)
locale: [1] LC_COLLATE=German_Germany.1252 LC_CTYPE=German_Germany.1252 LC_MONETARY=German_Germany.1252 [4] LC_NUMERIC=C LC_TIME=German_Germany.1252
attached base packages: [1] grid stats graphics grDevices utils datasets methods base
other attached packages: [1] DMwR_0.4.1 plyr_1.8.3 C50_0.1.0-24 caret_6.0-52 ggplot2_1.0.1 lattice_0.20-31 [7] mlbench_2.1-1
loaded via a namespace (and not attached): [1] Rcpp_0.11.6 compiler_3.2.1 nloptr_1.0.4 bitops_1.0-6
[5] xts_0.9-7 class_7.3-12 iterators_1.0.7 tools_3.2.1
[9] rpart_4.1-9 partykit_1.0-3 digest_0.6.8 lme4_1.1-8
[13] nlme_3.1-120 gtable_0.1.2 mgcv_1.8-6 Matrix_1.2-1
[17] foreach_1.4.2 parallel_3.2.1 brglm_0.5-9 SparseM_1.6
[21] proto_0.3-10 e1071_1.6-7 BradleyTerry2_1.0-6 stringr_1.0.0
[25] caTools_1.17.1 gtools_3.5.0 stats4_3.2.1 nnet_7.3-9
[29] survival_2.38-1 gdata_2.17.0 minqa_1.2.4 ROCR_1.0-7
[33] TTR_0.23-0 reshape2_1.4.1 car_2.0-26 magrittr_1.5
[37] gplots_2.17.0 scales_0.2.5 codetools_0.2-11 MASS_7.3-40
[41] splines_3.2.1 quantmod_0.4-5 abind_1.4-3 pbkrtest_0.4-2
[45] colorspace_1.2-6 quantreg_5.11 KernSmooth_2.23-14 stringi_0.5-5
[49] munsell_0.4.2 zoo_1.7-12
插入符号 (?) 接受的唯一更改是对漏报的更改(上例中设置为两个的漏报)。不幸的是,所有其他更改都被忽略了。可以通过在 R 控制台中键入 statFit["finalModel"]
轻松确认这一点。
C5.0 documentation 的第 3 页提供了有关实施成本矩阵的详细信息。从文档中,您可以看到 "diagonal elements [of the cost matrix] are ignored"
@JimBoy 我 运行 遇到了和你一样的问题。我查看了 github 上 "C5.0Cost" 的插入符号包装器的源代码,您可以看到矩阵的左上角在代码中设置为 1(请参阅 cmat 对象)。
我修改了 modelInfo 中的成本输入,以便您可以将成本添加到 flase positives 和 negatives。您现在不再包含一个成本参数,而是在 grid.expand 误报 (costFP) 和漏报 (costFN) 中指定两个,它们是您要评估的成本向量。
modelInfo <- list(label = "Cost-Sensitive C5.0",
library = c("C50", "plyr"),
loop = function(grid) {
loop <- ddply(grid, c("model", "winnow", "costFP","costFN"),
function(x) c(trials = max(x$trials)))
submodels <- vector(mode = "list", length = nrow(loop))
for(i in seq(along = loop$trials))
{
index <- which(grid$model == loop$model[i] &
grid$winnow == loop$winnow[i],
grid$costFP[i] == loop$costFP[i],
grid$costFN[i] == loop$costFN[i])
trials <- grid[index, "trials"]
submodels[[i]] <- data.frame(trials = trials[trials != loop$trials[i]])
}
list(loop = loop, submodels = submodels)
},
type = "Classification",
parameters = data.frame(parameter = c('trials', 'model', 'winnow', "costFP","costFN"),
class = c("numeric", "character", "logical", "numeric","numeric"),
label = c('# Boosting Iterations', 'Model Type', 'Winnow', "CostFP","CostFN")),
grid = function(x, y, len = NULL, search = "grid") {
c5seq <- if(len == 1) 1 else c(1, 10*((2:min(len, 11)) - 1))
expand.grid(trials = c5seq, model = c("tree", "rules"),
winnow = c(TRUE, FALSE),
costFP = 1:2,
costFN = 1:2)
if(search == "grid") {
c5seq <- if(len == 1) 1 else c(1, 10*((2:min(len, 11)) - 1))
out <- expand.grid(trials = c5seq, model = c("tree", "rules"),
winnow = c(TRUE, FALSE), costFP = 1:2, costFN = 1:2)
} else {
out <- data.frame(trials = sample(1:100, replace = TRUE, size = len),
model = sample(c("tree", "rules"), replace = TRUE, size = len),
winnow = sample(c(TRUE, FALSE), replace = TRUE, size = len),
costFP = runif(len, min = 1, max = 20),
costFN = runif(len, min = 1, max = 20))
}
out
},
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
theDots <- list(...)
if(any(names(theDots) == "control"))
{
theDots$control$winnow <- param$winnow
} else theDots$control <- C5.0Control(winnow = param$winnow)
argList <- list(x = x, y = y, weights = wts, trials = param$trials,
rules = param$model == "rules")
cmat <-matrix(c(0, param$costFP, param$costFN, 0), ncol = 2)
rownames(cmat) <- colnames(cmat) <- levels(y)
if(any(names(theDots) == "costFP")){
warning("For 'C5.0Cost', the costs are a tuning parameter")
theDots$costs <- cmat
} else argList$costs <- cmat
argList <- c(argList, theDots)
do.call("C5.0.default", argList)
},
predict = function(modelFit, newdata, submodels = NULL) {
out <- predict(modelFit, newdata)
if(!is.null(submodels))
{
tmp <- out
out <- vector(mode = "list", length = nrow(submodels) + 1)
out[[1]] <- tmp
for(j in seq(along = submodels$trials))
out[[j+1]] <- as.character(predict(modelFit, newdata, trial = submodels$trials[j]))
}
out
},
prob = NULL,
predictors = function(x, ...) {
vars <- C5imp(x, metric = "splits")
rownames(vars)[vars$Overall > 0]
},
levels = function(x) x$obsLevels,
varImp = function(object, ...) C5imp(object, ...),
tags = c("Tree-Based Model", "Rule-Based Model", "Implicit Feature Selection",
"Boosting", "Ensemble Model", "Cost Sensitive Learning", "Two Class Only",
"Handle Missing Predictor Data", "Accepts Case Weights"),
sort = function(x){
x$model <- factor(as.character(x$model), levels = c("rules", "tree"))
x[order(x$trials, x$model, !x$winnow, x$costFP,x$costFN),]
},
trim = function(x) {
x$boostResults <- NULL
x$size <- NULL
x$call <- NULL
x$output <- NULL
x
})
您上面提供的例子可以运行如下,
## Example provided
library(mlbench)
data(Sonar)
library(caret)
set.seed(990)
inTraining <- createDataPartition(Sonar$Class, p = .5, list = FALSE)
inTraining
training <- Sonar[inTraining,]
test <- Sonar[-inTraining,]
set.seed(990)
fitControl <- trainControl(method="repeatedcv", number=10, repeats=5)
statGrid <- expand.grid(trials = 3,
model = "tree",
winnow = FALSE,
cost = 2)
set.seed(825)
statFit <- train(Class~., data=training, method="C5.0Cost", trControl=fitControl, tuneGrid = statGrid, metric = "Accuracy")
## Example modified to include costs for both false positives and negatives
set.seed(825)
statGridMod <- expand.grid(trials = 3,
model = "tree",
winnow = FALSE,
costFP = c(1,2,3), #new cost parameters
costFN = c(3,2,1)) #new cost parameters
statFit <- train(Class~., data=training, method=modelInfo, trControl=fitControl, tuneGrid = statGridMod, metric = "Accuracy")
statFit