如何使用插入符号和 C5.0Cost 更改 R 中的成本矩阵?

How to change the cost matrix in R with caret and C5.0Cost?

我目前正在 R 中试验插入符和 C5.0Cost。到目前为止,我有一个运行良好的基本模型。但是调整参数让我有些头疼。

我似乎无法更改误报的成本。

library(mlbench)
data(Sonar)

library(caret)

set.seed(990)
inTraining <- createDataPartition(Sonar$Class, p = .5, list = FALSE)
inTraining
training <- Sonar[inTraining,]
test <- Sonar[-inTraining,]

set.seed(990)
fitControl <- trainControl(method="repeatedcv", number=10, repeats=5)
statGrid <-  expand.grid(trials = 1,
                     model = "tree",
                     winnow = FALSE,
                     cost = matrix(c(
                         0, 2,
                         1, 0
                     ), 2, 2, byrow=TRUE))

set.seed(825)
statFit <- train(Class~., data=training, method="C5.0Cost", trControl=fitControl, tuneGrid = statGrid, metric = "Accuracy")

statFit["finalModel"]

write(capture.output(summary(statFit)), "c50model.txt")

R version 3.2.1 (2015-06-18) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows 8 x64 (build 9200)

locale: [1] LC_COLLATE=German_Germany.1252 LC_CTYPE=German_Germany.1252 LC_MONETARY=German_Germany.1252 [4] LC_NUMERIC=C LC_TIME=German_Germany.1252

attached base packages: [1] grid stats graphics grDevices utils datasets methods base

other attached packages: [1] DMwR_0.4.1 plyr_1.8.3 C50_0.1.0-24 caret_6.0-52 ggplot2_1.0.1 lattice_0.20-31 [7] mlbench_2.1-1

loaded via a namespace (and not attached): [1] Rcpp_0.11.6 compiler_3.2.1 nloptr_1.0.4 bitops_1.0-6
[5] xts_0.9-7 class_7.3-12 iterators_1.0.7 tools_3.2.1
[9] rpart_4.1-9 partykit_1.0-3 digest_0.6.8 lme4_1.1-8
[13] nlme_3.1-120 gtable_0.1.2 mgcv_1.8-6 Matrix_1.2-1
[17] foreach_1.4.2 parallel_3.2.1 brglm_0.5-9 SparseM_1.6
[21] proto_0.3-10 e1071_1.6-7 BradleyTerry2_1.0-6 stringr_1.0.0
[25] caTools_1.17.1 gtools_3.5.0 stats4_3.2.1 nnet_7.3-9
[29] survival_2.38-1 gdata_2.17.0 minqa_1.2.4 ROCR_1.0-7
[33] TTR_0.23-0 reshape2_1.4.1 car_2.0-26 magrittr_1.5
[37] gplots_2.17.0 scales_0.2.5 codetools_0.2-11 MASS_7.3-40
[41] splines_3.2.1 quantmod_0.4-5 abind_1.4-3 pbkrtest_0.4-2
[45] colorspace_1.2-6 quantreg_5.11 KernSmooth_2.23-14 stringi_0.5-5
[49] munsell_0.4.2 zoo_1.7-12

插入符号 (?) 接受的唯一更改是对漏报的更改(上例中设置为两个的漏报)。不幸的是,所有其他更改都被忽略了。可以通过在 R 控制台中键入 statFit["finalModel"] 轻松确认这一点。

C5.0 documentation 的第 3 页提供了有关实施成本矩阵的详细信息。从文档中,您可以看到 "diagonal elements [of the cost matrix] are ignored"

@JimBoy 我 运行 遇到了和你一样的问题。我查看了 github 上 "C5.0Cost" 的插入符号包装器的源代码,您可以看到矩阵的左上角在代码中设置为 1(请参阅 cmat 对象)。

我修改了 modelInfo 中的成本输入,以便您可以将成本添加到 flase positives 和 negatives。您现在不再包含一个成本参数,而是在 grid.expand 误报 (costFP) 和漏报 (costFN) 中指定两个,它们是您要评估的成本向量。

modelInfo <- list(label = "Cost-Sensitive C5.0",
            library = c("C50", "plyr"),
            loop = function(grid) {     
              loop <- ddply(grid, c("model", "winnow", "costFP","costFN"),
                            function(x) c(trials = max(x$trials)))                 

              submodels <- vector(mode = "list", length = nrow(loop))
              for(i in seq(along = loop$trials))
              {
                index <- which(grid$model == loop$model[i] & 
                                 grid$winnow == loop$winnow[i],
                               grid$costFP[i] == loop$costFP[i],
                               grid$costFN[i] == loop$costFN[i])
                trials <- grid[index, "trials"] 
                submodels[[i]] <- data.frame(trials = trials[trials != loop$trials[i]])
              }     
              list(loop = loop, submodels = submodels)
            },
            type = "Classification",
            parameters = data.frame(parameter = c('trials', 'model', 'winnow', "costFP","costFN"),
                                    class = c("numeric", "character", "logical", "numeric","numeric"),
                                    label = c('# Boosting Iterations', 'Model Type', 'Winnow', "CostFP","CostFN")),
            grid = function(x, y, len = NULL, search = "grid") {
              c5seq <- if(len == 1)  1 else  c(1, 10*((2:min(len, 11)) - 1))
              expand.grid(trials = c5seq, model = c("tree", "rules"), 
                          winnow = c(TRUE, FALSE),
                          costFP = 1:2,
                          costFN = 1:2)
              if(search == "grid") {
                c5seq <- if(len == 1)  1 else  c(1, 10*((2:min(len, 11)) - 1))
                out <- expand.grid(trials = c5seq, model = c("tree", "rules"), 
                                   winnow = c(TRUE, FALSE), costFP = 1:2, costFN = 1:2)
              } else {
                out <- data.frame(trials = sample(1:100, replace = TRUE, size = len),
                                  model = sample(c("tree", "rules"), replace = TRUE, size = len),
                                  winnow = sample(c(TRUE, FALSE), replace = TRUE, size = len),
                                  costFP = runif(len, min = 1, max = 20),
                                  costFN = runif(len, min = 1, max = 20))
              }
              out    
            },
            fit = function(x, y, wts, param, lev, last, classProbs, ...) { 
              theDots <- list(...)

              if(any(names(theDots) == "control"))
              {                           
                theDots$control$winnow <- param$winnow
              } else theDots$control <- C5.0Control(winnow = param$winnow)

              argList <- list(x = x, y = y, weights = wts, trials = param$trials,
                              rules = param$model == "rules")

              cmat <-matrix(c(0, param$costFP, param$costFN, 0), ncol = 2)
              rownames(cmat) <- colnames(cmat) <- levels(y)
              if(any(names(theDots) == "costFP")){
                warning("For 'C5.0Cost', the costs are a tuning parameter")
                theDots$costs <- cmat
              } else argList$costs <- cmat

              argList <- c(argList, theDots)
              do.call("C5.0.default", argList)
            },
            predict = function(modelFit, newdata, submodels = NULL) {
              out <- predict(modelFit, newdata)

              if(!is.null(submodels))
              {
                tmp <- out
                out <- vector(mode = "list", length = nrow(submodels) + 1)
                out[[1]] <- tmp

                for(j in seq(along = submodels$trials))
                  out[[j+1]] <- as.character(predict(modelFit, newdata, trial = submodels$trials[j]))
              }
              out              
            },
            prob = NULL,
            predictors = function(x, ...) {
              vars <- C5imp(x, metric = "splits")
              rownames(vars)[vars$Overall > 0]
            },
            levels = function(x) x$obsLevels,
            varImp = function(object, ...) C5imp(object, ...),
            tags = c("Tree-Based Model", "Rule-Based Model", "Implicit Feature Selection",
                     "Boosting", "Ensemble Model", "Cost Sensitive Learning", "Two Class Only", 
                     "Handle Missing Predictor Data", "Accepts Case Weights"),
            sort = function(x){
              x$model <- factor(as.character(x$model), levels = c("rules", "tree"))
              x[order(x$trials, x$model, !x$winnow, x$costFP,x$costFN),]
            },
            trim = function(x) {
              x$boostResults <- NULL
              x$size <- NULL
              x$call <- NULL
              x$output <- NULL
              x
            })

您上面提供的例子可以运行如下,

## Example provided
library(mlbench)
data(Sonar)

library(caret)


set.seed(990)
inTraining <- createDataPartition(Sonar$Class, p = .5, list = FALSE)
inTraining
training <- Sonar[inTraining,]
test <- Sonar[-inTraining,]



set.seed(990)
fitControl <- trainControl(method="repeatedcv", number=10, repeats=5)


statGrid <-  expand.grid(trials = 3,
                         model = "tree",
                         winnow = FALSE,
                         cost = 2)

set.seed(825)


statFit <- train(Class~., data=training, method="C5.0Cost", trControl=fitControl, tuneGrid = statGrid, metric = "Accuracy")


## Example modified to include costs for both false positives and negatives
set.seed(825)
statGridMod <-  expand.grid(trials = 3,
                            model = "tree",
                            winnow = FALSE,
                            costFP = c(1,2,3), #new cost parameters
                            costFN = c(3,2,1)) #new cost parameters


statFit <- train(Class~., data=training, method=modelInfo, trControl=fitControl, tuneGrid = statGridMod, metric = "Accuracy")

statFit