instantiateResampleInstance.CVDesc:尺寸折叠太多
instantiateResampleInstance.CVDesc: too many folds for size
正在对 xgboost 模型进行参数调整,运行 在我的 mlr 实现中出现了一个有趣的错误,我认为这是由我的重采样实例引起的 due to the documentation here。问题是我不太清楚如何修复它。我试过手动设置函数的大小参数,但也被拒绝了。
基本代码:
samplecount = sample.split(test_train,SplitRatio = 0.8)
stest <- subset(test_train,samplecount ==TRUE)
strain <- subset(test_train,samplecount ==FALSE)
new_tr <- model.matrix(~.+0,data = subset(strain,select=-c(Value)))
new_ts <- model.matrix(~.+0,data = subset(stest,select=-c(Value)))
labels <- strain$Value
labels <- as.numeric(labels)-1
ts_label <- stest$Value
ts_label <- as.numeric(ts_label)-1
dtrain <- xgb.DMatrix(data = new_tr,label = labels)
dtest <- xgb.DMatrix(data = new_ts,label=ts_label)
params <- list(booster = "gbtree", objective = "reg:linear",
eta=0.3, gamma=0, max_depth=6, min_child_weight=1, subsample=1, colsample_bytree=1)
xgbcv <- xgb.cv( params = params, data = dtrain,
nrounds = 100, showsd = T,nfold=5,
stratified = T, print.every.n = 1, early.stop.round = 20, maximize = F)
xgb1 <- xgb.train (params = params, data = dtrain, nrounds = 79, watchlist = list(val=dtest,train=dtrain), print.every.n = 10, early.stop.round = 10, maximize = F , eval_metric = "error")
xgbpred <- predict (xgb1,dtest)
mat <- xgb.importance (feature_names = colnames(new_tr),model = xgb1)
xgb.plot.importance (importance_matrix = mat[1:20])
#convert characters to factors
fact_col <- colnames(strain)[sapply(strain,is.character)]
for(i in fact_col) set(strain,j=i,value = factor(strain[[i]]))
for (i in fact_col) set(stest,j=i,value = factor(stest[[i]]))
## this seems like an odd add, but for the steps
strain$Value <- as.factor(strain$Value)
stest$Value <- as.factor(stest$Value)
#create tasks
traintask <- makeClassifTask (data = strain,target = "Value",fixup.data = "no")
testtask <- makeClassifTask (data = stest,target = "Value",fixup.data = "no")
#do one hot encoding`<br/>
traintask <- createDummyFeatures (obj = traintask)
testtask <- createDummyFeatures (obj = testtask)
lrn <- makeLearner("regr.xgboost",predict.type = "response")
lrn$par.vals <- list( objective="reg:linear", eval_metric="error", nrounds=100L, eta=0.1)
#set parameter space
params <- makeParamSet( makeDiscreteParam("booster",values = c("gbtree","gblinear")),
makeIntegerParam("max_depth",lower = 3L,upper = 10L),
makeNumericParam("min_child_weight",lower = 1L,upper = 10L),
makeNumericParam("subsample",lower = 0.5,upper = 1),
makeNumericParam("colsample_bytree",lower = 0.5,upper = 1))
#set resampling strategy
rdesc <- makeResampleDesc("CV",stratify = T,iters=5L)
ctrl <- makeTuneControlRandom(maxit = 10L)
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())
#parameter tuning
mytune <- tuneParams(learner = lrn, task = traintask, resampling = rdesc, , measures = acc, par.set = params, control = ctrl, show.info = T)
Error in instantiateResampleInstance.CVDesc(desc, length(ci), task) :
Cannot use more folds (5) than size (1)!
从那里我试过了:
rdesc <- makeResampleDesc("CV",stratify = T,size=5)
Error in makeResampleDescCV(size = 5) : unused argument (size = 5)
我在这里有点不知所措,有什么想法吗?
大小不是 makeResampleDesc
中的参数。我认为(不完全确定),你的问题是,你对某些 类 没有足够的观察,然后你无法进行分层。
尝试使用:rdesc <- makeResampleDesc("CV",stratify = F,iters=5)
问题来自 makeResampleDesc
函数中的 stratify
项。我们通常使用分层方法作为一种统计方法来掌握混杂,即使用模型 class化中的混淆。
您可以阅读理论和推理(在 Python 中实现)here。
在您的数据集中,如果 target
变量的 class 条目很少(少于 5 个),则 makeResampleDesc
函数无法执行该特定 class.
的重采样实例
如上所述,设置 stratify = F
将解决此处的问题,但我会对此进行调查,并会 oversample
任何 class 进行少量观察,或者, holdout
(read here) 从最初的训练到了解如何对 class 的其余部分进行分类,然后再简单地忽略 stratify
方法。
要了解哪些 classes 几乎没有观察,您可以做的是查看频率并使用以下行从那里做出决定:
library("data.table")
table(daraframe$target)
在我的例子中,我几乎没有 class GG3
的实例,所以我无法 stratify
数据,见下文:
正在对 xgboost 模型进行参数调整,运行 在我的 mlr 实现中出现了一个有趣的错误,我认为这是由我的重采样实例引起的 due to the documentation here。问题是我不太清楚如何修复它。我试过手动设置函数的大小参数,但也被拒绝了。
基本代码:
samplecount = sample.split(test_train,SplitRatio = 0.8)
stest <- subset(test_train,samplecount ==TRUE)
strain <- subset(test_train,samplecount ==FALSE)
new_tr <- model.matrix(~.+0,data = subset(strain,select=-c(Value)))
new_ts <- model.matrix(~.+0,data = subset(stest,select=-c(Value)))
labels <- strain$Value
labels <- as.numeric(labels)-1
ts_label <- stest$Value
ts_label <- as.numeric(ts_label)-1
dtrain <- xgb.DMatrix(data = new_tr,label = labels)
dtest <- xgb.DMatrix(data = new_ts,label=ts_label)
params <- list(booster = "gbtree", objective = "reg:linear",
eta=0.3, gamma=0, max_depth=6, min_child_weight=1, subsample=1, colsample_bytree=1)
xgbcv <- xgb.cv( params = params, data = dtrain,
nrounds = 100, showsd = T,nfold=5,
stratified = T, print.every.n = 1, early.stop.round = 20, maximize = F)
xgb1 <- xgb.train (params = params, data = dtrain, nrounds = 79, watchlist = list(val=dtest,train=dtrain), print.every.n = 10, early.stop.round = 10, maximize = F , eval_metric = "error")
xgbpred <- predict (xgb1,dtest)
mat <- xgb.importance (feature_names = colnames(new_tr),model = xgb1)
xgb.plot.importance (importance_matrix = mat[1:20])
#convert characters to factors
fact_col <- colnames(strain)[sapply(strain,is.character)]
for(i in fact_col) set(strain,j=i,value = factor(strain[[i]]))
for (i in fact_col) set(stest,j=i,value = factor(stest[[i]]))
## this seems like an odd add, but for the steps
strain$Value <- as.factor(strain$Value)
stest$Value <- as.factor(stest$Value)
#create tasks
traintask <- makeClassifTask (data = strain,target = "Value",fixup.data = "no")
testtask <- makeClassifTask (data = stest,target = "Value",fixup.data = "no")
#do one hot encoding`<br/>
traintask <- createDummyFeatures (obj = traintask)
testtask <- createDummyFeatures (obj = testtask)
lrn <- makeLearner("regr.xgboost",predict.type = "response")
lrn$par.vals <- list( objective="reg:linear", eval_metric="error", nrounds=100L, eta=0.1)
#set parameter space
params <- makeParamSet( makeDiscreteParam("booster",values = c("gbtree","gblinear")),
makeIntegerParam("max_depth",lower = 3L,upper = 10L),
makeNumericParam("min_child_weight",lower = 1L,upper = 10L),
makeNumericParam("subsample",lower = 0.5,upper = 1),
makeNumericParam("colsample_bytree",lower = 0.5,upper = 1))
#set resampling strategy
rdesc <- makeResampleDesc("CV",stratify = T,iters=5L)
ctrl <- makeTuneControlRandom(maxit = 10L)
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())
#parameter tuning
mytune <- tuneParams(learner = lrn, task = traintask, resampling = rdesc, , measures = acc, par.set = params, control = ctrl, show.info = T)
Error in instantiateResampleInstance.CVDesc(desc, length(ci), task) :
Cannot use more folds (5) than size (1)!
从那里我试过了:
rdesc <- makeResampleDesc("CV",stratify = T,size=5)
Error in makeResampleDescCV(size = 5) : unused argument (size = 5)
我在这里有点不知所措,有什么想法吗?
大小不是 makeResampleDesc
中的参数。我认为(不完全确定),你的问题是,你对某些 类 没有足够的观察,然后你无法进行分层。
尝试使用:rdesc <- makeResampleDesc("CV",stratify = F,iters=5)
问题来自 makeResampleDesc
函数中的 stratify
项。我们通常使用分层方法作为一种统计方法来掌握混杂,即使用模型 class化中的混淆。
您可以阅读理论和推理(在 Python 中实现)here。
在您的数据集中,如果 target
变量的 class 条目很少(少于 5 个),则 makeResampleDesc
函数无法执行该特定 class.
如上所述,设置 stratify = F
将解决此处的问题,但我会对此进行调查,并会 oversample
任何 class 进行少量观察,或者, holdout
(read here) 从最初的训练到了解如何对 class 的其余部分进行分类,然后再简单地忽略 stratify
方法。
要了解哪些 classes 几乎没有观察,您可以做的是查看频率并使用以下行从那里做出决定:
library("data.table")
table(daraframe$target)
在我的例子中,我几乎没有 class GG3
的实例,所以我无法 stratify
数据,见下文: