从 R 中的 xgboost 模型绘制 AUC
Plotting the AUC from an xgboost model in R
我目前正在学习以下 link 中的幻灯片。我看到第 121/128 张幻灯片,想知道如何复现其中的 AUC 图。作者没有解释具体做法(第 124 张幻灯片也是如此)。其次,第 125 张幻灯片给出了以下代码:
# Pick the row index that maximizes (column 3 - column 4) of as.matrix(cv.res),
# then print it.
bestRound = which.max(as.matrix(cv.res)[,3]-as.matrix(cv.res)[,4])
bestRound
我收到以下错误;
Error in as.matrix(cv.res)[, 2] : subscript out of bounds
下面代码所用的数据可以在 here 下载,我整理了下面的代码供大家参考。
问题:我怎样才能像作者那样生成 AUC 图?为什么会出现下标越界?
-----代码------
# Kaggle Winning Solutions
# Load the training and test sets (paths are relative to the working directory).
train <- read.csv('train.csv', header = TRUE)
test <- read.csv('test.csv', header = TRUE)
# The first column of the training file is taken as the label vector.
y <- train[, 1]
# Drop the label column and convert both sets to matrices.
train <- as.matrix(train[, -1])
test <- as.matrix(test)
# Print the first training row for a quick look at the features.
train[1, ]
# We want to determine who is more influential than the other
# Build a mirrored copy of the training rows by swapping columns 12:22 with
# columns 1:11, append it below the original rows, and use flipped labels
# (1 - y) for the mirrored half.
new.train <- cbind(train[, 12:22], train[, 1:11])
train = rbind(train, new.train)
y <- c(y, 1 - y)
# Stack train and test so the feature engineering below is applied to both.
x <- rbind(train, test)
# Smoothed ratio of two columns of a matrix.
#
# dat    : numeric matrix (or data frame) indexed by column position.
# i, j   : column indices of the numerator and the denominator.
# lambda : constant added to both columns before dividing, which keeps the
#          ratio finite when the denominator column contains zeros.
#          NOTE(review): the default of 1 is reconstructed, because the
#          function header was missing from this listing; confirm against
#          the original slides.
# Returns a numeric vector with one ratio per row of dat.
calcRatio <- function(dat, i, j, lambda = 1) {
  (dat[, i] + lambda) / (dat[, j] + lambda)
}
# Ratio features for the first group of columns (A.*), each built with
# calcRatio() on a pair of columns of x.
A.follow.ratio = calcRatio(x,1,2)
A.mention.ratio = calcRatio(x,4,6)
A.retweet.ratio = calcRatio(x,5,7)
A.follow.post = calcRatio(x,1,8)
A.mention.post = calcRatio(x,4,8)
A.retweet.post = calcRatio(x,5,8)
# The same six ratios for the second group (B.*), using columns 12-19.
B.follow.ratio = calcRatio(x,12,13)
B.mention.ratio = calcRatio(x,15,17)
B.retweet.ratio = calcRatio(x,16,18)
B.follow.post = calcRatio(x,12,19)
B.mention.post = calcRatio(x,15,19)
B.retweet.post = calcRatio(x,16,19)
# Rebuild x as: original columns 1:11 + six A.* ratios, followed by
# original columns 12:22 + six B.* ratios (17 + 17 = 34 columns).
x = cbind(x[,1:11],
A.follow.ratio,A.mention.ratio,A.retweet.ratio,
A.follow.post,A.mention.post,A.retweet.post,
x[,12:22],
B.follow.ratio,B.mention.ratio,B.retweet.ratio,
B.follow.post,B.mention.post,B.retweet.post)
# Element-wise difference between the first 17 and the last 17 columns,
# appended as additional features.
AB.diff = x[,1:17]-x[,18:34]
x = cbind(x,AB.diff)
# Split the combined matrix back: the first nrow(train) rows are the training
# set, the remaining rows are the test set.
train = x[1:nrow(train),]
test = x[-(1:nrow(train)),]
# Seed the RNG before cross-validation (presumably for reproducible folds).
set.seed(1024)
# 3-fold cross-validation for 100 rounds with AUC as the evaluation metric.
cv.res <- xgb.cv(data = train, nfold = 3, label = y, nrounds = 100, verbose = FALSE,
objective = 'binary:logistic', eval_metric = 'auc')
在此处绘制 AUC 图
# Re-seed and rerun the 3-fold cross-validation, this time for 3000 rounds with
# explicit hyper-parameters (eta, gamma, lambda, max_depth, min_child_weight,
# subsample, colsample_bytree) and 8 threads. This overwrites the earlier cv.res.
set.seed(1024)
cv.res = xgb.cv(data = train, nfold = 3, label = y, nrounds = 3000,
objective='binary:logistic', eval_metric = 'auc',
eta = 0.005, gamma = 1,lambda = 3, nthread = 8,
max_depth = 4, min_child_weight = 1, verbose = F,
subsample = 0.8,colsample_bytree = 0.8)
我的代码就是在下面这一步中断的:
#bestRound: - subscript out of bounds
# NOTE(review): this line indexes columns 3 and 4 of as.matrix(cv.res). The
# reported "subscript out of bounds" error suggests cv.res does not expose
# those columns in this form; the answer further down reads
# cv.res$evaluation_log$test_auc_mean instead.
bestRound <- which.max(as.matrix(cv.res)[,3]-as.matrix(cv.res)[,4])
bestRound
# Print the full CV result and the row for the selected round.
cv.res
cv.res[bestRound,]
# Seed the RNG, then train the final model on the full training matrix with the
# same hyper-parameters as the 3000-round cross-validation call.
# The seed call and the assignment must be separate statements: having both on
# one line without a separator is a syntax error in R.
set.seed(1024)
bst <- xgboost(data = train, label = y, nrounds = 3000,
               objective = 'binary:logistic', eval_metric = 'auc',
               eta = 0.005, gamma = 1, lambda = 3, nthread = 8,
               max_depth = 4, min_child_weight = 1,
               subsample = 0.8, colsample_bytree = 0.8)
# Predict on the test matrix, passing bestRound as ntreelimit so that only the
# selected number of boosting rounds is used.
preds <- predict(bst,test,ntreelimit = bestRound)
# Assemble an (Id, Choice) table with one row per test row and write it to
# submission.csv without quotes or row names.
result <- data.frame(Id = 1:nrow(test), Choice = preds)
write.csv(result,'submission.csv',quote=FALSE,row.names=FALSE)
代码的许多部分对我来说意义不大,但这里是使用提供的数据构建模型的最小示例:
数据:
# Load the training data; the first column is the label, the rest are features.
train <- read.csv('train.csv', header = TRUE)
y <- train[, 1]
# Drop the label column and convert the features to a matrix.
train <- as.matrix(train[, -1])
模型:
library(xgboost)
# 3-fold cross-validation for 100 rounds with AUC as the metric.
# prediction = T asks xgb.cv to also return the cross-validation predictions,
# which are read later as cv.res$pred.
cv.res <- xgb.cv(data = train, nfold = 3, label = y, nrounds = 100, verbose = FALSE,
objective = 'binary:logistic', eval_metric = 'auc', prediction = T)
要获得交叉验证预测,必须在调用 xgb.cv 时指定 prediction = T。
获得最佳迭代:
# Position of the highest mean test AUC in the evaluation log ...
it = which.max(cv.res$evaluation_log$test_auc_mean)
# ... and the iteration number recorded at that position.
best.iter = cv.res$evaluation_log$iter[it]
在交叉验证结果上绘制 ROC 曲线:
library(pROC)
# Build the ROC curve from the labels and the cross-validation predictions
# (cv.res$pred), with class levels given as c(0, 1), and plot it with a
# line width of 1.5.
plot(pROC::roc(response = y,
predictor = cv.res$pred,
levels=c(0, 1)),
lwd=1.5)
得到混淆矩阵(假设0.5概率为阈值):
library(caret)
# Threshold the cross-validation probabilities at 0.5 and compare them with the
# true labels. Current caret versions require `data` and `reference` to be
# factors with the same levels, so both vectors are converted explicitly.
# With levels c(0, 1), class 0 is the positive class, as in the output below.
confusionMatrix(factor(ifelse(cv.res$pred <= 0.5, 0, 1), levels = c(0, 1)),
                factor(y, levels = c(0, 1)))
#output
Reference
Prediction 0 1
0 2020 638
1 678 2164
Accuracy : 0.7607
95% CI : (0.7492, 0.772)
No Information Rate : 0.5095
P-Value [Acc > NIR] : <2e-16
Kappa : 0.5212
Mcnemar's Test P-Value : 0.2823
Sensitivity : 0.7487
Specificity : 0.7723
Pos Pred Value : 0.7600
Neg Pred Value : 0.7614
Prevalence : 0.4905
Detection Rate : 0.3673
Detection Prevalence : 0.4833
Balanced Accuracy : 0.7605
'Positive' Class : 0
也就是说,应该通过交叉验证来调整超参数,例如 eta、gamma、lambda、subsample、colsample_bytree、colsample_bylevel 等。
最简单的方法是构建一个网格搜索:先用 expand.grid 生成超参数的所有组合,再在该网格上使用 lapply,并在自定义函数中调用 xgb.cv。如果您需要更多详细信息,请发表评论。
我目前正在学习以下 link 中的幻灯片。我看到第 121/128 张幻灯片,想知道如何复现其中的 AUC 图。作者没有解释具体做法(第 124 张幻灯片也是如此)。其次,第 125 张幻灯片给出了以下代码:
# Pick the row index that maximizes (column 3 - column 4) of as.matrix(cv.res),
# then print it.
bestRound = which.max(as.matrix(cv.res)[,3]-as.matrix(cv.res)[,4])
bestRound
我收到以下错误;
Error in as.matrix(cv.res)[, 2] : subscript out of bounds
下面代码所用的数据可以在 here 下载,我整理了下面的代码供大家参考。
问题:我怎样才能像作者那样生成 AUC 图?为什么会出现下标越界?
-----代码------
# Kaggle Winning Solutions
# Load the training and test sets (paths are relative to the working directory).
train <- read.csv('train.csv', header = TRUE)
test <- read.csv('test.csv', header = TRUE)
# The first column of the training file is taken as the label vector.
y <- train[, 1]
# Drop the label column and convert both sets to matrices.
train <- as.matrix(train[, -1])
test <- as.matrix(test)
# Print the first training row for a quick look at the features.
train[1, ]
# We want to determine who is more influential than the other
# Build a mirrored copy of the training rows by swapping columns 12:22 with
# columns 1:11, append it below the original rows, and use flipped labels
# (1 - y) for the mirrored half.
new.train <- cbind(train[, 12:22], train[, 1:11])
train = rbind(train, new.train)
y <- c(y, 1 - y)
# Stack train and test so the feature engineering below is applied to both.
x <- rbind(train, test)
# Smoothed ratio of two columns of a matrix.
#
# dat    : numeric matrix (or data frame) indexed by column position.
# i, j   : column indices of the numerator and the denominator.
# lambda : constant added to both columns before dividing, which keeps the
#          ratio finite when the denominator column contains zeros.
#          NOTE(review): the default of 1 is reconstructed, because the
#          function header was missing from this listing; confirm against
#          the original slides.
# Returns a numeric vector with one ratio per row of dat.
calcRatio <- function(dat, i, j, lambda = 1) {
  (dat[, i] + lambda) / (dat[, j] + lambda)
}
# Ratio features for the first group of columns (A.*), each built with
# calcRatio() on a pair of columns of x.
A.follow.ratio = calcRatio(x,1,2)
A.mention.ratio = calcRatio(x,4,6)
A.retweet.ratio = calcRatio(x,5,7)
A.follow.post = calcRatio(x,1,8)
A.mention.post = calcRatio(x,4,8)
A.retweet.post = calcRatio(x,5,8)
# The same six ratios for the second group (B.*), using columns 12-19.
B.follow.ratio = calcRatio(x,12,13)
B.mention.ratio = calcRatio(x,15,17)
B.retweet.ratio = calcRatio(x,16,18)
B.follow.post = calcRatio(x,12,19)
B.mention.post = calcRatio(x,15,19)
B.retweet.post = calcRatio(x,16,19)
# Rebuild x as: original columns 1:11 + six A.* ratios, followed by
# original columns 12:22 + six B.* ratios (17 + 17 = 34 columns).
x = cbind(x[,1:11],
A.follow.ratio,A.mention.ratio,A.retweet.ratio,
A.follow.post,A.mention.post,A.retweet.post,
x[,12:22],
B.follow.ratio,B.mention.ratio,B.retweet.ratio,
B.follow.post,B.mention.post,B.retweet.post)
# Element-wise difference between the first 17 and the last 17 columns,
# appended as additional features.
AB.diff = x[,1:17]-x[,18:34]
x = cbind(x,AB.diff)
# Split the combined matrix back: the first nrow(train) rows are the training
# set, the remaining rows are the test set.
train = x[1:nrow(train),]
test = x[-(1:nrow(train)),]
# Seed the RNG before cross-validation (presumably for reproducible folds).
set.seed(1024)
# 3-fold cross-validation for 100 rounds with AUC as the evaluation metric.
cv.res <- xgb.cv(data = train, nfold = 3, label = y, nrounds = 100, verbose = FALSE,
objective = 'binary:logistic', eval_metric = 'auc')
在此处绘制 AUC 图
# Re-seed and rerun the 3-fold cross-validation, this time for 3000 rounds with
# explicit hyper-parameters (eta, gamma, lambda, max_depth, min_child_weight,
# subsample, colsample_bytree) and 8 threads. This overwrites the earlier cv.res.
set.seed(1024)
cv.res = xgb.cv(data = train, nfold = 3, label = y, nrounds = 3000,
objective='binary:logistic', eval_metric = 'auc',
eta = 0.005, gamma = 1,lambda = 3, nthread = 8,
max_depth = 4, min_child_weight = 1, verbose = F,
subsample = 0.8,colsample_bytree = 0.8)
我的代码就是在下面这一步中断的:
#bestRound: - subscript out of bounds
# NOTE(review): this line indexes columns 3 and 4 of as.matrix(cv.res). The
# reported "subscript out of bounds" error suggests cv.res does not expose
# those columns in this form; the answer further down reads
# cv.res$evaluation_log$test_auc_mean instead.
bestRound <- which.max(as.matrix(cv.res)[,3]-as.matrix(cv.res)[,4])
bestRound
# Print the full CV result and the row for the selected round.
cv.res
cv.res[bestRound,]
# Seed the RNG, then train the final model on the full training matrix with the
# same hyper-parameters as the 3000-round cross-validation call.
# The seed call and the assignment must be separate statements: having both on
# one line without a separator is a syntax error in R.
set.seed(1024)
bst <- xgboost(data = train, label = y, nrounds = 3000,
               objective = 'binary:logistic', eval_metric = 'auc',
               eta = 0.005, gamma = 1, lambda = 3, nthread = 8,
               max_depth = 4, min_child_weight = 1,
               subsample = 0.8, colsample_bytree = 0.8)
# Predict on the test matrix, passing bestRound as ntreelimit so that only the
# selected number of boosting rounds is used.
preds <- predict(bst,test,ntreelimit = bestRound)
# Assemble an (Id, Choice) table with one row per test row and write it to
# submission.csv without quotes or row names.
result <- data.frame(Id = 1:nrow(test), Choice = preds)
write.csv(result,'submission.csv',quote=FALSE,row.names=FALSE)
代码的许多部分对我来说意义不大,但这里是使用提供的数据构建模型的最小示例:
数据:
# Load the training data; the first column is the label, the rest are features.
train <- read.csv('train.csv', header = TRUE)
y <- train[, 1]
# Drop the label column and convert the features to a matrix.
train <- as.matrix(train[, -1])
模型:
library(xgboost)
# 3-fold cross-validation for 100 rounds with AUC as the metric.
# prediction = T asks xgb.cv to also return the cross-validation predictions,
# which are read later as cv.res$pred.
cv.res <- xgb.cv(data = train, nfold = 3, label = y, nrounds = 100, verbose = FALSE,
objective = 'binary:logistic', eval_metric = 'auc', prediction = T)
要获得交叉验证预测,必须在调用 xgb.cv 时指定 prediction = T。
获得最佳迭代:
# Position of the highest mean test AUC in the evaluation log ...
it = which.max(cv.res$evaluation_log$test_auc_mean)
# ... and the iteration number recorded at that position.
best.iter = cv.res$evaluation_log$iter[it]
在交叉验证结果上绘制 ROC 曲线:
library(pROC)
# Build the ROC curve from the labels and the cross-validation predictions
# (cv.res$pred), with class levels given as c(0, 1), and plot it with a
# line width of 1.5.
plot(pROC::roc(response = y,
predictor = cv.res$pred,
levels=c(0, 1)),
lwd=1.5)
得到混淆矩阵(假设0.5概率为阈值):
library(caret)
# Threshold the cross-validation probabilities at 0.5 and compare them with the
# true labels. Current caret versions require `data` and `reference` to be
# factors with the same levels, so both vectors are converted explicitly.
# With levels c(0, 1), class 0 is the positive class, as in the output below.
confusionMatrix(factor(ifelse(cv.res$pred <= 0.5, 0, 1), levels = c(0, 1)),
                factor(y, levels = c(0, 1)))
#output
Reference
Prediction 0 1
0 2020 638
1 678 2164
Accuracy : 0.7607
95% CI : (0.7492, 0.772)
No Information Rate : 0.5095
P-Value [Acc > NIR] : <2e-16
Kappa : 0.5212
Mcnemar's Test P-Value : 0.2823
Sensitivity : 0.7487
Specificity : 0.7723
Pos Pred Value : 0.7600
Neg Pred Value : 0.7614
Prevalence : 0.4905
Detection Rate : 0.3673
Detection Prevalence : 0.4833
Balanced Accuracy : 0.7605
'Positive' Class : 0
也就是说,应该通过交叉验证来调整超参数,例如 eta、gamma、lambda、subsample、colsample_bytree、colsample_bylevel 等。
最简单的方法是构建一个网格搜索:先用 expand.grid 生成超参数的所有组合,再在该网格上使用 lapply,并在自定义函数中调用 xgb.cv。如果您需要更多详细信息,请发表评论。