Cross-validation on XGBClassifier for multiclass classification in Python

The following code performs cross-validation on an XGBClassifier for a multiclass classification problem:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

def modelFit(alg, X, y, useTrainCV=True, cvFolds=5, early_stopping_rounds=50):
    if useTrainCV:
        # Cross-validate with the native API to pick the number of boosting rounds
        xgbParams = alg.get_xgb_params()
        xgTrain = xgb.DMatrix(X, label=y)
        cvresult = xgb.cv(xgbParams,
                          xgTrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cvFolds,
                          stratified=True,
                          metrics={'mlogloss'},
                          early_stopping_rounds=early_stopping_rounds,
                          seed=0,
                          callbacks=[xgb.callback.print_evaluation(show_stdv=False),
                                     xgb.callback.early_stop(3)])
        print(cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the full training data
    alg.fit(X, y, eval_metric='mlogloss')

    # Predict labels and class probabilities on the training data
    dtrainPredictions = alg.predict(X)
    dtrainPredProb = alg.predict_proba(X)

    # Print model report
    print("\nModel Report")
    print("Classification report: \n")
    print(classification_report(y, dtrainPredictions))
    print("Accuracy : %.4g" % metrics.accuracy_score(y, dtrainPredictions))
    print("Log Loss Score (Train): %f" % metrics.log_loss(y, dtrainPredProb))

    # Plot feature importances
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

# 1) Read training set
print('>> Read training set')
train = pd.read_csv(trainFile)

# 2) Extract target attribute and convert to numeric
print('>> Preprocessing')
y_train = train['OutcomeType'].values
le_y = LabelEncoder()
y_train = le_y.fit_transform(y_train)
train.drop('OutcomeType', axis=1, inplace=True)

# 3) Extract features from training set
X_train = train.values

# 4) First classifier (named xgb1 so it does not shadow the xgboost module)
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     scale_pos_weight=1,
                     objective='multi:softprob',
                     seed=27)

modelFit(xgb1, X_train, y_train)
where y_train contains labels from 0 to 4. However, when I run this code, I get the following error from the xgb.cv function:

xgboost.core.XGBoostError: value 0 for Parameter num_class should be greater equal to 1

In the XGBoost documentation I read that in the multiclass case xgb infers the number of classes from the labels in the target vector, so I don't understand what is going on.
You have to add the parameter 'num_class' to the xgbParams dictionary. This is also mentioned in the parameter description and in the comments of the link you provided above.
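A minimal sketch of the fix, assuming xgboost versions from that era in which the sklearn wrapper only fills in num_class inside its own fit() method, so the dictionary returned by get_xgb_params() still carries the default value of 0 (the value named in the error) when it reaches xgb.cv:

# Inside modelFit, right after fetching the wrapper's parameters:
xgbParams = alg.get_xgb_params()
# xgb.cv bypasses the sklearn wrapper, so num_class is still 0 here;
# derive it from the target vector instead of hard-coding it.
xgbParams['num_class'] = len(np.unique(y))

With that one line added, xgb.cv knows it is cross-validating a 5-class 'multi:softprob' objective and the rest of modelFit runs unchanged.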