Doing a binary classification using xgboost, getting a 10000x2 yhat
I am new to xgboost, and I am trying to run a binary classification model.
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost.sklearn import XGBClassifier
df_train = pd.read_csv('case2_training.csv')
df_test = pd.read_csv('case2_testing.csv')
print(df_train.shape, df_test.shape)
# Examine the data
print(df_train.head())
print(df_test.head())
# Check for existence of nulls
df_train.info()
print('--------------------------------------------------------------')
X, y = df_train.iloc[:, :-1], df_train.iloc[:, -1]  # X = features, y = label
data_dmatrix = xgb.DMatrix(data=X, label=y)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Name the estimator `clf` rather than `xgb`, so it does not shadow the imported xgboost module
clf = XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
                    colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
                    importance_type='gain', interaction_constraints=None,
                    learning_rate=0.3, max_delta_step=0, max_depth=6,
                    min_child_weight=1, monotone_constraints=None,
                    n_estimators=25, n_jobs=0, num_parallel_tree=1,
                    objective='binary:logistic', random_state=0, reg_alpha=0,
                    reg_lambda=1, scale_pos_weight=None, seed=0, subsample=0.5,
                    tree_method=None, validate_parameters=False, verbosity=None)
clf.fit(X_train, y_train)
yhat = clf.predict_proba(X_test)
print(yhat.shape)
print(yhat)
print(y_test.shape)
# train_accuracy = roc_auc_score(y_test, yhat)
Here yhat comes out as a (10000, 2) array, which is not what I expected. I expected a 10000x1 array that I could round and compare against y_test.
Can anyone help?
Thanks!
predict_proba gives you the probability of each class for every data point, so the two numbers in each row sum to 1. You can recover the predicted class by taking np.argmax() along the rows.
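For example, here is a minimal sketch using the yhat and y_test produced above (argmax recovers hard labels; the positive-class column is what roc_auc_score expects):

import numpy as np
# Each row of yhat is [P(class 0), P(class 1)] and sums to 1.
yhat_labels = np.argmax(yhat, axis=1)  # hard 0/1 labels, shape (10000,)
# For AUC, pass the probability of the positive class (column 1):
auc = roc_auc_score(y_test, yhat[:, 1])

Equivalently, clf.predict(X_test) returns the hard 0/1 labels directly, so the argmax step is only needed when you start from predict_proba.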