Classification performance degrades when rows from test dataset are shuffled
Why does classification performance degrade after I shuffle the test dataset?
For reproduction purposes, I created an imbalanced dataset:
import pandas as pd
from sklearn.datasets import make_blobs

n = 1
centers = [[0.0, -5, 2.5], [0, 0, 2.5], [0, 5, 2.5]]
cluster_std = [1.0, 1.0, 1.0]
# 250 / 24500 / 250 samples per cluster -> heavily imbalanced
X, y = make_blobs(n_samples=[250, 24500, 250], centers=centers, cluster_std=cluster_std, n_features=len(cluster_std), random_state=n)
dataset_x = pd.DataFrame({'var1': X[:, 0], 'var2': X[:, 1], 'var3': X[:, 2]})
dataset_y = pd.DataFrame({'target': y})
simulated_blob_dataset = pd.concat([dataset_x, dataset_y], axis=1)
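A quick check (added here for illustration, not part of the original post) confirms the imbalance:
# class 1 dominates with 24500 samples vs. 250 each for classes 0 and 2
print(simulated_blob_dataset['target'].value_counts())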
I split the dataset into training and testing sets:
training_data, testing_data = data_split(raw_data=simulated_blob_dataset,target_variable_name="target",test_size=0.2)
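(data_split is a custom helper whose source is not shown; a stratified scikit-learn equivalent, which would preserve the class ratios in both splits, might look like this:)
from sklearn.model_selection import train_test_split

# hypothetical stand-in for data_split: split the DataFrame, stratifying on the target
training_data, testing_data = train_test_split(
    simulated_blob_dataset, test_size=0.2,
    stratify=simulated_blob_dataset['target'], random_state=1)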
I created the base models:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix

def base_models():
    models = dict()
    models['rf'] = RandomForestClassifier(n_jobs=-1)
    models['gbm'] = GradientBoostingClassifier()
    models['dt'] = DecisionTreeClassifier()
    models['svc'] = SVC()
    models['knn'] = KNeighborsClassifier(n_jobs=-1)
    models['nb'] = GaussianNB()
    models['SE_rf'] = stack_ensemble_1()
    models['SE_gbm'] = stack_ensemble_2()
    models['SE_dt'] = stack_ensemble_3()
    models['SE_svc'] = stack_ensemble_4()
    models['SE_knn'] = stack_ensemble_5()
    models['SE_nb'] = stack_ensemble_6()
    return models

# evaluate a given model using repeated stratified cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores
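evaluate_model is defined but never called in the snippets below; typical usage (a sketch, assuming X and y are the training arrays built further down) would be:
# score every model with the CV helper above
for name, model in base_models().items():
    scores = evaluate_model(model, X, y)
    print('>%s %.3f (%.3f)' % (name, scores.mean(), scores.std()))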
def stack_ensemble_1():
    # define the base models
    level0 = list()
    level0.append(('rf', RandomForestClassifier(n_jobs=-1)))
    level0.append(('gbm', GradientBoostingClassifier()))
    level0.append(('dt', DecisionTreeClassifier()))
    level0.append(('svc', SVC()))
    level0.append(('knn', KNeighborsClassifier(n_jobs=-1)))
    level0.append(('nb', GaussianNB()))
    # define the meta-learner model
    level1 = RandomForestClassifier(n_jobs=-1)
    # define the stacking ensemble (use n_jobs=-1 for all cores)
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)
    return model

def stack_ensemble_2():
    # define the base models
    level0 = list()
    level0.append(('rf', RandomForestClassifier(n_jobs=-1)))
    level0.append(('gbm', GradientBoostingClassifier()))
    level0.append(('dt', DecisionTreeClassifier()))
    level0.append(('svc', SVC()))
    level0.append(('knn', KNeighborsClassifier(n_jobs=-1)))
    level0.append(('nb', GaussianNB()))
    # define the meta-learner model
    level1 = GradientBoostingClassifier()
    # define the stacking ensemble (use n_jobs=-1 for all cores)
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)
    return model

def stack_ensemble_3():
    # define the base models
    level0 = list()
    level0.append(('rf', RandomForestClassifier(n_jobs=-1)))
    level0.append(('gbm', GradientBoostingClassifier()))
    level0.append(('dt', DecisionTreeClassifier()))
    level0.append(('svc', SVC()))
    level0.append(('knn', KNeighborsClassifier(n_jobs=-1)))
    level0.append(('nb', GaussianNB()))
    # define the meta-learner model
    level1 = DecisionTreeClassifier()
    # define the stacking ensemble (use n_jobs=-1 for all cores)
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)
    return model

def stack_ensemble_4():
    # define the base models
    level0 = list()
    level0.append(('rf', RandomForestClassifier(n_jobs=-1)))
    level0.append(('gbm', GradientBoostingClassifier()))
    level0.append(('dt', DecisionTreeClassifier()))
    level0.append(('svc', SVC()))
    level0.append(('knn', KNeighborsClassifier(n_jobs=-1)))
    level0.append(('nb', GaussianNB()))
    # define the meta-learner model
    level1 = SVC()
    # define the stacking ensemble (use n_jobs=-1 for all cores)
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)
    return model

def stack_ensemble_5():
    # define the base models
    level0 = list()
    level0.append(('rf', RandomForestClassifier(n_jobs=-1)))
    level0.append(('gbm', GradientBoostingClassifier()))
    level0.append(('dt', DecisionTreeClassifier()))
    level0.append(('svc', SVC()))
    level0.append(('knn', KNeighborsClassifier(n_jobs=-1)))
    level0.append(('nb', GaussianNB()))
    # define the meta-learner model
    level1 = KNeighborsClassifier(n_jobs=-1)
    # define the stacking ensemble (use n_jobs=-1 for all cores)
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)
    return model

def stack_ensemble_6():
    # define the base models
    level0 = list()
    level0.append(('rf', RandomForestClassifier(n_jobs=-1)))
    level0.append(('gbm', GradientBoostingClassifier()))
    level0.append(('dt', DecisionTreeClassifier()))
    level0.append(('svc', SVC()))
    level0.append(('knn', KNeighborsClassifier(n_jobs=-1)))
    level0.append(('nb', GaussianNB()))
    # define the meta-learner model
    level1 = GaussianNB()
    # define the stacking ensemble (use n_jobs=-1 for all cores)
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)
    return model
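As an aside, the six stack_ensemble_* factories differ only in their meta-learner; a single parameterized factory (a refactoring sketch, not part of the original code) would express the same thing:
def stack_ensemble(final_estimator):
    # the same six level-0 learners used throughout
    level0 = [('rf', RandomForestClassifier(n_jobs=-1)),
              ('gbm', GradientBoostingClassifier()),
              ('dt', DecisionTreeClassifier()),
              ('svc', SVC()),
              ('knn', KNeighborsClassifier(n_jobs=-1)),
              ('nb', GaussianNB())]
    return StackingClassifier(estimators=level0, final_estimator=final_estimator, cv=10)

# e.g. stack_ensemble_1() is then equivalent to:
# stack_ensemble(RandomForestClassifier(n_jobs=-1))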
First, I run it the normal way (with the rows of the test dataset not rearranged):
X, y = training_data[['var1', 'var2', 'var3']].values, training_data['target'].values
models = base_models()
results, names = list(), list()
for name, model in models.items():
    print(name)
    clf = model.fit(X, y.ravel())
    y_pred = clf.predict(testing_data[['var1', 'var2', 'var3']].values)
    cnf_matrix = confusion_matrix(testing_data['target'].values, y_pred)
    print(cnf_matrix)
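Given the heavy class imbalance, it can also help to print balanced accuracy next to each confusion matrix (an addition to the loop above, placed inside the for body after y_pred is computed):
from sklearn.metrics import balanced_accuracy_score
# one summary number per model, robust to the 98:1:1 class ratio
print('%s balanced accuracy: %.3f' % (name, balanced_accuracy_score(testing_data['target'].values, y_pred)))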
It performs well:
However, when I shuffle the rows of the test dataset in order to check the robustness of the model [by creating different views of the same facts]:
X, y = training_data[['var1', 'var2', 'var3']].values, training_data['target'].values
models = base_models()
results, names = list(), list()
for name, model in models.items():
    print(name)
    clf = model.fit(X, y.ravel())
    # predict on the *shuffled* test features...
    y_pred = clf.predict(testing_data[['var1', 'var2', 'var3']].sample(frac=1, replace=False, random_state=1).reset_index(drop=True).values)
    # ...but score against the *unshuffled* true labels
    cnf_matrix = confusion_matrix(testing_data['target'].values, y_pred)
    print(cnf_matrix)
The results degrade:
Your shuffling procedure is wrong: you shuffle only the predictions, while the true values stay in their original order; this breaks the one-to-one correspondence between predictions and ground truth, and it is guaranteed to produce such absurd results.
You need to shuffle the true values and the predictions in tandem, using scikit-learn's shuffle utility. Here is an example using your own data and an RF classifier:
import pandas as pd
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
# your data as-is:
n = 1
centers=[[0.0, -5, 2.5], [0, 0,2.5], [0, 5,2.5]]
cluster_std = [1.0, 1.0,1.0]
X, y = make_blobs(n_samples=[250,24500,250], centers=centers, cluster_std=cluster_std,n_features=len(cluster_std), random_state = n)
dataset_x = pd.DataFrame({'var1': X[:, 0], 'var2': X[:, 1],'var3': X[:, 2]})
dataset_y = pd.DataFrame({'target': y})
simulated_blob_dataset = pd.concat([dataset_x,dataset_y], axis=1)
# train-test split using scikit-learn, as data_split is of unknown origin:
training_data, testing_data = train_test_split(simulated_blob_dataset, test_size=0.2)
# fit & predict
rf = RandomForestClassifier(n_jobs=-1)
X, y = training_data[['var1', 'var2', 'var3']].values,training_data['target'].values
rf.fit(X,y.ravel())
y_pred = rf.predict(testing_data[['var1', 'var2', 'var3']].values)
cm = confusion_matrix(testing_data['target'].values, y_pred)
print(cm)
The resulting confusion matrix cm is:
[[ 42 2 0]
[ 3 4896 0]
[ 0 3 54]]
Now, shuffling correctly with shuffle:
y_true_shuffled, y_pred_shuffled = shuffle(testing_data['target'].values, y_pred)
cm_shuffled = confusion_matrix(y_true_shuffled, y_pred_shuffled)
print(cm_shuffled)
The resulting confusion matrix cm_shuffled is:
[[ 42 2 0]
[ 3 4896 0]
[ 0 3 54]]
and
np.all(cm==cm_shuffled)
# True
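For completeness, the same invariance holds if you shuffle the whole test DataFrame before predicting, as long as features and labels move together (a sketch reusing rf and testing_data from the example above):
# shuffle complete test rows: features and labels stay paired
testing_shuffled = testing_data.sample(frac=1, random_state=1).reset_index(drop=True)
y_pred_s = rf.predict(testing_shuffled[['var1', 'var2', 'var3']].values)
print(confusion_matrix(testing_shuffled['target'].values, y_pred_s))
# identical to cm: a row-wise classifier is indifferent to row order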