多分类集成交叉验证函数太多值无法解包(预期 2)
Multi Classification Ensemble Cross validation Function too many values to unpack (expected 2)
[示例文件链接][1]
[1]: https://www.dropbox.com/s/vk0ht1bowdhz85n/Whosebug_Example.csv?dl=0
下面的代码分为两部分:函数定义,以及调用该函数的主代码。代码中穿插了许多 print 语句,用于帮助排查问题。我认为这个问题与 `mean_feature_importances` 变量有关。这个流程原本用于比较二分类(binary)分类器,运行起来没有任何问题;我尝试把它改成评估多分类(multi-class)分类器,以便比较它们的表现。原来的函数只需要 2 个标签是合理的,因为它本来就是为二分类设计的,但现在这个模型有 5 个不同的标签可供选择。我已经修改了我认为需要改动的每一处,以适应 5 个标签而不是 2 个。如果我遗漏了什么,请指出。问题发生在 print('19') 之后的 return 处。
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron # linear classifiers
from sklearn.model_selection import StratifiedKFold # train/test splitting tool for cross-validation
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, \
GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc # scoring metrics
Here is the function used to process classifier ensemble cross validation
def train_MultiClass_classifier_ensemble_CV(classifiers, X_data, y_data, clf_params=None, cv_splits=10,
                                            random_state=21, return_trained_classifiers=True, verbose=0,
                                            prtParam=0):
    """
    Cross-validate a list of classifiers and tabulate their accuracy and F1 scores.

    Each classifier is fit on every stratified fold; train accuracy, test accuracy and
    weighted F1 are averaged over the folds. Feature importances are averaged as well for
    classifiers exposing ``feature_importances_``.

    NOTE: the number of returned values depends on ``return_trained_classifiers``
    (three values by default, two when it is False). Callers must unpack accordingly,
    otherwise Python raises "too many values to unpack".

    :param classifiers: List of classifier objects following the scikit-learn estimator API.
    :param X_data: Pandas dataframe containing the training features.
    :param y_data: Pandas series/dataframe containing the training class labels.
    :param clf_params: (Optional) List of parameter dictionaries, one per classifier. If omitted,
                       each classifier's current parameters are used, with its 'random_state'
                       (or 'seed') parameter overridden by ``random_state``.
    :param cv_splits: Integer number of cross-validation splits.
    :param random_state: Seed for reproducibility between executions.
    :param return_trained_classifiers: If not False, every classifier is re-fit on the complete
                                       data and the list of fit classifiers is returned as well.
    :param verbose: 0 for silent operation; 1 to print status and debugging output.
    :param prtParam: 1 to print each classifier before it is cross-validated.
    :return clf_comparison: Dataframe tabulating the cross-validated performance of each classifier.
    :return mean_feature_importances: List with one entry per classifier: the fold-averaged feature
                                      importances, or False when unavailable (or for AdaBoost).
    :return trained_classifiers: (only if return_trained_classifiers is not False) List of
                                 classifiers fit on the complete data.
    :raises ValueError: If ``clf_params`` is given but its length differs from ``classifiers``.
    """
    show_status = verbose == 1

    def _trace(*values):
        # Debugging output is only emitted in verbose mode.
        if show_status:
            print(*values)

    # shuffle=True is required for random_state to take effect; recent scikit-learn
    # releases raise a ValueError when a seed is given without shuffling.
    kfold = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=random_state)

    if clf_params is None:  # construct using each classifier's existing parameter assignment
        clf_params = []
        for clf in classifiers:
            params = clf.get_params()
            if 'random_state' in params:  # assign random state / seed
                params['random_state'] = random_state
            elif 'seed' in params:
                params['seed'] = random_state
            clf_params.append(params)
    elif len(clf_params) != len(classifiers):
        # zip() below would otherwise silently skip the unmatched classifiers.
        raise ValueError('clf_params must contain one parameter dictionary per classifier '
                         '(got %d for %d classifiers)' % (len(clf_params), len(classifiers)))

    classifier_name = []
    train_accuracy_mean = []
    train_accuracy_std = []
    test_accuracy_mean = []
    test_accuracy_std = []
    f1_score_mean = []
    f1_score_std = []
    mean_feature_importances = []
    trained_classifiers = []

    # step through the classifiers for training and scoring with cross-validation
    for clf, params in zip(classifiers, clf_params):
        name = get_clf_name(clf)  # helper defined elsewhere in the project
        classifier_name.append(name)
        if prtParam == 1:
            print(clf)
        if show_status:
            print('\nPerforming Cross-Validation on Classifier %s of %s:'
                  % (len(classifier_name), len(classifiers)))
            print(name)

        # The parameters do not change between folds, so apply them once.
        clf.set_params(**params)
        _trace(clf)
        _trace(params)

        kth_train_accuracy = []
        kth_test_accuracy = []
        kth_test_f1_score = []
        kth_feature_importances = []
        for train, test in kfold.split(X_data, y_data):
            X_train, y_train = X_data.iloc[train], y_data.iloc[train]
            X_test, y_test = X_data.iloc[test], y_data.iloc[test]

            # The original wrapped this call in OneVsOneClassifier(...) and discarded the
            # wrapper, which had no effect; scikit-learn classifiers handle multi-class
            # targets themselves, so the estimator is fit directly.
            clf.fit(X_train, y_train)
            kth_train_accuracy.append(clf.score(X_train, y_train))
            kth_test_accuracy.append(clf.score(X_test, y_test))
            kth_test_f1_score.append(f1_score(y_true=y_test, y_pred=clf.predict(X_test),
                                              average='weighted'))
            if hasattr(clf, 'feature_importances_'):  # some classifiers lack this attribute
                _trace(clf.feature_importances_)
                kth_feature_importances.append(clf.feature_importances_)

        # populate scoring statistics for this classifier (over all cross-validation splits)
        train_accuracy_mean.append(np.mean(kth_train_accuracy))
        train_accuracy_std.append(np.std(kth_train_accuracy))
        test_accuracy_mean.append(np.mean(kth_test_accuracy))
        test_accuracy_std.append(np.std(kth_test_accuracy))
        f1_score_mean.append(np.mean(kth_test_f1_score))
        f1_score_std.append(np.std(kth_test_f1_score))
        _trace(kth_test_f1_score)
        _trace(kth_feature_importances)

        # Mean feature importances, where available. AdaBoost importances are
        # deliberately dropped (their interpretation is not discussed).
        if not kth_feature_importances or isinstance(clf, AdaBoostClassifier):
            mean_feature_importances.append(False)
        else:
            mean_feature_importances.append(np.mean(kth_feature_importances, axis=0))

        # if requested, also export the classifier after fitting on the complete training set
        if return_trained_classifiers is not False:
            clf.fit(X_data, y_data)
            trained_classifiers.append(clf)

    # construct dataframe for comparison of classifiers, enforcing the desired column order
    column_order = ['Classifier Name', 'Mean Train Accuracy',
                    'Train Accuracy Standard Deviation', 'Mean Test Accuracy',
                    'Test Accuracy Standard Deviation', 'Mean Test F1-Score',
                    'F1-Score Standard Deviation']
    clf_comparison = pd.DataFrame({'Classifier Name': classifier_name,
                                   'Mean Train Accuracy': train_accuracy_mean,
                                   'Train Accuracy Standard Deviation': train_accuracy_std,
                                   'Mean Test Accuracy': test_accuracy_mean,
                                   'Test Accuracy Standard Deviation': test_accuracy_std,
                                   'Mean Test F1-Score': f1_score_mean,
                                   'F1-Score Standard Deviation': f1_score_std},
                                  columns=column_order)

    # add trained_classifiers to the function return, if requested, otherwise omit
    if return_trained_classifiers is not False:
        _trace('19')
        _trace(clf_comparison)
        _trace(mean_feature_importances)
        _trace(trained_classifiers)
        return clf_comparison, mean_feature_importances, trained_classifiers  # three values
    _trace('20')
    return clf_comparison, mean_feature_importances  # two values
下面是调用代码,配合上面的附件应该可以帮助您重现错误。数据文件(Dataframe)可以从上面的链接下载,并放在运行代码的目录下。我相信我已经包含了运行代码所需的每个包;如有遗漏,请自行导入。
# Driver script reproducing the error: load the example data, build the
# classifier list, and run the cross-validation helper.

# Split the CSV into the target column ('AgeBin') and the remaining feature columns.
dfage_train = pd.read_csv('Whosebug_Example.csv')
y1 = dfage_train['AgeBin']
X1 = dfage_train
X1 = X1.drop(['AgeBin'], axis=1)
num_jobs=-1 # I'll use all available CPUs when possible
# NOTE(review): LinearSVC is not among the imports shown with this code; it is
# provided by sklearn.svm and must be imported for this list to be built.
Ageclassifier_list = [LogisticRegression(n_jobs=num_jobs, solver='lbfgs'),
RandomForestClassifier(criterion = 'entropy',n_estimators=100, n_jobs=num_jobs),
LinearSVC(class_weight=None,random_state=27,multi_class='ovr')]
# Cast the feature columns to the dtypes used for training.
X1['Pclass'] = X1['Pclass'].astype(int)
X1['isMale'] = X1['isMale'].astype(bool)
X1['Embarked'] = X1['Embarked'].astype(int)
# NOTE: return_trained_classifiers is left at its default (True), so the function
# returns THREE values (comparison table, feature importances, trained classifiers).
# Unpacking into only two names is what raises
# "ValueError: too many values to unpack (expected 2)".
clf_comp_Full_FeatureSet, mean_feature_importances = train_MultiClass_classifier_ensemble_CV(classifiers=Ageclassifier_list, prtParam = 1,
verbose=1,
X_data=X1,
y_data=y1)
错误输出
ValueError: too many values to unpack (expected 2)
您的函数 train_MultiClass_classifier_ensemble_CV 会根据条件返回 2 个或 3 个值。不要这样做,因为把返回值赋给变量时可能会出现数量不匹配的情况。现在它返回了 3 个值,而您只用 2 个变量来接收。解决办法是用三个变量接收返回值,或者在调用时传入 return_trained_classifiers=False。有问题的部分如下:
if return_trained_classifiers is not False:
print('19')
print(clf_comparison)
print(mean_feature_importances)
print(trained_classifiers)
return clf_comparison, mean_feature_importances, trained_classifiers # three here
else:
print('20')
return clf_comparison, mean_feature_importances # two here
[示例文件链接][1]

[1]: https://www.dropbox.com/s/vk0ht1bowdhz85n/Whosebug_Example.csv?dl=0
下面的代码分为两部分:函数定义,以及调用该函数的主代码。代码中穿插了许多 print 语句,用于帮助排查问题。我认为这个问题与 `mean_feature_importances` 变量有关。这个流程原本用于比较二分类(binary)分类器,运行起来没有任何问题;我尝试把它改成评估多分类(multi-class)分类器,以便比较它们的表现。原来的函数只需要 2 个标签是合理的,因为它本来就是为二分类设计的,但现在这个模型有 5 个不同的标签可供选择。我已经修改了我认为需要改动的每一处,以适应 5 个标签而不是 2 个。如果我遗漏了什么,请指出。问题发生在 print('19') 之后的 return 处。
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron # linear classifiers
from sklearn.model_selection import StratifiedKFold # train/test splitting tool for cross-validation
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, \
GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc # scoring metrics
Here is the function used to process classifier ensemble cross validation
def train_MultiClass_classifier_ensemble_CV(classifiers, X_data, y_data, clf_params=None, cv_splits=10,
                                            random_state=21, return_trained_classifiers=True, verbose=0,
                                            prtParam=0):
    """
    Cross-validate a list of classifiers and tabulate their accuracy and F1 scores.

    Each classifier is fit on every stratified fold; train accuracy, test accuracy and
    weighted F1 are averaged over the folds. Feature importances are averaged as well for
    classifiers exposing ``feature_importances_``.

    NOTE: the number of returned values depends on ``return_trained_classifiers``
    (three values by default, two when it is False). Callers must unpack accordingly,
    otherwise Python raises "too many values to unpack".

    :param classifiers: List of classifier objects following the scikit-learn estimator API.
    :param X_data: Pandas dataframe containing the training features.
    :param y_data: Pandas series/dataframe containing the training class labels.
    :param clf_params: (Optional) List of parameter dictionaries, one per classifier. If omitted,
                       each classifier's current parameters are used, with its 'random_state'
                       (or 'seed') parameter overridden by ``random_state``.
    :param cv_splits: Integer number of cross-validation splits.
    :param random_state: Seed for reproducibility between executions.
    :param return_trained_classifiers: If not False, every classifier is re-fit on the complete
                                       data and the list of fit classifiers is returned as well.
    :param verbose: 0 for silent operation; 1 to print status and debugging output.
    :param prtParam: 1 to print each classifier before it is cross-validated.
    :return clf_comparison: Dataframe tabulating the cross-validated performance of each classifier.
    :return mean_feature_importances: List with one entry per classifier: the fold-averaged feature
                                      importances, or False when unavailable (or for AdaBoost).
    :return trained_classifiers: (only if return_trained_classifiers is not False) List of
                                 classifiers fit on the complete data.
    :raises ValueError: If ``clf_params`` is given but its length differs from ``classifiers``.
    """
    show_status = verbose == 1

    def _trace(*values):
        # Debugging output is only emitted in verbose mode.
        if show_status:
            print(*values)

    # shuffle=True is required for random_state to take effect; recent scikit-learn
    # releases raise a ValueError when a seed is given without shuffling.
    kfold = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=random_state)

    if clf_params is None:  # construct using each classifier's existing parameter assignment
        clf_params = []
        for clf in classifiers:
            params = clf.get_params()
            if 'random_state' in params:  # assign random state / seed
                params['random_state'] = random_state
            elif 'seed' in params:
                params['seed'] = random_state
            clf_params.append(params)
    elif len(clf_params) != len(classifiers):
        # zip() below would otherwise silently skip the unmatched classifiers.
        raise ValueError('clf_params must contain one parameter dictionary per classifier '
                         '(got %d for %d classifiers)' % (len(clf_params), len(classifiers)))

    classifier_name = []
    train_accuracy_mean = []
    train_accuracy_std = []
    test_accuracy_mean = []
    test_accuracy_std = []
    f1_score_mean = []
    f1_score_std = []
    mean_feature_importances = []
    trained_classifiers = []

    # step through the classifiers for training and scoring with cross-validation
    for clf, params in zip(classifiers, clf_params):
        name = get_clf_name(clf)  # helper defined elsewhere in the project
        classifier_name.append(name)
        if prtParam == 1:
            print(clf)
        if show_status:
            print('\nPerforming Cross-Validation on Classifier %s of %s:'
                  % (len(classifier_name), len(classifiers)))
            print(name)

        # The parameters do not change between folds, so apply them once.
        clf.set_params(**params)
        _trace(clf)
        _trace(params)

        kth_train_accuracy = []
        kth_test_accuracy = []
        kth_test_f1_score = []
        kth_feature_importances = []
        for train, test in kfold.split(X_data, y_data):
            X_train, y_train = X_data.iloc[train], y_data.iloc[train]
            X_test, y_test = X_data.iloc[test], y_data.iloc[test]

            # The original wrapped this call in OneVsOneClassifier(...) and discarded the
            # wrapper, which had no effect; scikit-learn classifiers handle multi-class
            # targets themselves, so the estimator is fit directly.
            clf.fit(X_train, y_train)
            kth_train_accuracy.append(clf.score(X_train, y_train))
            kth_test_accuracy.append(clf.score(X_test, y_test))
            kth_test_f1_score.append(f1_score(y_true=y_test, y_pred=clf.predict(X_test),
                                              average='weighted'))
            if hasattr(clf, 'feature_importances_'):  # some classifiers lack this attribute
                _trace(clf.feature_importances_)
                kth_feature_importances.append(clf.feature_importances_)

        # populate scoring statistics for this classifier (over all cross-validation splits)
        train_accuracy_mean.append(np.mean(kth_train_accuracy))
        train_accuracy_std.append(np.std(kth_train_accuracy))
        test_accuracy_mean.append(np.mean(kth_test_accuracy))
        test_accuracy_std.append(np.std(kth_test_accuracy))
        f1_score_mean.append(np.mean(kth_test_f1_score))
        f1_score_std.append(np.std(kth_test_f1_score))
        _trace(kth_test_f1_score)
        _trace(kth_feature_importances)

        # Mean feature importances, where available. AdaBoost importances are
        # deliberately dropped (their interpretation is not discussed).
        if not kth_feature_importances or isinstance(clf, AdaBoostClassifier):
            mean_feature_importances.append(False)
        else:
            mean_feature_importances.append(np.mean(kth_feature_importances, axis=0))

        # if requested, also export the classifier after fitting on the complete training set
        if return_trained_classifiers is not False:
            clf.fit(X_data, y_data)
            trained_classifiers.append(clf)

    # construct dataframe for comparison of classifiers, enforcing the desired column order
    column_order = ['Classifier Name', 'Mean Train Accuracy',
                    'Train Accuracy Standard Deviation', 'Mean Test Accuracy',
                    'Test Accuracy Standard Deviation', 'Mean Test F1-Score',
                    'F1-Score Standard Deviation']
    clf_comparison = pd.DataFrame({'Classifier Name': classifier_name,
                                   'Mean Train Accuracy': train_accuracy_mean,
                                   'Train Accuracy Standard Deviation': train_accuracy_std,
                                   'Mean Test Accuracy': test_accuracy_mean,
                                   'Test Accuracy Standard Deviation': test_accuracy_std,
                                   'Mean Test F1-Score': f1_score_mean,
                                   'F1-Score Standard Deviation': f1_score_std},
                                  columns=column_order)

    # add trained_classifiers to the function return, if requested, otherwise omit
    if return_trained_classifiers is not False:
        _trace('19')
        _trace(clf_comparison)
        _trace(mean_feature_importances)
        _trace(trained_classifiers)
        return clf_comparison, mean_feature_importances, trained_classifiers  # three values
    _trace('20')
    return clf_comparison, mean_feature_importances  # two values
下面是调用代码,配合上面的附件应该可以帮助您重现错误。数据文件(Dataframe)可以从上面的链接下载,并放在运行代码的目录下。我相信我已经包含了运行代码所需的每个包;如有遗漏,请自行导入。
# Driver script reproducing the error: load the example data, build the
# classifier list, and run the cross-validation helper.

# Split the CSV into the target column ('AgeBin') and the remaining feature columns.
dfage_train = pd.read_csv('Whosebug_Example.csv')
y1 = dfage_train['AgeBin']
X1 = dfage_train
X1 = X1.drop(['AgeBin'], axis=1)
num_jobs=-1 # I'll use all available CPUs when possible
# NOTE(review): LinearSVC is not among the imports shown with this code; it is
# provided by sklearn.svm and must be imported for this list to be built.
Ageclassifier_list = [LogisticRegression(n_jobs=num_jobs, solver='lbfgs'),
RandomForestClassifier(criterion = 'entropy',n_estimators=100, n_jobs=num_jobs),
LinearSVC(class_weight=None,random_state=27,multi_class='ovr')]
# Cast the feature columns to the dtypes used for training.
X1['Pclass'] = X1['Pclass'].astype(int)
X1['isMale'] = X1['isMale'].astype(bool)
X1['Embarked'] = X1['Embarked'].astype(int)
# NOTE: return_trained_classifiers is left at its default (True), so the function
# returns THREE values (comparison table, feature importances, trained classifiers).
# Unpacking into only two names is what raises
# "ValueError: too many values to unpack (expected 2)".
clf_comp_Full_FeatureSet, mean_feature_importances = train_MultiClass_classifier_ensemble_CV(classifiers=Ageclassifier_list, prtParam = 1,
verbose=1,
X_data=X1,
y_data=y1)
错误输出
ValueError: too many values to unpack (expected 2)
您的函数 train_MultiClass_classifier_ensemble_CV 会根据条件返回 2 个或 3 个值。不要这样做,因为把返回值赋给变量时可能会出现数量不匹配的情况。现在它返回了 3 个值,而您只用 2 个变量来接收。解决办法是用三个变量接收返回值,或者在调用时传入 return_trained_classifiers=False。有问题的部分如下:
if return_trained_classifiers is not False:
print('19')
print(clf_comparison)
print(mean_feature_importances)
print(trained_classifiers)
return clf_comparison, mean_feature_importances, trained_classifiers # three here
else:
print('20')
return clf_comparison, mean_feature_importances # two here