使用 Pipeline 运行 GridSearchCV 时出错
Error when running GridSearchCV with a Pipeline
我想创建一个涵盖模型训练全部步骤的流水线(Pipeline)结构。导入相关库并完成必要的定义后,我创建了如下结构进行实验。我使用的是电信客户流失数据集。
# Column names that are passed as the third element of the 'encoder' step below.
ohe_f =["gender","SeniorCitizen","Partner","Dependents","PhoneService","MultipleLines",
"InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport",
"StreamingTV","StreamingMovies","Contract","PaperlessBilling","PaymentMethod"]
# Hold out 20% of the data as a test set, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.2,
stratify=y,
random_state=11)
# Pipeline with four named steps: 'smote', 'scaler', 'encoder', 'classifier'.
# NOTE(review): the 'encoder' entry has three elements (name, estimator, ohe_f)
# while every other step has two. The traceback that follows reports
# "dictionary update sequence element #2 has length 3; 2 is required", which
# matches this entry (index 2 of steps).
# NOTE(review): SMOTE presumably comes from imbalanced-learn, while the
# traceback shows sklearn's pipeline.py in use -- confirm that this Pipeline
# class accepts a sampler step.
pipeline = Pipeline(steps = [['smote', SMOTE(random_state=11)],
['scaler', MinMaxScaler()],
['encoder', OneHotEncoder(),ohe_f],
['classifier', LogisticRegression(random_state=11)]])
# 3-fold stratified cross-validation with shuffling and a fixed seed.
stratified_kfold = StratifiedKFold(n_splits=3,
shuffle=True,
random_state=11)
# Candidate values for parameter C of the step named 'classifier'
# (addressed with the '<step>__<param>' key).
param_grid = {'classifier__C':[0.01, 0.1, 1, 10, 100]}
# Grid search over param_grid on the pipeline, scored with 'roc_auc' and
# cross-validated with stratified_kfold; n_jobs=-1 runs the fits in parallel
# worker processes (the traceback below originates in a joblib/loky worker).
grid_search = GridSearchCV(estimator=pipeline,
param_grid=param_grid,
scoring='roc_auc',
cv=stratified_kfold,
n_jobs=-1)
当我开始训练模型时,出现以下错误。我该如何解决?
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 436, in _process_worker
r = call_item()
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 288, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 668, in _fit_and_score
estimator = estimator.set_params(**cloned_parameters)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\pipeline.py", line 188, in set_params
self._set_params("steps", **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py", line 54, in _set_params
super().set_params(**params)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\base.py", line 239, in set_params
valid_params = self.get_params(deep=True)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\pipeline.py", line 167, in get_params
return self._get_params("steps", deep=deep)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py", line 33, in _get_params
out.update(estimators)
ValueError: dictionary update sequence element #2 has length 3; 2 is required
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_1388/1962240236.py in <module>
23 n_jobs=-1)
24
---> 25 grid_search.fit(X_train, y_train)
26 cv_score = grid_search.best_score_
27 test_score = grid_search.score(X_test, y_test)
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
889 return results
890
--> 891 self._run_search(evaluate_candidates)
892
893 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1390 def _run_search(self, evaluate_candidates):
1391 """Search all candidates in param_grid"""
-> 1392 evaluate_candidates(ParameterGrid(self.param_grid))
1393
1394
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
836 )
837
--> 838 out = parallel(
839 delayed(_fit_and_score)(
840 clone(base_estimator),
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1054
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
443 raise CancelledError()
444 elif self._state == FINISHED:
--> 445 return self.__get_result()
446 else:
447 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
388 if self._exception:
389 try:
--> 390 raise self._exception
391 finally:
392 # Break a reference cycle with the exception in self._exception
ValueError: dictionary update sequence element #2 has length 3; 2 is required
报错的原因是:Pipeline 的每个步骤都必须是(名称, 估计器)二元组,而 'encoder' 步骤包含了三个元素。您需要将预处理分成两部分:一部分处理数值特征(使用最小-最大缩放器 MinMaxScaler),另一部分处理分类特征(使用独热编码器 OneHotEncoder)。您可以使用 scikit-learn 中的 ColumnTransformer 类来实现:
https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
我想创建一个涵盖模型训练全部步骤的流水线(Pipeline)结构。导入相关库并完成必要的定义后,我创建了如下结构进行实验。我使用的是电信客户流失数据集。
# Column names that are passed as the third element of the 'encoder' step below.
ohe_f =["gender","SeniorCitizen","Partner","Dependents","PhoneService","MultipleLines",
"InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport",
"StreamingTV","StreamingMovies","Contract","PaperlessBilling","PaymentMethod"]
# Hold out 20% of the data as a test set, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.2,
stratify=y,
random_state=11)
# Pipeline with four named steps: 'smote', 'scaler', 'encoder', 'classifier'.
# NOTE(review): the 'encoder' entry has three elements (name, estimator, ohe_f)
# while every other step has two. The traceback that follows reports
# "dictionary update sequence element #2 has length 3; 2 is required", which
# matches this entry (index 2 of steps).
# NOTE(review): SMOTE presumably comes from imbalanced-learn, while the
# traceback shows sklearn's pipeline.py in use -- confirm that this Pipeline
# class accepts a sampler step.
pipeline = Pipeline(steps = [['smote', SMOTE(random_state=11)],
['scaler', MinMaxScaler()],
['encoder', OneHotEncoder(),ohe_f],
['classifier', LogisticRegression(random_state=11)]])
# 3-fold stratified cross-validation with shuffling and a fixed seed.
stratified_kfold = StratifiedKFold(n_splits=3,
shuffle=True,
random_state=11)
# Candidate values for parameter C of the step named 'classifier'
# (addressed with the '<step>__<param>' key).
param_grid = {'classifier__C':[0.01, 0.1, 1, 10, 100]}
# Grid search over param_grid on the pipeline, scored with 'roc_auc' and
# cross-validated with stratified_kfold; n_jobs=-1 runs the fits in parallel
# worker processes (the traceback below originates in a joblib/loky worker).
grid_search = GridSearchCV(estimator=pipeline,
param_grid=param_grid,
scoring='roc_auc',
cv=stratified_kfold,
n_jobs=-1)
当我开始训练模型时,出现以下错误。我该如何解决?
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 436, in _process_worker
r = call_item()
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 288, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 668, in _fit_and_score
estimator = estimator.set_params(**cloned_parameters)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\pipeline.py", line 188, in set_params
self._set_params("steps", **kwargs)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py", line 54, in _set_params
super().set_params(**params)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\base.py", line 239, in set_params
valid_params = self.get_params(deep=True)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\pipeline.py", line 167, in get_params
return self._get_params("steps", deep=deep)
File "C:\Users\burak\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py", line 33, in _get_params
out.update(estimators)
ValueError: dictionary update sequence element #2 has length 3; 2 is required
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_1388/1962240236.py in <module>
23 n_jobs=-1)
24
---> 25 grid_search.fit(X_train, y_train)
26 cv_score = grid_search.best_score_
27 test_score = grid_search.score(X_test, y_test)
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
889 return results
890
--> 891 self._run_search(evaluate_candidates)
892
893 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1390 def _run_search(self, evaluate_candidates):
1391 """Search all candidates in param_grid"""
-> 1392 evaluate_candidates(ParameterGrid(self.param_grid))
1393
1394
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
836 )
837
--> 838 out = parallel(
839 delayed(_fit_and_score)(
840 clone(base_estimator),
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1054
1055 with self._backend.retrieval_context():
-> 1056 self.retrieve()
1057 # Make sure that we get a last message telling us we are done
1058 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
933 try:
934 if getattr(self._backend, 'supports_timeout', False):
--> 935 self._output.extend(job.get(timeout=self.timeout))
936 else:
937 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
443 raise CancelledError()
444 elif self._state == FINISHED:
--> 445 return self.__get_result()
446 else:
447 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
388 if self._exception:
389 try:
--> 390 raise self._exception
391 finally:
392 # Break a reference cycle with the exception in self._exception
ValueError: dictionary update sequence element #2 has length 3; 2 is required
报错的原因是:Pipeline 的每个步骤都必须是(名称, 估计器)二元组,而 'encoder' 步骤包含了三个元素。您需要将预处理分成两部分:一部分处理数值特征(使用最小-最大缩放器 MinMaxScaler),另一部分处理分类特征(使用独热编码器 OneHotEncoder)。您可以使用 scikit-learn 中的 ColumnTransformer 类来实现:
https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html