RandomizedSearchCV:所有估计器都不适合
RandomizedSearchCV: All estimators failed to fit
我目前正在参加“法国汽车索赔数据集 freMTPL2freq”Kaggle 竞赛 (https://www.kaggle.com/floser/french-motor-claims-datasets-fremtpl2freq)。不幸的是,每当我使用 RandomizedSearchCV 时,我都会收到“NotFittedError: All estimators failed to fit”错误,我无法弄清楚为什么会这样。
非常感谢任何帮助。
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
data_freq = pd.read_csv('freMTPL2freq.csv')
data_freq['Area'] = data_freq['Area'].str.replace('\'','')
data_freq['VehBrand'] = data_freq['VehBrand'].str.replace('\'','')
data_freq['VehGas'] = data_freq['VehGas'].str.replace('\'','')
data_freq['Region'] = data_freq['Region'].str.replace('\'','')
data_freq['frequency'] = data_freq['ClaimNb'] / data_freq['Exposure']
y = data_freq['frequency']
X = data_freq.drop(['frequency', 'ClaimNb', 'IDpol'], axis = 1)
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, shuffle = True, random_state = 42)
pt_columns = ['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']
cat_columns = ['Area', 'Region', 'VehBrand', 'VehGas']
from xgboost import XGBRegressor
ct = ColumnTransformer([('pt', 'passthrough', pt_columns),
('ohe', OneHotEncoder(), cat_columns)])
pipe_xgbr = Pipeline([('cf_trans', ct),
('ssc', StandardScaler(with_mean = False)),
('xgb_regressor', XGBRegressor())
])
param = {'xgb_regressor__n_estimators':[3, 5],
'xgb_regressor__max_depth':[3, 5, 7],
'xgb_regressor__learning_rate':[0.1, 0.5],
'xgb_regressor__colsample_bytree':[0.5, 0.8],
'xgb_regressor__subsample':[0.5, 0.8]
}
rscv = RandomizedSearchCV(pipe_xgbr, param_distributions = param, n_iter = 2, scoring = mean_squared_error, n_jobs = -1, cv = 5, error_score = 'raise')
rscv.fit(X_train, y_train, xgbr_regressor__sample_weight = X_train['Exposure'])
原始数据帧的前五行 data_freq 如下所示:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region
0 1.0 1 0.10 D 5 0 55 50 B12 Regular 1217 R82
1 3.0 1 0.77 D 5 0 55 50 B12 Regular 1217 R82
2 5.0 1 0.75 B 6 2 52 50 B12 Diesel 54 R22
3 10.0 1 0.09 B 7 0 46 50 B12 Diesel 76 R72
4 11.0 1 0.84 B 7 0 46 50 B12 Diesel 76 R72
我得到的错误如下:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
r = call_item()
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 608, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 340, in fit
fit_params_steps = self._check_fit_params(**fit_params)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 261, in _check_fit_params
fit_params_steps[step][param] = pval
KeyError: 'xgbr_regressor'
"""
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-68-0c1886d1e985> in <module>
----> 1 rscv.fit(X_train, y_train, xgbr_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
KeyError: 'xgbr_regressor'
我也试过 运行 没有 sample_weight 参数。在这种情况下,错误更改为:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
r = call_item()
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 608, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 625, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer, error_score)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
scores = scorer(estimator, X_test, y_test)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 74, in inner_f
return f(**kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 336, in mean_squared_error
y_true, y_pred, multioutput)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 88, in _check_reg_targets
check_consistent_length(y_true, y_pred)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 316, in check_consistent_length
lengths = [_num_samples(X) for X in arrays if X is not None]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 316, in <listcomp>
lengths = [_num_samples(X) for X in arrays if X is not None]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 249, in _num_samples
raise TypeError(message)
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
"""
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-69-a9be9cc5df4a> in <module>
----> 1 rscv.fit(X_train, y_train)#, xgbr_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
设置 verbose = 10 和 n_jobs = 1 时显示以下错误消息:
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START xgb_regressor__colsample_bytree=0.5, xgb_regressor__learning_rate=0.5, xgb_regressor__max_depth=5, xgb_regressor__n_estimators=5, xgb_regressor__subsample=0.5
C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py:72: FutureWarning: Pass sample_weight=406477 1.0
393150 0.0
252885 0.0
260652 0.0
661256 0.0
...
154663 0.0
398414 0.0
42890 0.0
640774 0.0
114446 0.0
Name: frequency, Length: 108482, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
"will result in an error", FutureWarning)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-84-74435f74c470> in <module>
----> 1 rscv.fit(X_train, y_train, xgb_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
623
624 fit_time = time.time() - start_time
--> 625 test_scores = _score(estimator, X_test, y_test, scorer, error_score)
626 score_time = time.time() - start_time - fit_time
627 if return_train_score:
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, error_score)
685 scores = scorer(estimator, X_test)
686 else:
--> 687 scores = scorer(estimator, X_test, y_test)
688 except Exception:
689 if error_score == 'raise':
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
72 "will result in an error", FutureWarning)
73 kwargs.update(zip(sig.parameters, args))
---> 74 return f(**kwargs)
75 return inner_f
76
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_squared_error(y_true, y_pred, sample_weight, multioutput, squared)
334 """
335 y_type, y_true, y_pred, multioutput = _check_reg_targets(
--> 336 y_true, y_pred, multioutput)
337 check_consistent_length(y_true, y_pred, sample_weight)
338 output_errors = np.average((y_true - y_pred) ** 2, axis=0,
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
86 the dtype argument passed to check_array.
87 """
---> 88 check_consistent_length(y_true, y_pred)
89 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
90 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
314 """
315
--> 316 lengths = [_num_samples(X) for X in arrays if X is not None]
317 uniques = np.unique(lengths)
318 if len(uniques) > 1:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in <listcomp>(.0)
314 """
315
--> 316 lengths = [_num_samples(X) for X in arrays if X is not None]
317 uniques = np.unique(lengths)
318 if len(uniques) > 1:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in _num_samples(x)
247 if hasattr(x, 'fit') and callable(x.fit):
248 # Don't get num_samples from an ensembles length!
--> 249 raise TypeError(message)
250
251 if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
根据您的错误消息,KeyError: 'xgbr_regressor'
代码无法在您的管道中找到密钥 xgbr_regressor
。在您的管道中,您定义了 xgb_regressor
:
pipe_xgbr = Pipeline(
[('cf_trans', ct),
('ssc', StandardScaler(with_mean = False)),
('xgb_regressor', XGBRegressor())])
但是当你试图适应时,你会通过引用 xgbr_regressor
来调用它,这就是抛出 KeyError 的原因:
rscv.fit(X_train, y_train, xgbr_regressor__sample_weight=X_train['Exposure'])
因此,您必须更改上面的行以将 xgbr_regressor__sample_weight
换成 xgb_regressor__sample_weight
,这样应该可以消除该错误。
哇,那真是一团乱麻,但我想我终于找到了。您设置 scoring=mean_squared_error
,应该改为使用 scoring="neg_mean_squared_error"
。
度量函数mean_squared_error
具有签名(y_true, y_pred, *, <kwargs>)
,而使用字符串"neg_mean_squared_error"
获得的得分手具有签名(estimator, X_test, y_test)
。所以在追溯中,你看到
--> 687 scores = scorer(estimator, X_test, y_test)
它用 y_true=estimator
、y_test=X_test
和 sample_weight=y_test
调用 mean_squared_error
(第一个 kwarg,因此是关于将关键字参数指定为位置的 FutureWarning)。深入回溯,我们看到检查 y_true
和 y_pred
的形状是否兼容,但它认为前者是您的管道对象(因此是最终的错误消息)!
我目前正在参加“法国汽车索赔数据集 freMTPL2freq”Kaggle 竞赛 (https://www.kaggle.com/floser/french-motor-claims-datasets-fremtpl2freq)。不幸的是,每当我使用 RandomizedSearchCV 时,我都会收到“NotFittedError: All estimators failed to fit”错误,我无法弄清楚为什么会这样。 非常感谢任何帮助。
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
data_freq = pd.read_csv('freMTPL2freq.csv')
data_freq['Area'] = data_freq['Area'].str.replace('\'','')
data_freq['VehBrand'] = data_freq['VehBrand'].str.replace('\'','')
data_freq['VehGas'] = data_freq['VehGas'].str.replace('\'','')
data_freq['Region'] = data_freq['Region'].str.replace('\'','')
data_freq['frequency'] = data_freq['ClaimNb'] / data_freq['Exposure']
y = data_freq['frequency']
X = data_freq.drop(['frequency', 'ClaimNb', 'IDpol'], axis = 1)
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, shuffle = True, random_state = 42)
pt_columns = ['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']
cat_columns = ['Area', 'Region', 'VehBrand', 'VehGas']
from xgboost import XGBRegressor
ct = ColumnTransformer([('pt', 'passthrough', pt_columns),
('ohe', OneHotEncoder(), cat_columns)])
pipe_xgbr = Pipeline([('cf_trans', ct),
('ssc', StandardScaler(with_mean = False)),
('xgb_regressor', XGBRegressor())
])
param = {'xgb_regressor__n_estimators':[3, 5],
'xgb_regressor__max_depth':[3, 5, 7],
'xgb_regressor__learning_rate':[0.1, 0.5],
'xgb_regressor__colsample_bytree':[0.5, 0.8],
'xgb_regressor__subsample':[0.5, 0.8]
}
rscv = RandomizedSearchCV(pipe_xgbr, param_distributions = param, n_iter = 2, scoring = mean_squared_error, n_jobs = -1, cv = 5, error_score = 'raise')
rscv.fit(X_train, y_train, xgbr_regressor__sample_weight = X_train['Exposure'])
原始数据帧的前五行 data_freq 如下所示:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region
0 1.0 1 0.10 D 5 0 55 50 B12 Regular 1217 R82
1 3.0 1 0.77 D 5 0 55 50 B12 Regular 1217 R82
2 5.0 1 0.75 B 6 2 52 50 B12 Diesel 54 R22
3 10.0 1 0.09 B 7 0 46 50 B12 Diesel 76 R72
4 11.0 1 0.84 B 7 0 46 50 B12 Diesel 76 R72
我得到的错误如下:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
r = call_item()
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 608, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 340, in fit
fit_params_steps = self._check_fit_params(**fit_params)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 261, in _check_fit_params
fit_params_steps[step][param] = pval
KeyError: 'xgbr_regressor'
"""
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-68-0c1886d1e985> in <module>
----> 1 rscv.fit(X_train, y_train, xgbr_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
KeyError: 'xgbr_regressor'
我也试过 运行 没有 sample_weight 参数。在这种情况下,错误更改为:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
r = call_item()
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 608, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 625, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer, error_score)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
scores = scorer(estimator, X_test, y_test)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 74, in inner_f
return f(**kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 336, in mean_squared_error
y_true, y_pred, multioutput)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 88, in _check_reg_targets
check_consistent_length(y_true, y_pred)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 316, in check_consistent_length
lengths = [_num_samples(X) for X in arrays if X is not None]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 316, in <listcomp>
lengths = [_num_samples(X) for X in arrays if X is not None]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 249, in _num_samples
raise TypeError(message)
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
"""
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-69-a9be9cc5df4a> in <module>
----> 1 rscv.fit(X_train, y_train)#, xgbr_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
设置 verbose = 10 和 n_jobs = 1 时显示以下错误消息:
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START xgb_regressor__colsample_bytree=0.5, xgb_regressor__learning_rate=0.5, xgb_regressor__max_depth=5, xgb_regressor__n_estimators=5, xgb_regressor__subsample=0.5
C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py:72: FutureWarning: Pass sample_weight=406477 1.0
393150 0.0
252885 0.0
260652 0.0
661256 0.0
...
154663 0.0
398414 0.0
42890 0.0
640774 0.0
114446 0.0
Name: frequency, Length: 108482, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
"will result in an error", FutureWarning)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-84-74435f74c470> in <module>
----> 1 rscv.fit(X_train, y_train, xgb_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
623
624 fit_time = time.time() - start_time
--> 625 test_scores = _score(estimator, X_test, y_test, scorer, error_score)
626 score_time = time.time() - start_time - fit_time
627 if return_train_score:
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, error_score)
685 scores = scorer(estimator, X_test)
686 else:
--> 687 scores = scorer(estimator, X_test, y_test)
688 except Exception:
689 if error_score == 'raise':
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
72 "will result in an error", FutureWarning)
73 kwargs.update(zip(sig.parameters, args))
---> 74 return f(**kwargs)
75 return inner_f
76
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_squared_error(y_true, y_pred, sample_weight, multioutput, squared)
334 """
335 y_type, y_true, y_pred, multioutput = _check_reg_targets(
--> 336 y_true, y_pred, multioutput)
337 check_consistent_length(y_true, y_pred, sample_weight)
338 output_errors = np.average((y_true - y_pred) ** 2, axis=0,
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
86 the dtype argument passed to check_array.
87 """
---> 88 check_consistent_length(y_true, y_pred)
89 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
90 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
314 """
315
--> 316 lengths = [_num_samples(X) for X in arrays if X is not None]
317 uniques = np.unique(lengths)
318 if len(uniques) > 1:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in <listcomp>(.0)
314 """
315
--> 316 lengths = [_num_samples(X) for X in arrays if X is not None]
317 uniques = np.unique(lengths)
318 if len(uniques) > 1:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in _num_samples(x)
247 if hasattr(x, 'fit') and callable(x.fit):
248 # Don't get num_samples from an ensembles length!
--> 249 raise TypeError(message)
250
251 if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
根据您的错误消息,KeyError: 'xgbr_regressor'
代码无法在您的管道中找到密钥 xgbr_regressor
。在您的管道中,您定义了 xgb_regressor
:
pipe_xgbr = Pipeline(
[('cf_trans', ct),
('ssc', StandardScaler(with_mean = False)),
('xgb_regressor', XGBRegressor())])
但是当你试图适应时,你会通过引用 xgbr_regressor
来调用它,这就是抛出 KeyError 的原因:
rscv.fit(X_train, y_train, xgbr_regressor__sample_weight=X_train['Exposure'])
因此,您必须更改上面的行以将 xgbr_regressor__sample_weight
换成 xgb_regressor__sample_weight
,这样应该可以消除该错误。
哇,那真是一团乱麻,但我想我终于找到了。您设置 scoring=mean_squared_error
,应该改为使用 scoring="neg_mean_squared_error"
。
度量函数mean_squared_error
具有签名(y_true, y_pred, *, <kwargs>)
,而使用字符串"neg_mean_squared_error"
获得的得分手具有签名(estimator, X_test, y_test)
。所以在追溯中,你看到
--> 687 scores = scorer(estimator, X_test, y_test)
它用 y_true=estimator
、y_test=X_test
和 sample_weight=y_test
调用 mean_squared_error
(第一个 kwarg,因此是关于将关键字参数指定为位置的 FutureWarning)。深入回溯,我们看到检查 y_true
和 y_pred
的形状是否兼容,但它认为前者是您的管道对象(因此是最终的错误消息)!