SequentialFeatureSelector ValueError: continuous format is not supported
SequentialFeatureSelector ValueError: continuous format is not supported
我是机器学习的新手,正在尝试了解 SequentialFeatureSelector
来自 sklearn 的概念。我正在使用 Anaconda 和 Jupyter notebook 进行 poc。我已经导入
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
包。默认情况下 mlxtend 包不是 Anaconda 的一部分,然后我通过 pip install mlxtend 命令安装。
我为此 poc 使用了 sklearn 波士顿住房数据集,并执行了以下代码。在安装 sfs 时,出现错误。
如何解决这个错误?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.metrics import roc_curve, roc_auc_score
%matplotlib inline
data = load_boston()
print(data.keys())
X = pd.DataFrame(data.data)
X.columns = data.feature_names
y = data.target
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
sfs1=sfs(RandomForestRegressor(n_jobs=1),
k_features=7,
forward=True,
floating=False,
verbose=3,
scoring='roc_auc',
cv=3
)
sfs1=sfs1.fit(X_train,y_train)
错误
ValueError Traceback (most recent call last)
<ipython-input-77-96b29660189d> in <module>
1 #sfs1.fit(X_train,y_train)
2 X_train.shape
----> 3 sfs2=sfs1.fit(X_train,y_train)
C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in fit(self, X, y, custom_feature_names, **fit_params)
371 X=X_,
372 y=y,
--> 373 **fit_params
374 )
375 else:
C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in _inclusion(self, orig_set, subset, X, y, ignore_feature, **fit_params)
528 tuple(subset | {feature}),
529 **fit_params)
--> 530 for feature in remaining
531 if feature != ignore_feature)
532
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in _calc_score(selector, X, y, indices, **fit_params)
32 n_jobs=1,
33 pre_dispatch=selector.pre_dispatch,
---> 34 fit_params=fit_params)
35 else:
36 selector.est_.fit(X[:, indices], y, **fit_params)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
566 fit_time = time.time() - start_time
567 # _score will return dict if is_multimetric is True
--> 568 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
569 score_time = time.time() - start_time - fit_time
570 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
603 """
604 if is_multimetric:
--> 605 return _multimetric_score(estimator, X_test, y_test, scorer)
606 else:
607 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
633 score = scorer(estimator, X_test)
634 else:
--> 635 score = scorer(estimator, X_test, y_test)
636
637 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, clf, X, y, sample_weight)
174 y_type = type_of_target(y)
175 if y_type not in ("binary", "multilabel-indicator"):
--> 176 raise ValueError("{0} format is not supported".format(y_type))
177
178 if is_regressor(clf):
ValueError: continuous format is not supported
仔细观察跟踪,您会发现错误不是由 mlxtend
引发的 - 它是由 scikit-learn 的 scorer.py
模块引发的,这是因为roc_auc_score
您使用的仅适用于分类问题;对于回归问题,比如你这里的问题,它是 meaninglesss.
来自docs(强调已添加):
sklearn.metrics.roc_auc_score
(y_true, y_score, average=’macro’, sample_weight=None, max_fpr=None)
Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.
Note: this implementation is restricted to the binary classification task or multilabel classification task in label indicator format.
另见 scikit-learn list of metrics 每种问题,您可以在其中确认 roc_auc
不适合回归。
因此,在您的 sfs
定义中将其更改为
scoring='neg_mean_squared_error'
就像 SequentialFeatureSelector
的文档 example 中那样,或者任何其他适合回归的指标,你会没事的。
我是机器学习的新手,正在尝试了解 SequentialFeatureSelector 来自 sklearn 的概念。我正在使用 Anaconda 和 Jupyter notebook 进行 poc。我已经导入
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
包。默认情况下 mlxtend 包不是 Anaconda 的一部分,然后我通过 pip install mlxtend 命令安装。
我为此 poc 使用了 sklearn 波士顿住房数据集,并执行了以下代码。在安装 sfs 时,出现错误。
如何解决这个错误?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.metrics import roc_curve, roc_auc_score
%matplotlib inline
data = load_boston()
print(data.keys())
X = pd.DataFrame(data.data)
X.columns = data.feature_names
y = data.target
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
sfs1=sfs(RandomForestRegressor(n_jobs=1),
k_features=7,
forward=True,
floating=False,
verbose=3,
scoring='roc_auc',
cv=3
)
sfs1=sfs1.fit(X_train,y_train)
错误
ValueError Traceback (most recent call last)
<ipython-input-77-96b29660189d> in <module>
1 #sfs1.fit(X_train,y_train)
2 X_train.shape
----> 3 sfs2=sfs1.fit(X_train,y_train)
C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in fit(self, X, y, custom_feature_names, **fit_params)
371 X=X_,
372 y=y,
--> 373 **fit_params
374 )
375 else:
C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in _inclusion(self, orig_set, subset, X, y, ignore_feature, **fit_params)
528 tuple(subset | {feature}),
529 **fit_params)
--> 530 for feature in remaining
531 if feature != ignore_feature)
532
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\mlxtend\feature_selection\sequential_feature_selector.py in _calc_score(selector, X, y, indices, **fit_params)
32 n_jobs=1,
33 pre_dispatch=selector.pre_dispatch,
---> 34 fit_params=fit_params)
35 else:
36 selector.est_.fit(X[:, indices], y, **fit_params)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
238 return_times=True, return_estimator=return_estimator,
239 error_score=error_score)
--> 240 for train, test in cv.split(X, y, groups))
241
242 zipped_scores = list(zip(*scores))
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
566 fit_time = time.time() - start_time
567 # _score will return dict if is_multimetric is True
--> 568 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
569 score_time = time.time() - start_time - fit_time
570 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
603 """
604 if is_multimetric:
--> 605 return _multimetric_score(estimator, X_test, y_test, scorer)
606 else:
607 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
633 score = scorer(estimator, X_test)
634 else:
--> 635 score = scorer(estimator, X_test, y_test)
636
637 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, clf, X, y, sample_weight)
174 y_type = type_of_target(y)
175 if y_type not in ("binary", "multilabel-indicator"):
--> 176 raise ValueError("{0} format is not supported".format(y_type))
177
178 if is_regressor(clf):
ValueError: continuous format is not supported
仔细观察跟踪,您会发现错误不是由 mlxtend
引发的 - 它是由 scikit-learn 的 scorer.py
模块引发的,这是因为roc_auc_score
您使用的仅适用于分类问题;对于回归问题,比如你这里的问题,它是 meaninglesss.
来自docs(强调已添加):
sklearn.metrics.roc_auc_score
(y_true, y_score, average=’macro’, sample_weight=None, max_fpr=None)Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.
Note: this implementation is restricted to the binary classification task or multilabel classification task in label indicator format.
另见 scikit-learn list of metrics 每种问题,您可以在其中确认 roc_auc
不适合回归。
因此,在您的 sfs
定义中将其更改为
scoring='neg_mean_squared_error'
就像 SequentialFeatureSelector
的文档 example 中那样,或者任何其他适合回归的指标,你会没事的。