具有分类特征的 CatBoost 无法与 Scikit-Learn CalibratedCV 一起使用
CatBoost with categorical features failed to work with Scikit-Learn CalibratedCV
我尝试使用 sklearn 的 CalibratedClassifierCV 来校准 CatBoostClassifier 模型的概率。拟合时运行正常,但使用校准后的模型进行预测时失败。我也尝试过使用 LGBMClassifier,因为它具有相同的 categorical_features 设置,结果运行正常。这个问题有什么解决办法吗?这是我使用的代码:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.calibration import CalibratedClassifierCV
# This import was missing in the original snippet, so the script stopped with
# NameError on make_classification before reaching the reported problem.
from sklearn.datasets import make_classification

# Minimal reproduction: calibrate a prefit CatBoostClassifier that was trained
# with one categorical feature, then predict with the calibrated wrapper.

# Three numeric features from a synthetic binary classification problem.
X, y = make_classification(n_samples=100, n_features=3, n_redundant=0, random_state=42)
X = pd.DataFrame(X, columns=['a', 'b', 'c'])
# Fourth column (index 3) holds integer codes and is declared categorical below.
X['d'] = [1, 2, 3, 4, 5] * 20

model = CatBoostClassifier()
model.fit(X, y, verbose=False, cat_features=[3])

# NOTE: `base_estimator` matches the scikit-learn release shown in the
# traceback; scikit-learn >= 1.2 names this parameter `estimator`.
model_cat = CalibratedClassifierCV(base_estimator=model, cv='prefit')
model_cat.fit(X, y)

# With the library versions from the traceback this call raises CatBoostError:
# CatBoost receives a floating-point numpy array at predict time while the
# model still declares a categorical feature.
model_cat.predict(X)
CatBoostError Traceback (most recent call last)
/tmp/ipykernel_3228/1832915274.py in <module>
----> 1 model_cat.predict(X)
~/anaconda3/lib/python3.8/site-packages/sklearn/calibration.py in predict(self, X)
383 """
384 check_is_fitted(self)
--> 385 return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
386
387 def _more_tags(self):
~/anaconda3/lib/python3.8/site-packages/sklearn/calibration.py in predict_proba(self, X)
360 mean_proba = np.zeros((X.shape[0], len(self.classes_)))
361 for calibrated_classifier in self.calibrated_classifiers_:
--> 362 proba = calibrated_classifier.predict_proba(X)
363 mean_proba += proba
364
~/anaconda3/lib/python3.8/site-packages/sklearn/calibration.py in predict_proba(self, X)
637 n_classes = len(self.classes)
638 pred_method = _get_prediction_method(self.base_estimator)
--> 639 predictions = _compute_predictions(pred_method, X, n_classes)
640
641 label_encoder = LabelEncoder().fit(self.classes)
~/anaconda3/lib/python3.8/site-packages/sklearn/calibration.py in _compute_predictions(pred_method, X, n_classes)
499 (X.shape[0], 1).
500 """
--> 501 predictions = pred_method(X=X)
502 if hasattr(pred_method, '__name__'):
503 method_name = pred_method.__name__
~/anaconda3/lib/python3.8/site-packages/catboost/core.py in predict_proba(self, X, ntree_start, ntree_end, thread_count, verbose, task_type)
4767 with probability for every class for each object.
4768 """
-> 4769 return self._predict(X, 'Probability', ntree_start, ntree_end, thread_count, verbose, 'predict_proba', task_type)
4770
4771
~/anaconda3/lib/python3.8/site-packages/catboost/core.py in _predict(self, data, prediction_type, ntree_start, ntree_end, thread_count, verbose, parent_method_name, task_type)
2175 if verbose is None:
2176 verbose = False
-> 2177 data, data_is_single_object = self._process_predict_input_data(data, parent_method_name, thread_count)
2178 self._validate_prediction_type(prediction_type)
2179
~/anaconda3/lib/python3.8/site-packages/catboost/core.py in _process_predict_input_data(self, data, parent_method_name, thread_count, label)
2155 is_single_object = _is_data_single_object(data)
2156 if not isinstance(data, Pool):
-> 2157 data = Pool(
2158 data=[data] if is_single_object else data,
2159 label=label,
~/anaconda3/lib/python3.8/site-packages/catboost/core.py in __init__(self, data, label, cat_features, text_features, embedding_features, column_description, pairs, delimiter, has_header, ignore_csv_quoting, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count, log_cout, log_cerr)
580 elif isinstance(data, np.ndarray):
581 if (data.dtype.kind == 'f') and (cat_features is not None) and (len(cat_features) > 0):
--> 582 raise CatBoostError(
583 "'data' is numpy array of floating point numerical type, it means no categorical features,"
584 " but 'cat_features' parameter specifies nonzero number of categorical features"
CatBoostError: 'data' is numpy array of floating point numerical type, it means no categorical features, but 'cat_features' parameter specifies nonzero number of categorical features
我不知道问题出在 Scikit-Learn 还是 CatBoost,不过我已经向 CatBoost 提交了这个问题(见此处)。
感谢您的回复。非常感谢。
非常感谢 catboost 和 scikit-learn 的开发者持续改进这两个库,并解决了大家提出的各种问题。将 catboost 和 scikit-learn 都升级到 1.x.x 版本后,这个问题就解决了。
我尝试使用 sklearn 的 CalibratedClassifierCV 来校准 CatBoostClassifier 模型的概率。拟合时运行正常,但使用校准后的模型进行预测时失败。我也尝试过使用 LGBMClassifier,因为它具有相同的 categorical_features 设置,结果运行正常。这个问题有什么解决办法吗?这是我使用的代码:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.calibration import CalibratedClassifierCV
# This import was missing in the original snippet, so the script stopped with
# NameError on make_classification before reaching the reported problem.
from sklearn.datasets import make_classification

# Minimal reproduction: calibrate a prefit CatBoostClassifier that was trained
# with one categorical feature, then predict with the calibrated wrapper.

# Three numeric features from a synthetic binary classification problem.
X, y = make_classification(n_samples=100, n_features=3, n_redundant=0, random_state=42)
X = pd.DataFrame(X, columns=['a', 'b', 'c'])
# Fourth column (index 3) holds integer codes and is declared categorical below.
X['d'] = [1, 2, 3, 4, 5] * 20

model = CatBoostClassifier()
model.fit(X, y, verbose=False, cat_features=[3])

# NOTE: `base_estimator` matches the scikit-learn release shown in the
# traceback; scikit-learn >= 1.2 names this parameter `estimator`.
model_cat = CalibratedClassifierCV(base_estimator=model, cv='prefit')
model_cat.fit(X, y)

# With the library versions from the traceback this call raises CatBoostError:
# CatBoost receives a floating-point numpy array at predict time while the
# model still declares a categorical feature.
model_cat.predict(X)
CatBoostError Traceback (most recent call last)
/tmp/ipykernel_3228/1832915274.py in <module>
----> 1 model_cat.predict(X)
~/anaconda3/lib/python3.8/site-packages/sklearn/calibration.py in predict(self, X)
383 """
384 check_is_fitted(self)
--> 385 return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
386
387 def _more_tags(self):
~/anaconda3/lib/python3.8/site-packages/sklearn/calibration.py in predict_proba(self, X)
360 mean_proba = np.zeros((X.shape[0], len(self.classes_)))
361 for calibrated_classifier in self.calibrated_classifiers_:
--> 362 proba = calibrated_classifier.predict_proba(X)
363 mean_proba += proba
364
~/anaconda3/lib/python3.8/site-packages/sklearn/calibration.py in predict_proba(self, X)
637 n_classes = len(self.classes)
638 pred_method = _get_prediction_method(self.base_estimator)
--> 639 predictions = _compute_predictions(pred_method, X, n_classes)
640
641 label_encoder = LabelEncoder().fit(self.classes)
~/anaconda3/lib/python3.8/site-packages/sklearn/calibration.py in _compute_predictions(pred_method, X, n_classes)
499 (X.shape[0], 1).
500 """
--> 501 predictions = pred_method(X=X)
502 if hasattr(pred_method, '__name__'):
503 method_name = pred_method.__name__
~/anaconda3/lib/python3.8/site-packages/catboost/core.py in predict_proba(self, X, ntree_start, ntree_end, thread_count, verbose, task_type)
4767 with probability for every class for each object.
4768 """
-> 4769 return self._predict(X, 'Probability', ntree_start, ntree_end, thread_count, verbose, 'predict_proba', task_type)
4770
4771
~/anaconda3/lib/python3.8/site-packages/catboost/core.py in _predict(self, data, prediction_type, ntree_start, ntree_end, thread_count, verbose, parent_method_name, task_type)
2175 if verbose is None:
2176 verbose = False
-> 2177 data, data_is_single_object = self._process_predict_input_data(data, parent_method_name, thread_count)
2178 self._validate_prediction_type(prediction_type)
2179
~/anaconda3/lib/python3.8/site-packages/catboost/core.py in _process_predict_input_data(self, data, parent_method_name, thread_count, label)
2155 is_single_object = _is_data_single_object(data)
2156 if not isinstance(data, Pool):
-> 2157 data = Pool(
2158 data=[data] if is_single_object else data,
2159 label=label,
~/anaconda3/lib/python3.8/site-packages/catboost/core.py in __init__(self, data, label, cat_features, text_features, embedding_features, column_description, pairs, delimiter, has_header, ignore_csv_quoting, weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, feature_names, thread_count, log_cout, log_cerr)
580 elif isinstance(data, np.ndarray):
581 if (data.dtype.kind == 'f') and (cat_features is not None) and (len(cat_features) > 0):
--> 582 raise CatBoostError(
583 "'data' is numpy array of floating point numerical type, it means no categorical features,"
584 " but 'cat_features' parameter specifies nonzero number of categorical features"
CatBoostError: 'data' is numpy array of floating point numerical type, it means no categorical features, but 'cat_features' parameter specifies nonzero number of categorical features
我不知道问题出在 Scikit-Learn 还是 CatBoost,不过我已经向 CatBoost 提交了这个问题(见此处)。
感谢您的回复。非常感谢。
非常感谢 catboost 和 scikit-learn 的开发者持续改进这两个库,并解决了大家提出的各种问题。将 catboost 和 scikit-learn 都升级到 1.x.x 版本后,这个问题就解决了。