如何使用包含 OneHotEncode 和 LightGBM 的管道预测验证集上的目标?
How to predict target on a validation set with a Pipeline containing OneHotEncode and LightGBM?
我正在尝试用 sklearn 和 LightGBM 处理同时包含数值特征和分类特征的数据。我创建了一个包含两个步骤的管道:
- 第 1 步:使用 ColumnTransformer 进行数据预处理(分类变量使用 OneHotEncoder 编码)。
- 第 2 步:使用 LightGBM 进行实际的模型训练。
该管道可以正常训练我的模型,但是当我想用这个模型对测试数据集进行预测时,出现了错误消息。看起来预处理没有被应用到这个测试数据集上,但我不明白为什么。在我从网上找到的教程中,这种做法似乎是可行的,只不过它们用的是 sklearn 自带的分类器。
这是我的代码:
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# NOTE: X_build / y_build (training split) and X_valid / y_valid (validation
# split) are not defined in this snippet; they are assumed to exist already.

# Numerical features: impute missing values with the median, then standardize.
numerical_features = ['Distance']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical features: replace missing values with the constant 'missing',
# then one-hot encode.
categorical_features = ['Travel', 'Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # The encoder is left at its default settings on purpose: this is the
    # configuration that raises "Found unknown categories ... during transform"
    # when the validation data contains categories not seen during fit.
    ('onehot', OneHotEncoder())])

# Build the preprocessor with ColumnTransformer: each transformer is applied
# only to its own list of columns.
preprocess = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

# Build a pipeline: preprocessing followed by the LightGBM classifier.
clf = Pipeline(steps=[('preprocess', preprocess),
                      ('classifier', LGBMClassifier(random_state=17))])

# Fit the preprocessing steps and the classifier on the training split.
clf.fit(X_build, y_build)

# Scores
print("model training score (clf internal scoring function with standards parameters): {0}".format(clf.score(X_build, y_build)))  # returns a score
print("Score: %f" % clf.score(X_valid, y_valid))  # Here is the problem
这里是错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-116-70bf0e236540> in <module>()
----> 1 print("Score: %f" % clf.predict(X_valid))
~/anaconda3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
116
117 # lambda, but not partial, allows help() to work with update_wrapper
--> 118 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
119 # update the docstring of the returned function
120 update_wrapper(out, self.fn)
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
329 for name, transform in self.steps[:-1]:
330 if transform is not None:
--> 331 Xt = transform.transform(Xt)
332 return self.steps[-1][-1].predict(Xt, **predict_params)
333
~/anaconda3/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
491
492 X = _check_X(X)
--> 493 Xs = self._fit_transform(X, None, _transform_one, fitted=True)
494 self._validate_output(Xs)
495
~/anaconda3/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
391 _get_column(X, column), y, weight)
392 for _, trans, column, weight in self._iter(
--> 393 fitted=fitted, replace_strings=True))
394 except ValueError as e:
395 if "Expected 2D array, got 1D array instead" in str(e):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
918 self._iterating = self._original_iterator is not None
919
--> 920 while self.dispatch_one_batch(iterator):
921 pass
922
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _transform_one(transformer, X, y, weight, **fit_params)
603
604 def _transform_one(transformer, X, y, weight, **fit_params):
--> 605 res = transformer.transform(X)
606 # if we have a weight for this transformer, multiply output
607 if weight is None:
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _transform(self, X)
449 for name, transform in self.steps:
450 if transform is not None:
--> 451 Xt = transform.transform(Xt)
452 return Xt
453
~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in transform(self, X)
611 copy=True)
612 else:
--> 613 return self._transform_new(X)
614
615 def inverse_transform(self, X):
~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform_new(self, X)
572 n_samples, n_features = X.shape
573
--> 574 X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
575
576 mask = X_mask.ravel()
~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform(self, X, handle_unknown)
105 msg = ("Found unknown categories {0} in column {1}"
106 " during transform".format(diff, i))
--> 107 raise ValueError(msg)
108 else:
109 # Set the problematic rows to an acceptable value and
ValueError: Found unknown categories ['BOS-CHS', 'ORD-JAC', 'LAS-OKC', 'VCT-IAH', 'CVG-EGE', 'PIT-PVD', 'BDL-SLC', 'TEX-PHX', 'LAX-LGA', 'LEX-LGA', 'CLE-SLC', 'KOA-SNA', 'SNA-HNL', 'MDW-SNA', 'MIA-SEA', 'MEM-RDU', 'YUM-IPL', 'SLC-KOA', 'EGE-EWR', 'MTJ-DFW', 'TPA-CHS', 'FLL-OAK', 'PVD-MCI', 'SLC-DSM', 'RSW-DEN', 'ORD-JAN', 'ATL-FSD', 'CHS-JAX', 'MCO-MLI', 'FSD-SLC', 'SLC-LGA', 'GRB-DFW', 'PNS-JAX', 'BDL-LAX', 'ATL-SOP', 'MSP-FAI', 'CLT-CAE', 'PIT-SEA', 'SRQ-IND', 'PHF-CLT', 'MIA-CMH', 'FAR-SLC', 'TUL-LAS', 'EWR-TUS', 'ORD-STT', 'CLT-TRI', 'BHM-CLE', 'ORD-PWM', 'SRQ-IAH', 'BOI-ORD', 'ATL-EGE', 'ATL-CID', 'IND-MSY', 'EGE-LAX', 'BUR-PDX', 'BTR-LGA', 'MIA-SLC', 'ONT-PDX', 'CLE-SBN', 'MSP-JAC', 'CMH-FLL', 'MEM-AUS', 'PHX-MFR', 'SJU-STL', 'ASE-SLC', 'CID-ATL', 'DFW-MLI', 'SCC-BRW', 'LGA-MSN', 'MCO-PFN', 'MDW-SJU', 'SEA-SIT', 'DTW-OMA', 'GRR-TPA', 'EGE-SFO', 'DFW-RST', 'GRR-LAS', 'TPA-TLH', 'PWM-CLT', 'TLH-MIA', 'PHF-FLL', 'SFO-EGE', 'SAT-STL', 'RSW-MKE', 'DTW-MSY', 'IAH-TXK', 'TLH-JFK', 'ATL-GUC', 'IAH-VCT', 'DEN-GRR', 'IND-SEA', 'PIE-MDW', 'BHM-IAD', 'IAD-BHM', 'BUR-MCO', 'MTJ-EWR', 'CLE-HOU', 'MSY-STL', 'DFW-SYR', 'BUF-LAS', 'LEX-EWR'] in column 0 during transform
你知道问题出在哪里吗?
谢谢
问题似乎是 OHE 在验证样本中找到了训练样本中没有的新类别。
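不幸的是,在默认设置下,sklearn 的 OneHotEncoder 无法开箱即用(out-of-the-box)地处理这种情况:默认的 handle_unknown='error' 会在 transform 时遇到未知类别就抛出 ValueError。因此必须检查新数据中的类别是否与训练集中的类别相同。关于如何对待新类别,可以有不同的策略,可以上网搜索并尝试不同的做法。例如:让 OHE 了解所有可能的类别,包括新数据中的类别(在构造函数中使用 categories 参数);或者在新数据中删除这些新类别(将新数据与拟合后自动学习到的 categories_ 属性进行比较);也可以使用 OneHotEncoder(handle_unknown='ignore'),让未知类别在编码后对应的 one-hot 列全部为 0。当然,第一个选项在生产环境中没有意义,但第二个总是可以实现的。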
我正在尝试用 sklearn 和 LightGBM 处理同时包含数值特征和分类特征的数据。我创建了一个包含两个步骤的管道:
- 第 1 步:使用 ColumnTransformer 进行数据预处理(分类变量使用 OneHotEncoder 编码)。
- 第 2 步:使用 LightGBM 进行实际的模型训练。
该管道可以正常训练我的模型,但是当我想用这个模型对测试数据集进行预测时,出现了错误消息。看起来预处理没有被应用到这个测试数据集上,但我不明白为什么。在我从网上找到的教程中,这种做法似乎是可行的,只不过它们用的是 sklearn 自带的分类器。
这是我的代码:
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# NOTE: X_build / y_build (training split) and X_valid / y_valid (validation
# split) are not defined in this snippet; they are assumed to exist already.

# Numerical features: impute missing values with the median, then standardize.
numerical_features = ['Distance']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical features: replace missing values with the constant 'missing',
# then one-hot encode.
categorical_features = ['Travel', 'Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # The encoder is left at its default settings on purpose: this is the
    # configuration that raises "Found unknown categories ... during transform"
    # when the validation data contains categories not seen during fit.
    ('onehot', OneHotEncoder())])

# Build the preprocessor with ColumnTransformer: each transformer is applied
# only to its own list of columns.
preprocess = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

# Build a pipeline: preprocessing followed by the LightGBM classifier.
clf = Pipeline(steps=[('preprocess', preprocess),
                      ('classifier', LGBMClassifier(random_state=17))])

# Fit the preprocessing steps and the classifier on the training split.
clf.fit(X_build, y_build)

# Scores
print("model training score (clf internal scoring function with standards parameters): {0}".format(clf.score(X_build, y_build)))  # returns a score
print("Score: %f" % clf.score(X_valid, y_valid))  # Here is the problem
这里是错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-116-70bf0e236540> in <module>()
----> 1 print("Score: %f" % clf.predict(X_valid))
~/anaconda3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
116
117 # lambda, but not partial, allows help() to work with update_wrapper
--> 118 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
119 # update the docstring of the returned function
120 update_wrapper(out, self.fn)
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
329 for name, transform in self.steps[:-1]:
330 if transform is not None:
--> 331 Xt = transform.transform(Xt)
332 return self.steps[-1][-1].predict(Xt, **predict_params)
333
~/anaconda3/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
491
492 X = _check_X(X)
--> 493 Xs = self._fit_transform(X, None, _transform_one, fitted=True)
494 self._validate_output(Xs)
495
~/anaconda3/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
391 _get_column(X, column), y, weight)
392 for _, trans, column, weight in self._iter(
--> 393 fitted=fitted, replace_strings=True))
394 except ValueError as e:
395 if "Expected 2D array, got 1D array instead" in str(e):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
918 self._iterating = self._original_iterator is not None
919
--> 920 while self.dispatch_one_batch(iterator):
921 pass
922
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _transform_one(transformer, X, y, weight, **fit_params)
603
604 def _transform_one(transformer, X, y, weight, **fit_params):
--> 605 res = transformer.transform(X)
606 # if we have a weight for this transformer, multiply output
607 if weight is None:
~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _transform(self, X)
449 for name, transform in self.steps:
450 if transform is not None:
--> 451 Xt = transform.transform(Xt)
452 return Xt
453
~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in transform(self, X)
611 copy=True)
612 else:
--> 613 return self._transform_new(X)
614
615 def inverse_transform(self, X):
~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform_new(self, X)
572 n_samples, n_features = X.shape
573
--> 574 X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
575
576 mask = X_mask.ravel()
~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform(self, X, handle_unknown)
105 msg = ("Found unknown categories {0} in column {1}"
106 " during transform".format(diff, i))
--> 107 raise ValueError(msg)
108 else:
109 # Set the problematic rows to an acceptable value and
ValueError: Found unknown categories ['BOS-CHS', 'ORD-JAC', 'LAS-OKC', 'VCT-IAH', 'CVG-EGE', 'PIT-PVD', 'BDL-SLC', 'TEX-PHX', 'LAX-LGA', 'LEX-LGA', 'CLE-SLC', 'KOA-SNA', 'SNA-HNL', 'MDW-SNA', 'MIA-SEA', 'MEM-RDU', 'YUM-IPL', 'SLC-KOA', 'EGE-EWR', 'MTJ-DFW', 'TPA-CHS', 'FLL-OAK', 'PVD-MCI', 'SLC-DSM', 'RSW-DEN', 'ORD-JAN', 'ATL-FSD', 'CHS-JAX', 'MCO-MLI', 'FSD-SLC', 'SLC-LGA', 'GRB-DFW', 'PNS-JAX', 'BDL-LAX', 'ATL-SOP', 'MSP-FAI', 'CLT-CAE', 'PIT-SEA', 'SRQ-IND', 'PHF-CLT', 'MIA-CMH', 'FAR-SLC', 'TUL-LAS', 'EWR-TUS', 'ORD-STT', 'CLT-TRI', 'BHM-CLE', 'ORD-PWM', 'SRQ-IAH', 'BOI-ORD', 'ATL-EGE', 'ATL-CID', 'IND-MSY', 'EGE-LAX', 'BUR-PDX', 'BTR-LGA', 'MIA-SLC', 'ONT-PDX', 'CLE-SBN', 'MSP-JAC', 'CMH-FLL', 'MEM-AUS', 'PHX-MFR', 'SJU-STL', 'ASE-SLC', 'CID-ATL', 'DFW-MLI', 'SCC-BRW', 'LGA-MSN', 'MCO-PFN', 'MDW-SJU', 'SEA-SIT', 'DTW-OMA', 'GRR-TPA', 'EGE-SFO', 'DFW-RST', 'GRR-LAS', 'TPA-TLH', 'PWM-CLT', 'TLH-MIA', 'PHF-FLL', 'SFO-EGE', 'SAT-STL', 'RSW-MKE', 'DTW-MSY', 'IAH-TXK', 'TLH-JFK', 'ATL-GUC', 'IAH-VCT', 'DEN-GRR', 'IND-SEA', 'PIE-MDW', 'BHM-IAD', 'IAD-BHM', 'BUR-MCO', 'MTJ-EWR', 'CLE-HOU', 'MSY-STL', 'DFW-SYR', 'BUF-LAS', 'LEX-EWR'] in column 0 during transform
你知道问题出在哪里吗?
谢谢
问题似乎是 OHE 在验证样本中找到了训练样本中没有的新类别。
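不幸的是,在默认设置下,sklearn 的 OneHotEncoder 无法开箱即用(out-of-the-box)地处理这种情况:默认的 handle_unknown='error' 会在 transform 时遇到未知类别就抛出 ValueError。因此必须检查新数据中的类别是否与训练集中的类别相同。关于如何对待新类别,可以有不同的策略,可以上网搜索并尝试不同的做法。例如:让 OHE 了解所有可能的类别,包括新数据中的类别(在构造函数中使用 categories 参数);或者在新数据中删除这些新类别(将新数据与拟合后自动学习到的 categories_ 属性进行比较);也可以使用 OneHotEncoder(handle_unknown='ignore'),让未知类别在编码后对应的 one-hot 列全部为 0。当然,第一个选项在生产环境中没有意义,但第二个总是可以实现的。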