如何使用包含 OneHotEncoder 和 LightGBM 的管道对验证集预测目标?

How to predict target on a validation set with a Pipeline containing OneHotEncoder and LightGBM?

我正在尝试在同时包含数值特征和分类特征的数据上使用 sklearn 和 LightGBM。为此我创建了一个管道(完整代码见下文)。

该管道可以正常训练我的模型,但是当我想用训练好的模型对测试(验证)数据集进行预测时,会出现错误消息。看起来预处理步骤没有应用到这个数据集上,但我不明白为什么。在我在网上找到的教程中,同样的做法似乎可以工作,不过那些教程使用的是 sklearn 自带的分类器。

这是我的代码:

from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Build a preprocessing + LightGBM pipeline, fit it on the build split and
# score it on both the build and the validation split.
# NOTE(review): X_build, y_build, X_valid and y_valid are not defined in this
# snippet; presumably they are the training and validation splits -- confirm.

# Numerical features
# Missing values are filled with the column median, then the column is
# standardized.
numerical_features = ['Distance']
numerical_transformer = Pipeline(steps=[
     ('imputer', SimpleImputer(strategy='median')),
     ('scaler', StandardScaler())])

# Categorical features
# Missing values are replaced by the constant string 'missing', then every
# column is one-hot encoded.
categorical_features = ['Travel', 'Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier']
categorical_transformer = Pipeline(steps=[
     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
     # OneHotEncoder is built with default arguments. The traceback below
     # shows its transform() raising ValueError for categories that were not
     # seen during fit.
     # NOTE(review): passing handle_unknown='ignore' should make unseen
     # categories encode as all-zero columns instead of raising -- confirm
     # against the installed scikit-learn version.
     ('onehot', OneHotEncoder())])

# Build the preprocessor with ColumnTransformer
# Each transformer is applied only to the columns listed next to it.
preprocess = ColumnTransformer(transformers=[
                                                 ('num', numerical_transformer, numerical_features),
                                                 ('cat', categorical_transformer, categorical_features)
                                               ]
                                 )

# Build a pipeline
# The preprocessing step runs first; its output feeds the LightGBM classifier.
clf = Pipeline(steps=[('preprocess', preprocess),
                      ('classifier', LGBMClassifier(random_state=17))])

# Fit
# Fits the preprocessing steps and the classifier on the build split only.
clf.fit(X_build, y_build)

# Scores
print("model training score (clf internal scoring function with standards parameters): {0}".format(clf.score(X_build, y_build))) # returns a score
# Scoring on the validation split runs the already-fitted preprocessing on
# X_valid; this is the call reported as failing.
print("Score: %f" % clf.score(X_valid, y_valid)) # Here is the problem

这里是错误:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-116-70bf0e236540> in <module>()
----> 1 print("Score: %f" % clf.predict(X_valid))

~/anaconda3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
    116 
    117         # lambda, but not partial, allows help() to work with update_wrapper
--> 118         out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
    119         # update the docstring of the returned function
    120         update_wrapper(out, self.fn)

~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
    329         for name, transform in self.steps[:-1]:
    330             if transform is not None:
--> 331                 Xt = transform.transform(Xt)
    332         return self.steps[-1][-1].predict(Xt, **predict_params)
    333 

~/anaconda3/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
    491 
    492         X = _check_X(X)
--> 493         Xs = self._fit_transform(X, None, _transform_one, fitted=True)
    494         self._validate_output(Xs)
    495 

~/anaconda3/lib/python3.6/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
    391                               _get_column(X, column), y, weight)
    392                 for _, trans, column, weight in self._iter(
--> 393                     fitted=fitted, replace_strings=True))
    394         except ValueError as e:
    395             if "Expected 2D array, got 1D array instead" in str(e):

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    918                 self._iterating = self._original_iterator is not None
    919 
--> 920             while self.dispatch_one_batch(iterator):
    921                 pass
    922 

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    757                 return False
    758             else:
--> 759                 self._dispatch(tasks)
    760                 return True
    761 

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    714         with self._lock:
    715             job_idx = len(self._jobs)
--> 716             job = self._backend.apply_async(batch, callback=cb)
    717             # A job can complete so quickly than its callback is
    718             # called before we get here, causing self._jobs to

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    180     def apply_async(self, func, callback=None):
    181         """Schedule a func to be run"""
--> 182         result = ImmediateResult(func)
    183         if callback:
    184             callback(result)

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    547         # Don't delay the application, to avoid keeping the input
    548         # arguments in memory
--> 549         self.results = batch()
    550 
    551     def get(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    223         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    224             return [func(*args, **kwargs)
--> 225                     for func, args, kwargs in self.items]
    226 
    227     def __len__(self):

~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _transform_one(transformer, X, y, weight, **fit_params)
    603 
    604 def _transform_one(transformer, X, y, weight, **fit_params):
--> 605     res = transformer.transform(X)
    606     # if we have a weight for this transformer, multiply output
    607     if weight is None:

~/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in _transform(self, X)
    449         for name, transform in self.steps:
    450             if transform is not None:
--> 451                 Xt = transform.transform(Xt)
    452         return Xt
    453 

~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in transform(self, X)
    611                                        copy=True)
    612         else:
--> 613             return self._transform_new(X)
    614 
    615     def inverse_transform(self, X):

~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform_new(self, X)
    572         n_samples, n_features = X.shape
    573 
--> 574         X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
    575 
    576         mask = X_mask.ravel()

~/anaconda3/lib/python3.6/site-packages/sklearn/preprocessing/_encoders.py in _transform(self, X, handle_unknown)
    105                     msg = ("Found unknown categories {0} in column {1}"
    106                            " during transform".format(diff, i))
--> 107                     raise ValueError(msg)
    108                 else:
    109                     # Set the problematic rows to an acceptable value and

ValueError: Found unknown categories ['BOS-CHS', 'ORD-JAC', 'LAS-OKC', 'VCT-IAH', 'CVG-EGE', 'PIT-PVD', 'BDL-SLC', 'TEX-PHX', 'LAX-LGA', 'LEX-LGA', 'CLE-SLC', 'KOA-SNA', 'SNA-HNL', 'MDW-SNA', 'MIA-SEA', 'MEM-RDU', 'YUM-IPL', 'SLC-KOA', 'EGE-EWR', 'MTJ-DFW', 'TPA-CHS', 'FLL-OAK', 'PVD-MCI', 'SLC-DSM', 'RSW-DEN', 'ORD-JAN', 'ATL-FSD', 'CHS-JAX', 'MCO-MLI', 'FSD-SLC', 'SLC-LGA', 'GRB-DFW', 'PNS-JAX', 'BDL-LAX', 'ATL-SOP', 'MSP-FAI', 'CLT-CAE', 'PIT-SEA', 'SRQ-IND', 'PHF-CLT', 'MIA-CMH', 'FAR-SLC', 'TUL-LAS', 'EWR-TUS', 'ORD-STT', 'CLT-TRI', 'BHM-CLE', 'ORD-PWM', 'SRQ-IAH', 'BOI-ORD', 'ATL-EGE', 'ATL-CID', 'IND-MSY', 'EGE-LAX', 'BUR-PDX', 'BTR-LGA', 'MIA-SLC', 'ONT-PDX', 'CLE-SBN', 'MSP-JAC', 'CMH-FLL', 'MEM-AUS', 'PHX-MFR', 'SJU-STL', 'ASE-SLC', 'CID-ATL', 'DFW-MLI', 'SCC-BRW', 'LGA-MSN', 'MCO-PFN', 'MDW-SJU', 'SEA-SIT', 'DTW-OMA', 'GRR-TPA', 'EGE-SFO', 'DFW-RST', 'GRR-LAS', 'TPA-TLH', 'PWM-CLT', 'TLH-MIA', 'PHF-FLL', 'SFO-EGE', 'SAT-STL', 'RSW-MKE', 'DTW-MSY', 'IAH-TXK', 'TLH-JFK', 'ATL-GUC', 'IAH-VCT', 'DEN-GRR', 'IND-SEA', 'PIE-MDW', 'BHM-IAD', 'IAD-BHM', 'BUR-MCO', 'MTJ-EWR', 'CLE-HOU', 'MSY-STL', 'DFW-SYR', 'BUF-LAS', 'LEX-EWR'] in column 0 during transform

你知道问题出在哪里吗?

谢谢

问题似乎是 OHE 在验证样本中找到了训练样本中没有的新类别。

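不幸的是,sklearn 的 OneHotEncoder 在默认设置(handle_unknown='error')下无法开箱即用(out-of-the-box)地处理这种情况,因此必须检查新数据中的类别是否与训练集中的类别相同。对于新类别,可以采取不同的处理策略,建议查阅资料并多做尝试。例如:让 OHE 预先了解所有可能的类别,包括新数据中的类别(在构造函数中使用 categories 参数);或者在新数据中删除这些新类别(将新数据与拟合后自动学习到的 categories_ 属性进行比较);也可以在构造 OneHotEncoder 时传入 handle_unknown='ignore',使未见过的类别被编码为全零向量而不是抛出错误。当然,第一个选项在生产环境中没有意义,但第二个选项总是可以实现的。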