AttributeError and TypeError using CustomTransformers

I am building a model using custom transformers (). When I run the following code, the .fit step raises this error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-165-289e1d466eb9> in <module>
     10 
     11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
     13 
     14 # metrics

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    339         """
    340         fit_params_steps = self._check_fit_params(**fit_params)
--> 341         Xt = self._fit(X, y, **fit_params_steps)
    342         with _print_elapsed_time('Pipeline',
    343                                  self._log_message(len(self.steps) - 1)):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    301                 cloned_transformer = clone(transformer)
    302             # Fit or load from cache the current transformer
--> 303             X, fitted_transformer = fit_transform_one_cached(
    304                 cloned_transformer, X, y, None,
    305                 message_clsname='Pipeline',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    978             sum of n_components (output dimension) over transformers.
    979         """
--> 980         results = self._parallel_func(X, y, fit_params, _fit_transform_one)
    981         if not results:
    982             # All transformers are None

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
   1000         transformers = list(self._iter())
   1001 
-> 1002         return Parallel(n_jobs=self.n_jobs)(delayed(func)(
   1003             transformer, X, y, weight,
   1004             message_clsname='FeatureUnion',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
   1042                 self._iterating = self._original_iterator is not None
   1043 
-> 1044             while self.dispatch_one_batch(iterator):
   1045                 pass
   1046 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    857                 return False
    858             else:
--> 859                 self._dispatch(tasks)
    860                 return True
    861 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
    775         with self._lock:
    776             job_idx = len(self._jobs)
--> 777             job = self._backend.apply_async(batch, callback=cb)
    778             # A job can complete so quickly than its callback is
    779             # called before we get here, causing self._jobs to

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
   1200         max_features = self.max_features
   1201 
-> 1202         vocabulary, X = self._count_vocab(raw_documents,
   1203                                           self.fixed_vocabulary_)
   1204 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1112         for doc in raw_documents:
   1113             feature_counter = {}
-> 1114             for feature in analyze(doc):
   1115                 try:
   1116                     feature_idx = vocabulary[feature]

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    102     else:
    103         if preprocessor is not None:
--> 104             doc = preprocessor(doc)
    105         if tokenizer is not None:
    106             doc = tokenizer(doc)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _preprocess(doc, accent_function, lower)
     67     """
     68     if lower:
---> 69         doc = doc.lower()
     70     if accent_function is not None:
     71         doc = accent_function(doc)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

The code is

# MODEL
from sklearn import tree
from sklearn.pipeline import Pipeline

# Decision Tree
decision_tree = tree.DecisionTreeClassifier()
# define full pipeline --> preprocessing + model
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('model', decision_tree)])

# fit on the complete pipeline
training = full_pipeline.fit(X, y) # <- this step returns the error

# metrics
score_test = \
    round(training.score(X, y) * 100, 2)
print(f"\nTraining Accuracy: {score_test}")

I also tried .fit_transform but I got the same error. I read this: , but it seems I am not passing X or y into the decision tree the way that example does, though maybe I'm wrong. When I add

# Defining the steps in the text pipeline
text_pipeline = Pipeline(steps=[
    ('text_transformer', TextTransformer()),
    ('cv', CountVectorizer(analyzer='word', ngram_range=(2, 2), lowercase=False))])

I get this new error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-159-289e1d466eb9> in <module>
     10 
     11 # fit on the complete pipeline
---> 12 training = full_pipeline.fit(X, y)
     13 
     14 # metrics

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    339         """
    340         fit_params_steps = self._check_fit_params(**fit_params)
--> 341         Xt = self._fit(X, y, **fit_params_steps)
    342         with _print_elapsed_time('Pipeline',
    343                                  self._log_message(len(self.steps) - 1)):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    301                 cloned_transformer = clone(transformer)
    302             # Fit or load from cache the current transformer
--> 303             X, fitted_transformer = fit_transform_one_cached(
    304                 cloned_transformer, X, y, None,
    305                 message_clsname='Pipeline',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    978             sum of n_components (output dimension) over transformers.
    979         """
--> 980         results = self._parallel_func(X, y, fit_params, _fit_transform_one)
    981         if not results:
    982             # All transformers are None

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
   1000         transformers = list(self._iter())
   1001 
-> 1002         return Parallel(n_jobs=self.n_jobs)(delayed(func)(
   1003             transformer, X, y, weight,
   1004             message_clsname='FeatureUnion',

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
   1042                 self._iterating = self._original_iterator is not None
   1043 
-> 1044             while self.dispatch_one_batch(iterator):
   1045                 pass
   1046 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    857                 return False
    858             else:
--> 859                 self._dispatch(tasks)
    860                 return True
    861 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
    775         with self._lock:
    776             job_idx = len(self._jobs)
--> 777             job = self._backend.apply_async(batch, callback=cb)
    778             # A job can complete so quickly than its callback is
    779             # called before we get here, causing self._jobs to

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~/opt/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    385             fit_params_last_step = fit_params_steps[self.steps[-1][0]]
    386             if hasattr(last_step, 'fit_transform'):
--> 387                 return last_step.fit_transform(Xt, y, **fit_params_last_step)
    388             else:
    389                 return last_step.fit(Xt, y,

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
   1200         max_features = self.max_features
   1201 
-> 1202         vocabulary, X = self._count_vocab(raw_documents,
   1203                                           self.fixed_vocabulary_)
   1204 

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1112         for doc in raw_documents:
   1113             feature_counter = {}
-> 1114             for feature in analyze(doc):
   1115                 try:
   1116                     feature_idx = vocabulary[feature]

~/opt/anaconda3/lib/python3.8/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    104             doc = preprocessor(doc)
    105         if tokenizer is not None:
--> 106             doc = tokenizer(doc)
    107         if ngrams is not None:
    108             if stop_words is not None:

TypeError: cannot use a string pattern on a bytes-like object

If I remove text_pipeline, the error does not occur, so it seems something is wrong with how CountVectorizer is being used. A sample of the text is

an example
example number 1
this is another small example

I also have other columns that are numeric and categorical.

Have you run into similar problems? If so, how did you deal with them?

A common mistake with sklearn's text transformers involves the shape of the data: unlike most other sklearn preprocessors, text transformers expect 1-dimensional input (an iterable of strings), and because Python's duck typing makes both arrays and strings iterable, you get strange errors instead of a clear shape check.
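
For example, the shape problem can be reproduced in isolation with a hypothetical two-document sample (not your data):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

docs_1d = ["an example", "example number 1"]   # 1-D iterable of strings
docs_2d = np.array(docs_1d).reshape(-1, 1)     # 2-D: a single column of strings

CountVectorizer().fit_transform(docs_1d)  # works: each element is a string
CountVectorizer().fit_transform(docs_2d)  # AttributeError: 'numpy.ndarray' object has
                                          # no attribute 'lower', because iterating a
                                          # 2-D array yields rows, not strings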

Your TextTransformer.transform returns X[['Tweet']], which is 2-dimensional and causes problems for the CountVectorizer that follows. (Converting to a numpy array with .values does not change the dimensionality, and there is no compelling reason for that conversion anyway.) Returning X['Tweet'] instead should fix the problem.
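
For reference, here is a minimal sketch of the fix, assuming your TextTransformer does nothing more than select the 'Tweet' column (the class itself is not shown in the question):

from sklearn.base import BaseEstimator, TransformerMixin

class TextTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Return the 1-D Series X['Tweet'] rather than the 2-D frame X[['Tweet']],
        # so the downstream CountVectorizer receives an iterable of strings.
        return X['Tweet']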