'str' 对象在 sklearn 中没有属性 'apply'

'str' object has no attribute 'apply' with sklearn

我正在尝试将 sklearn Pipeline 中的 predict_proba 用于 DataFrame,其中只有一行我写了我的 Pipeline,如下所示。

def get_email_length(email) -> int:
    return len(email.split("@")[0])


def get_domain_length(email) -> int:
    parts = email.split("@")
    return len(parts[-1]) if len(parts) > 1 else 0

class EmailLengthTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.apply(lambda x: get_email_length(x)).values.reshape(-1, 1)


class DomainLengthTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.apply(lambda x: get_domain_length(x)).values.reshape(-1, 1)

class EmailTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.email_transformer = FeatureUnion(
            [
                ("email_length", EmailLengthTransformer()),
                ("domain_length", DomainLengthTransformer()),
            ]
        )

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_tr = X.squeeze() if len(X) > 1 else X
        return self.email_transformer.fit_transform(X_tr)

entities_list = ['TH', 'PH']
entities_list = list(np.array(entities_list).reshape(1, len(entities_list)))

preprocess = ColumnTransformer(
            transformers=[
                ("email_text", EmailTransformer(), ["email"]),
                ("entity_cat", OneHotEncoder(sparse=False, categories=entities_list), ["global_entity_id"]),
            ]
        )

xgb_model = XGBClassifier()

pipe = Pipeline([("preproc", preprocess), ("classifier", xgb_model)])

在此之后我训练它然后每当我尝试将这样的 DataFrame 传递给 pipe.predict_proba(test) 它失败

d={
    'email': ['sfsdfgdssdf@gmail.com'],
    'global_entity_id': ['TH']
}

test=pd.DataFrame.from_dict(d)

pipe.predict_proba(test)

我收到错误

~/.local/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
    118 
    119         # lambda, but not partial, allows help() to work with update_wrapper
--> 120         out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
    121         # update the docstring of the returned function
    122         update_wrapper(out, self.fn)

~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in predict_proba(self, X)
    472         Xt = X
    473         for _, name, transform in self._iter(with_final=False):
--> 474             Xt = transform.transform(Xt)
    475         return self.steps[-1][-1].predict_proba(Xt)
    476 

~/.local/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in transform(self, X)
    563                 "data given during fit."
    564             )
--> 565         Xs = self._fit_transform(X, None, _transform_one, fitted=True)
    566         self._validate_output(Xs)
    567 

~/.local/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
    442                     message=self._log_message(name, idx, len(transformers)))
    443                 for idx, (name, trans, column, weight) in enumerate(
--> 444                         self._iter(fitted=fitted, replace_strings=True), 1))
    445         except ValueError as e:
    446             if "Expected 2D array, got 1D array instead" in str(e):

~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
   1002             # remaining jobs.
   1003             self._iterating = False
-> 1004             if self.dispatch_one_batch(iterator):
   1005                 self._iterating = self._original_iterator is not None
   1006 

~/.local/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    833                 return False
    834             else:
--> 835                 self._dispatch(tasks)
    836                 return True
    837 

~/.local/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
    752         with self._lock:
    753             job_idx = len(self._jobs)
--> 754             job = self._backend.apply_async(batch, callback=cb)
    755             # A job can complete so quickly than its callback is
    756             # called before we get here, causing self._jobs to

~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    207     def apply_async(self, func, callback=None):
    208         """Schedule a func to be run"""
--> 209         result = ImmediateResult(func)
    210         if callback:
    211             callback(result)

~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    588         # Don't delay the application, to avoid keeping the input
    589         # arguments in memory
--> 590         self.results = batch()
    591 
    592     def get(self):

~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

~/.local/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

~/.local/lib/python3.7/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in _transform_one(transformer, X, y, weight, **fit_params)
    731 
    732 def _transform_one(transformer, X, y, weight, **fit_params):
--> 733     res = transformer.transform(X)
    734     # if we have a weight for this transformer, multiply output
    735     if weight is None:

~/data-fraud-email-susp-model/model_package/src/transformers.py in transform(self, X, y)
    150 
    151     def transform(self, X, y=None):
--> 152         return self.email_transformer.fit_transform(X.squeeze())

~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    978             sum of n_components (output dimension) over transformers.
    979         """
--> 980         results = self._parallel_func(X, y, fit_params, _fit_transform_one)
    981         if not results:
    982             # All transformers are None

~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
   1005             message=self._log_message(name, idx, len(transformers)),
   1006             **fit_params) for idx, (name, transformer,
-> 1007                                     weight) in enumerate(transformers, 1))
   1008 
   1009     def transform(self, X):

~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
   1002             # remaining jobs.
   1003             self._iterating = False
-> 1004             if self.dispatch_one_batch(iterator):
   1005                 self._iterating = self._original_iterator is not None
   1006 

~/.local/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    833                 return False
    834             else:
--> 835                 self._dispatch(tasks)
    836                 return True
    837 

~/.local/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
    752         with self._lock:
    753             job_idx = len(self._jobs)
--> 754             job = self._backend.apply_async(batch, callback=cb)
    755             # A job can complete so quickly than its callback is
    756             # called before we get here, causing self._jobs to

~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    207     def apply_async(self, func, callback=None):
    208         """Schedule a func to be run"""
--> 209         result = ImmediateResult(func)
    210         if callback:
    211             callback(result)

~/.local/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    588         # Don't delay the application, to avoid keeping the input
    589         # arguments in memory
--> 590         self.results = batch()
    591 
    592     def get(self):

~/.local/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

~/.local/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

~/.local/lib/python3.7/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

~/.local/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

~/.local/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    697         if y is None:
    698             # fit method of arity 1 (unsupervised transformation)
--> 699             return self.fit(X, **fit_params).transform(X)
    700         else:
    701             # fit method of arity 2 (supervised transformation)

~/data-fraud-email-susp-model/model_package/src/transformers.py in transform(self, X, y)
     57 
     58     def transform(self, X, y=None):
---> 59         return X.apply(lambda x: get_email_length(x)).values.reshape(-1, 1)
     60 
     61 

AttributeError: 'str' object has no attribute 'apply'

在我看来,您在错误消息中显示的代码与提供的代码片段不同。在代码段中,您有:

X_tr = X.squeeze() if len(X) > 1 else X
return self.email_transformer.fit_transform(X_tr)

但错误信息显示

return self.email_transformer.fit_transform(X.squeeze())

错误消息中的代码导致该错误,因为 DataFrame 的长度为 1,并且 squeeze 将 return 为标量。您在代码段中所做的更新,其中您仅在 len(X) > 1 应该解决问题时才调用 squeeze

就是说,我没有在该代码中看到您索引到 'email' 列的部分,因此转换器将应用于两个列,这也可能会破坏内容。

如果您通过字符串而不是字符串列表指定列切片器,您将向转换器传递一个系列而不是数据框:("email_text", EmailTransformer(), "email") 这样你就可以摆脱 transform 方法中的 squeeze() 调用。