正确使用 sklearn 的 FeatureTransform

FeatureTransform from sklearn properly used

我正在尝试向我的 Pipeline 发送一封电子邮件,并根据训练抛出一些概率。为此,我使用了一堆函数来从电子邮件传递中获取

from collections import Counter

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class EmailLengthTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return np.array([len(e[0].split("@")[0]) for e in X]).reshape(-1, 1)


class DomainLengthTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return np.array([len(e[0].split("@")[-1]) for e in X]).reshape(-1, 1)


class NumberOfVoulsTransfomer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        vouls = "aeiouAEIOU"
        name = [e[0].split("@")[0] for e in X]
        return np.array(
            [sum(1 for char in name if char in vouls) for name in name]
        ).reshape(-1, 1)


class NumberOfCapitalsTransfomer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return np.array(
            [sum(1 for char in email[0] if char.isupper()) for email in X]
        ).reshape(-1, 1)


class NumberOfDigitsTransfomer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        digits = "0123456789"
        return np.array(
            [sum(1 for char in email[0] if char in digits) for email in X]
        ).reshape(-1, 1)

所以在这之后我将它们打包到另一个 class 中并像这样发送到 Pipeline

class EmailsSuspicionModel:
    def __init__(self, X_train, X_valid, y_train, y_valid, model_params):
        self.X_train = X_train
        self.X_valid = X_valid
        self.y_train = y_train
        self.y_valid = y_valid
        self.model_params = model_params
        self.preprocesser = FeatureUnion(
            [
                ("email_length", EmailLengthTransformer()),
                ("domain_length", DomainLengthTransformer()),
                ("number_of_vouls", NumberOfVoulsTransfomer()),
                ("number_of_capitals", NumberOfCapitalsTransfomer()),
                ("number_of_digits", NumberOfDigitsTransfomer()),
                ("highest_char_frequency", HighestCharFrequencyTransfomer()),
                ("number_of_different_chars", NumberOfDifferentChars()),
                (
                    "number_of_consecutive_or_identical_chars",
                    NumberOfConsecutiveOrIdenticalCharsTransfomer()
                ),
            ]
        )

    def transform(self):
        logging.info("Transform validation data - Required for evaluation")
        valid_preprocesser = self.preprocesser.fit(self.X_train)
        return valid_preprocesser.transform(self.X_valid)

    def pipeline(self):
        logging.info("Build sklearn pipeline with XGBoost model")
        xgb_model = XGBClassifier(eval_metric="logloss", use_label_encoder=False)
        if self.model_params:
            logging.info(f"XGBoost model params: {self.model_params}")
            xgb_model = XGBClassifier(**self.model_params)

        return Pipeline([("preproc", self.preprocesser), ("classifier", xgb_model)])

    def fit(self):
        self.pipeline().fit(
            self.X_train, self.y_train, classifier__eval_set=[(self.transform(), self.y_valid)]
        )

所以每当我开始使用 classes 时

X_valid_transformed = EmailsSuspicionModel(X_train.values, X_valid.values, y_train, y_valid, model_params=None).transform()
pipeline = EmailsSuspicionModel(X_train, X_valid, y_train, y_valid, model_params=None).pipeline()
pipeline.fit(
        X_train, y_train, classifier__eval_set=[(X_valid_transformed, y_valid)]
    )

我的模型没有产生我预期的结果(我用我不使用管道的笔记本仔细检查它)我认为是因为 X_train 没有接受适当的特征集训练,因为我每次这样做

pipeline['preproc'].transform(['lucasdresl@gmail.com'])
([1, 1, 0, 0, 0]) 

很明显,转换没有被正确应用,因为结果是 ([10, 9, 3, 0, 0]) 来自所提供的函数,我认为模型正在接受同样的错误训练

这是一个形状问题。如果您改为 [['lucasdres1@gmail.com']] 进行转换,您将恢复预期值。您已将转换器编写为期望二维数组输入(每个输入中的 e[0]email[0] 否则会选择电子邮件的第一个字符)。