正确使用 sklearn 的 FeatureTransform
FeatureTransform from sklearn properly used
我想把一封电子邮件传入我的 Pipeline,
并让它根据训练结果输出一个概率。为此,我使用了一组函数(转换器)从传入的电子邮件中提取特征:
from collections import Counter
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class EmailLengthTransformer(BaseEstimator, TransformerMixin):
    """Length of the part of each email that precedes the first "@".

    Every element of ``X`` is indexed with ``[0]`` to obtain the address, so
    ``X`` is expected to be a sequence of rows such as
    ``[["user@example.com"]]``.  When an element is itself a plain string,
    ``[0]`` yields only its first character.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return the local-part lengths as a single column (``reshape(-1, 1)``)."""
        lengths = []
        for row in X:
            local_part = row[0].split("@")[0]
            lengths.append(len(local_part))
        return np.array(lengths).reshape(-1, 1)
class DomainLengthTransformer(BaseEstimator, TransformerMixin):
    """Length of the part of each email that follows the last "@".

    Every element of ``X`` is indexed with ``[0]`` to obtain the address, so
    ``X`` is expected to be a sequence of rows such as
    ``[["user@example.com"]]``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return the domain lengths as a single column (``reshape(-1, 1)``)."""

        def domain_length(row):
            # The last "@"-separated piece; the whole string if there is no "@".
            pieces = row[0].split("@")
            return len(pieces[-1])

        lengths = [domain_length(row) for row in X]
        return np.array(lengths).reshape(-1, 1)
class NumberOfVoulsTransfomer(BaseEstimator, TransformerMixin):
    """Count of ASCII vowels (either case) before the first "@" of each email.

    Every element of ``X`` is indexed with ``[0]`` to obtain the address, so
    ``X`` is expected to be a sequence of rows such as
    ``[["user@example.com"]]``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return the vowel counts as a single column (``reshape(-1, 1)``)."""
        vowels = "aeiouAEIOU"
        counts = []
        for row in X:
            local_part = row[0].split("@")[0]
            matches = [ch for ch in local_part if ch in vowels]
            counts.append(len(matches))
        return np.array(counts).reshape(-1, 1)
class NumberOfCapitalsTransfomer(BaseEstimator, TransformerMixin):
    """Count of upper-case characters in each email (whole address).

    Every element of ``X`` is indexed with ``[0]`` to obtain the address, so
    ``X`` is expected to be a sequence of rows such as
    ``[["user@example.com"]]``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return the upper-case counts as a single column (``reshape(-1, 1)``)."""
        counts = []
        for row in X:
            uppercase = [ch for ch in row[0] if ch.isupper()]
            counts.append(len(uppercase))
        return np.array(counts).reshape(-1, 1)
class NumberOfDigitsTransfomer(BaseEstimator, TransformerMixin):
    """Count of the characters ``0``-``9`` in each email (whole address).

    Every element of ``X`` is indexed with ``[0]`` to obtain the address, so
    ``X`` is expected to be a sequence of rows such as
    ``[["user@example.com"]]``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return the digit counts as a single column (``reshape(-1, 1)``)."""
        # Membership in this literal string is the test; only ASCII digits match.
        ascii_digits = "0123456789"
        counts = []
        for row in X:
            found = [ch for ch in row[0] if ch in ascii_digits]
            counts.append(len(found))
        return np.array(counts).reshape(-1, 1)
之后,我把这些转换器封装到另一个 class 中,并像下面这样传给 Pipeline:
class EmailsSuspicionModel:
    """Bundle the email feature extractors with an XGBoost classifier.

    Stores a train/validation split and optional classifier parameters, and
    builds one ``FeatureUnion`` of the email transformers that is shared by
    :meth:`transform` and :meth:`pipeline`.
    """

    def __init__(self, X_train, X_valid, y_train, y_valid, model_params):
        """Store the data split and build the feature-extraction union.

        Args:
            X_train: Training inputs handed to the transformers.
            X_valid: Validation inputs handed to the transformers.
            y_train: Training labels.
            y_valid: Validation labels.
            model_params: Keyword arguments for ``XGBClassifier``.  When falsy
                (``None`` or empty), :meth:`pipeline` uses its built-in
                defaults instead.
        """
        self.X_train = X_train
        self.X_valid = X_valid
        self.y_train = y_train
        self.y_valid = y_valid
        self.model_params = model_params
        self.preprocesser = FeatureUnion(
            [
                ("email_length", EmailLengthTransformer()),
                ("domain_length", DomainLengthTransformer()),
                ("number_of_vouls", NumberOfVoulsTransfomer()),
                ("number_of_capitals", NumberOfCapitalsTransfomer()),
                ("number_of_digits", NumberOfDigitsTransfomer()),
                ("highest_char_frequency", HighestCharFrequencyTransfomer()),
                ("number_of_different_chars", NumberOfDifferentChars()),
                (
                    "number_of_consecutive_or_identical_chars",
                    NumberOfConsecutiveOrIdenticalCharsTransfomer(),
                ),
            ]
        )

    def transform(self):
        """Return the validation inputs after feature extraction.

        The union is fitted on ``X_train`` and then applied to ``X_valid``.
        """
        logging.info("Transform validation data - Required for evaluation")
        valid_preprocesser = self.preprocesser.fit(self.X_train)
        return valid_preprocesser.transform(self.X_valid)

    def pipeline(self):
        """Build a new, unfitted ``Pipeline`` of the union plus an XGBoost classifier.

        ``model_params``, when truthy, replaces the default classifier
        arguments entirely (the defaults are not merged in).
        """
        logging.info("Build sklearn pipeline with XGBoost model")
        if self.model_params:
            logging.info(f"XGBoost model params: {self.model_params}")
            xgb_model = XGBClassifier(**self.model_params)
        else:
            xgb_model = XGBClassifier(eval_metric="logloss", use_label_encoder=False)
        return Pipeline([("preproc", self.preprocesser), ("classifier", xgb_model)])

    def fit(self):
        """Fit a fresh pipeline on the training data and return it.

        The validation inputs are transformed up front and passed to the
        classifier step as ``eval_set``.

        Returns:
            The fitted ``Pipeline``.  Each call to :meth:`pipeline` creates a
            new object, so without this return value the trained model would
            be unreachable.
        """
        return self.pipeline().fit(
            self.X_train,
            self.y_train,
            classifier__eval_set=[(self.transform(), self.y_valid)],
        )
然后,我像下面这样使用这些 class:
# Validation features are computed ahead of time and handed to the classifier
# step through ``classifier__eval_set``.
validation_model = EmailsSuspicionModel(
    X_train.values, X_valid.values, y_train, y_valid, model_params=None
)
X_valid_transformed = validation_model.transform()

# NOTE(review): this instance (and the fit below) receives X_train / X_valid
# as-is, while the one above receives ``.values`` -- confirm both forms give
# the transformers the same row layout.
training_model = EmailsSuspicionModel(
    X_train, X_valid, y_train, y_valid, model_params=None
)
pipeline = training_model.pipeline()

pipeline.fit(
    X_train,
    y_train,
    classifier__eval_set=[(X_valid_transformed, y_valid)],
)
我的模型没有给出我预期的结果(我用一个不使用 Pipeline 的 notebook 仔细核对过)。我认为原因是模型没有用正确的特征集对 X_train 进行训练,因为每次我执行下面的调用:
# A flat list of strings is passed here, so inside each transformer the row is
# the string itself and ``e[0]`` / ``email[0]`` is only its first character.
pipeline['preproc'].transform(['lucasdresl@gmail.com'])
# Output reported for the call above:
([1, 1, 0, 0, 0])
很明显转换没有被正确应用,因为按照上面提供的函数,结果应该是 ([10, 9, 3, 0, 0])。
我认为模型也是在同样错误的特征上训练的。
这是一个形状问题。如果您改为对 [['lucasdresl@gmail.com']]
进行转换,就会得到预期的值。您编写的转换器期望二维数组输入(每个转换器中的 e[0]
或 email[0]
取的是每一行的第一个元素;如果传入一维列表,它们取到的就只是电子邮件的第一个字符)。
我想把一封电子邮件传入我的 Pipeline,
并让它根据训练结果输出一个概率。为此,我使用了一组函数(转换器)从传入的电子邮件中提取特征:
from collections import Counter
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class EmailLengthTransformer(BaseEstimator, TransformerMixin):
    """Length of the part of each email that precedes the first "@".

    Every element of ``X`` is indexed with ``[0]`` to obtain the address, so
    ``X`` is expected to be a sequence of rows such as
    ``[["user@example.com"]]``.  When an element is itself a plain string,
    ``[0]`` yields only its first character.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return the local-part lengths as a single column (``reshape(-1, 1)``)."""
        lengths = []
        for row in X:
            local_part = row[0].split("@")[0]
            lengths.append(len(local_part))
        return np.array(lengths).reshape(-1, 1)
class DomainLengthTransformer(BaseEstimator, TransformerMixin):
    """Length of the part of each email that follows the last "@".

    Every element of ``X`` is indexed with ``[0]`` to obtain the address, so
    ``X`` is expected to be a sequence of rows such as
    ``[["user@example.com"]]``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return the domain lengths as a single column (``reshape(-1, 1)``)."""

        def domain_length(row):
            # The last "@"-separated piece; the whole string if there is no "@".
            pieces = row[0].split("@")
            return len(pieces[-1])

        lengths = [domain_length(row) for row in X]
        return np.array(lengths).reshape(-1, 1)
class NumberOfVoulsTransfomer(BaseEstimator, TransformerMixin):
    """Count of ASCII vowels (either case) before the first "@" of each email.

    Every element of ``X`` is indexed with ``[0]`` to obtain the address, so
    ``X`` is expected to be a sequence of rows such as
    ``[["user@example.com"]]``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return the vowel counts as a single column (``reshape(-1, 1)``)."""
        vowels = "aeiouAEIOU"
        counts = []
        for row in X:
            local_part = row[0].split("@")[0]
            matches = [ch for ch in local_part if ch in vowels]
            counts.append(len(matches))
        return np.array(counts).reshape(-1, 1)
class NumberOfCapitalsTransfomer(BaseEstimator, TransformerMixin):
    """Count of upper-case characters in each email (whole address).

    Every element of ``X`` is indexed with ``[0]`` to obtain the address, so
    ``X`` is expected to be a sequence of rows such as
    ``[["user@example.com"]]``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return the upper-case counts as a single column (``reshape(-1, 1)``)."""
        counts = []
        for row in X:
            uppercase = [ch for ch in row[0] if ch.isupper()]
            counts.append(len(uppercase))
        return np.array(counts).reshape(-1, 1)
class NumberOfDigitsTransfomer(BaseEstimator, TransformerMixin):
    """Count of the characters ``0``-``9`` in each email (whole address).

    Every element of ``X`` is indexed with ``[0]`` to obtain the address, so
    ``X`` is expected to be a sequence of rows such as
    ``[["user@example.com"]]``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (the transformer is stateless)."""
        return self

    def transform(self, X, y=None):
        """Return the digit counts as a single column (``reshape(-1, 1)``)."""
        # Membership in this literal string is the test; only ASCII digits match.
        ascii_digits = "0123456789"
        counts = []
        for row in X:
            found = [ch for ch in row[0] if ch in ascii_digits]
            counts.append(len(found))
        return np.array(counts).reshape(-1, 1)
之后,我把这些转换器封装到另一个 class 中,并像下面这样传给 Pipeline:
class EmailsSuspicionModel:
    """Bundle the email feature extractors with an XGBoost classifier.

    Stores a train/validation split and optional classifier parameters, and
    builds one ``FeatureUnion`` of the email transformers that is shared by
    :meth:`transform` and :meth:`pipeline`.
    """

    def __init__(self, X_train, X_valid, y_train, y_valid, model_params):
        """Store the data split and build the feature-extraction union.

        Args:
            X_train: Training inputs handed to the transformers.
            X_valid: Validation inputs handed to the transformers.
            y_train: Training labels.
            y_valid: Validation labels.
            model_params: Keyword arguments for ``XGBClassifier``.  When falsy
                (``None`` or empty), :meth:`pipeline` uses its built-in
                defaults instead.
        """
        self.X_train = X_train
        self.X_valid = X_valid
        self.y_train = y_train
        self.y_valid = y_valid
        self.model_params = model_params
        self.preprocesser = FeatureUnion(
            [
                ("email_length", EmailLengthTransformer()),
                ("domain_length", DomainLengthTransformer()),
                ("number_of_vouls", NumberOfVoulsTransfomer()),
                ("number_of_capitals", NumberOfCapitalsTransfomer()),
                ("number_of_digits", NumberOfDigitsTransfomer()),
                ("highest_char_frequency", HighestCharFrequencyTransfomer()),
                ("number_of_different_chars", NumberOfDifferentChars()),
                (
                    "number_of_consecutive_or_identical_chars",
                    NumberOfConsecutiveOrIdenticalCharsTransfomer(),
                ),
            ]
        )

    def transform(self):
        """Return the validation inputs after feature extraction.

        The union is fitted on ``X_train`` and then applied to ``X_valid``.
        """
        logging.info("Transform validation data - Required for evaluation")
        valid_preprocesser = self.preprocesser.fit(self.X_train)
        return valid_preprocesser.transform(self.X_valid)

    def pipeline(self):
        """Build a new, unfitted ``Pipeline`` of the union plus an XGBoost classifier.

        ``model_params``, when truthy, replaces the default classifier
        arguments entirely (the defaults are not merged in).
        """
        logging.info("Build sklearn pipeline with XGBoost model")
        if self.model_params:
            logging.info(f"XGBoost model params: {self.model_params}")
            xgb_model = XGBClassifier(**self.model_params)
        else:
            xgb_model = XGBClassifier(eval_metric="logloss", use_label_encoder=False)
        return Pipeline([("preproc", self.preprocesser), ("classifier", xgb_model)])

    def fit(self):
        """Fit a fresh pipeline on the training data and return it.

        The validation inputs are transformed up front and passed to the
        classifier step as ``eval_set``.

        Returns:
            The fitted ``Pipeline``.  Each call to :meth:`pipeline` creates a
            new object, so without this return value the trained model would
            be unreachable.
        """
        return self.pipeline().fit(
            self.X_train,
            self.y_train,
            classifier__eval_set=[(self.transform(), self.y_valid)],
        )
然后,我像下面这样使用这些 class:
# Validation features are computed ahead of time and handed to the classifier
# step through ``classifier__eval_set``.
validation_model = EmailsSuspicionModel(
    X_train.values, X_valid.values, y_train, y_valid, model_params=None
)
X_valid_transformed = validation_model.transform()

# NOTE(review): this instance (and the fit below) receives X_train / X_valid
# as-is, while the one above receives ``.values`` -- confirm both forms give
# the transformers the same row layout.
training_model = EmailsSuspicionModel(
    X_train, X_valid, y_train, y_valid, model_params=None
)
pipeline = training_model.pipeline()

pipeline.fit(
    X_train,
    y_train,
    classifier__eval_set=[(X_valid_transformed, y_valid)],
)
我的模型没有给出我预期的结果(我用一个不使用 Pipeline 的 notebook 仔细核对过)。我认为原因是模型没有用正确的特征集对 X_train 进行训练,因为每次我执行下面的调用:
# A flat list of strings is passed here, so inside each transformer the row is
# the string itself and ``e[0]`` / ``email[0]`` is only its first character.
pipeline['preproc'].transform(['lucasdresl@gmail.com'])
# Output reported for the call above:
([1, 1, 0, 0, 0])
很明显转换没有被正确应用,因为按照上面提供的函数,结果应该是 ([10, 9, 3, 0, 0])。
我认为模型也是在同样错误的特征上训练的。
这是一个形状问题。如果您改为对 [['lucasdresl@gmail.com']]
进行转换,就会得到预期的值。您编写的转换器期望二维数组输入(每个转换器中的 e[0]
或 email[0]
取的是每一行的第一个元素;如果传入一维列表,它们取到的就只是电子邮件的第一个字符)。