如何在机器学习管道中添加自定义中间预处理器来处理 scikit-learn 中的 n-gram 列？

Question

ML 预处理步骤中 n-gram 变量（例如 SUBSTRING_4L_V3）的处理一直给我带来一些问题。

我能够分别转换和标准化数值、分类和 n-gram 变量，如下所示：

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer

# Toy dataset of 8 rows; AGE, URBAN and NAME contain missing values (np.nan).
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    # 4-letter substrings of NAME, one list element per substring.
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    # Same substrings, but joined into a single comma-separated string inside the list.
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    # Same substrings as a plain comma-separated string ('' when there are none).
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    # 5-letter substrings of NAME.
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    # Label column; used as the target in the train/test splits below.
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
    }

df = pd.DataFrame(data)

def transform_numerical():
    """Standardize the AGE column and print raw vs. scaled values.

    The scaler is fitted on the training half only and then applied to
    both halves of a 50/50 split of the module-level ``df``.
    """
    raw_train, raw_test, _, _ = train_test_split(
        df[['AGE']], df['DISEASE'], test_size=0.5, random_state=3)

    scaler = preprocessing.StandardScaler()
    scaler.fit(raw_train)
    scaled_train = scaler.transform(raw_train)
    scaled_test = scaler.transform(raw_test)

    # Each pair is printed as: original frame, then its scaled array.
    print(raw_train, scaled_train, sep='\n')
    print()
    print(raw_test, scaled_test, sep='\n')
    print('/////////////////////////', '\n')

transform_numerical()

def transform_categorical():
    """Impute and one-hot encode URBAN and NAME, then print the results.

    Both the imputer and the encoder are fitted on the training half of a
    50/50 split of the module-level ``df`` and applied to both halves.
    """
    raw_train, raw_test, _, _ = train_test_split(
        df[['URBAN', 'NAME']], df['DISEASE'], test_size=0.5, random_state=3)

    # Missing categories become the empty string before encoding.
    imputer = SimpleImputer(strategy='constant', fill_value='').fit(raw_train)
    filled_train = imputer.transform(raw_train)
    filled_test = imputer.transform(raw_test)

    encoder = preprocessing.OneHotEncoder(handle_unknown='ignore').fit(filled_train)
    encoded_train = encoder.transform(filled_train)
    encoded_test = encoder.transform(filled_test)

    # Each pair is printed as: dense encoded array, then the original frame.
    print(encoded_train.toarray(), raw_train, sep='\n')
    print()
    print(encoded_test.toarray(), raw_test, sep='\n')
    print('/////////////////////////', '\n')

transform_categorical()

def transform_list():
    """Impute, flatten and count-vectorize SUBSTRING_4L_V3, then print the results.

    The imputer and the vectorizer are fitted on the training half of a
    50/50 split of the module-level ``df``. The test half is transformed
    with the already-fitted objects so both halves share one vocabulary,
    mirroring the train/test handling in the sibling demo functions.
    """
    x_train, x_test, _, _ = train_test_split(
        df[['SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    # The imputer output is 2-D; CountVectorizer iterates over documents,
    # so flatten to a 1-D array of strings first.
    x_trainT = cat_imputer.transform(x_train).ravel()
    x_testT = cat_imputer.transform(x_test).ravel()

    count_vect = CountVectorizer(
        analyzer="word", tokenizer=None, preprocessor=None,
        stop_words=None, max_features=5000)
    x_trainT = count_vect.fit_transform(x_trainT)
    # Previously the test split was prepared but never vectorized or shown.
    x_testT = count_vect.transform(x_testT)

    print(x_trainT.toarray())
    print(x_train)
    print()
    print(x_testT.toarray())
    print(x_test)
    print('/////////////////////////', '\n')

transform_list()

对于 SUBSTRING_4L_V3，我需要在应用 CountVectorizer() 之前通过 ravel() 将其展平。

但是，我不清楚如何在下面的 ML 管道中按顺序实现这些步骤：

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression

class RavelTransformer(BaseEstimator, TransformerMixin):
    # Used below as the 'ravel' step between the imputer and CountVectorizer
    # in the n-gram sub-pipeline.
    # NOTE(review): this version defines no transform() method, and fit()
    # calls self.ravel(), which this class does not define. This is the code
    # that triggers the TypeError quoted further down.
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self.ravel()

# Toy dataset of 8 rows; AGE, URBAN and NAME contain missing values (np.nan).
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': ['urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban'],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    # 4-letter substrings of NAME, one list element per substring.
    'SUBSTRING_4L': [['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'], ['just', 'usto']],
    # Same substrings, but joined into a single comma-separated string inside the list.
    'SUBSTRING_4L_V2': [['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'], ['just, usto']],
    # Same substrings as a plain comma-separated string ('' when there are none).
    'SUBSTRING_4L_V3': ['jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto'],
    # 5-letter substrings of NAME.
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    # Label column; used as the target in the train/test split below.
    'DISEASE': ['healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart', 'healthy', 'cancer'],
    }

df = pd.DataFrame(data)

# Split the four feature columns 50/50; DISEASE is the prediction target.
x_train, x_test, y_train, y_test = train_test_split(
    df[['AGE', 'NAME', 'URBAN', 'SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

# Numerical sub-pipeline: median imputation followed by standardization.
transformer_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical sub-pipeline: fill missing values with '' and one-hot encode.
transformer_cat = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

# N-gram sub-pipeline: fill missing values with '', flatten via the custom
# RavelTransformer step, then count word tokens.
transformer_ngram = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('ravel', RavelTransformer()),
    ('countvectorizer', CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, 
        max_features=5000))])

# Route each group of columns to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_num, ['AGE']),
        ('cat', transformer_cat, ['NAME', 'URBAN']),
        ('ngram', transformer_ngram, ['SUBSTRING_4L_V3']),
        ])

# Chain preprocessing with a logistic-regression classifier and fit on the training split.
ml_algo = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=4000)
model = make_pipeline(preprocessor, ml_algo)
model.fit(x_train, y_train)
#print('Model score: %.3f' % model.score(x_test, y_test))

错误：

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RavelTransformer()' (type <class '__main__.RavelTransformer'>) doesn't

Answer 1

错误消息告诉您的是：RavelTransformer 类中没有 transform 方法。

我的假设是您想做这样的事情：

class RavelTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that flattens its input to a 1-D array.

    Intended to sit between a step that outputs a single-column 2-D array
    and a step that iterates over a 1-D sequence (e.g. CountVectorizer).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing; return self so the step can be chained in a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return X flattened to one dimension.

        np.asarray is applied first so that inputs without a ravel() method
        (for example a DataFrame or a nested list) are accepted as well;
        for a plain ndarray the result is the same as X.ravel().
        """
        return np.asarray(X).ravel()

在这里，您的 RavelTransformer 在 fit 步骤中不执行任何操作，而是在 transform 步骤中按预期将数据展平（ravel）。

如何在机器学习管道中添加自定义中间预处理器来处理 scikit-learn 中的 n-gram 列？

How do I add a custom intermediate preprocessor in machine learning pipeline that handles n-gram columns in scikit-learn?

python-3.x

machine-learning

scikit-learn

countvectorizer