如何在机器学习管道中添加自定义中间预处理器来处理 scikit-learn 中的 n-gram 列?
How do I add a custom intermediate preprocessor to a machine learning pipeline that handles n-gram columns in scikit-learn?
ML 预处理步骤中对 n-gram 变量(例如 `SUBSTRING_4L_V3`)的处理一直给我带来一些问题。
我能够分别转换和标准化数值、分类和 n-gram 变量:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
# Toy table used by the examples below. AGE, URBAN and NAME contain missing
# values (np.nan). SUBSTRING_4L, SUBSTRING_4L_V2 and SUBSTRING_4L_V3 carry the
# same 4-letter substrings in three encodings: token lists, lists holding one
# comma-joined string, and plain comma-joined strings.
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': [
        'urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban',
    ],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [
        ['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'],
        ['just', 'usto'],
    ],
    'SUBSTRING_4L_V2': [
        ['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'],
        ['just, usto'],
    ],
    'SUBSTRING_4L_V3': [
        'jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto',
    ],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': [
        'healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart',
        'healthy', 'cancer',
    ],
}

# One row per person; DISEASE is the column later used as the target.
df = pd.DataFrame(data)
def transform_numerical():
    """Standardize the numeric AGE column and print it before and after.

    Splits the module-level ``df`` in half (``random_state=3``), fits a
    ``StandardScaler`` on the training half, applies it to both halves and
    prints each half in raw and scaled form. Returns nothing.
    """
    # The target halves are not needed for this demonstration.
    x_train, x_test, _y_train, _y_test = train_test_split(
        df[['AGE']], df['DISEASE'], test_size=0.5, random_state=3)

    # The scaler is fitted on the training half only and reused for the test half.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_trainT = scaler.transform(x_train)
    x_testT = scaler.transform(x_test)

    print(x_train)
    print(x_trainT)
    print()
    print(x_test)
    print(x_testT)
    print('/////////////////////////', '\n')


transform_numerical()
def transform_categorical():
    """Impute and one-hot encode the URBAN and NAME columns, printing results.

    Splits the module-level ``df`` in half (``random_state=3``), replaces
    missing values with the empty string, one-hot encodes the result and
    prints the encoded matrix next to the raw frame for each half.
    Returns nothing.
    """
    # The target halves are not needed for this demonstration.
    x_train, x_test, _y_train, _y_test = train_test_split(
        df[['URBAN', 'NAME']], df['DISEASE'], test_size=0.5, random_state=3)

    # Missing categories become '' so the encoder sees only strings.
    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    x_trainT = cat_imputer.transform(x_train)
    x_testT = cat_imputer.transform(x_test)

    # handle_unknown='ignore' lets the test half contain categories that
    # were not present in the training half.
    encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
    encoder.fit(x_trainT)
    x_trainT = encoder.transform(x_trainT)
    x_testT = encoder.transform(x_testT)

    print(x_trainT.toarray())
    print(x_train)
    print()
    print(x_testT.toarray())
    print(x_test)
    print('/////////////////////////', '\n')


transform_categorical()
def transform_list():
    """Vectorize the SUBSTRING_4L_V3 column with a bag-of-words count.

    Splits the module-level ``df`` in half (``random_state=3``), imputes
    missing values with the empty string, flattens the single-column result
    to one dimension and fits a ``CountVectorizer`` on the training half,
    printing the resulting count matrix. Returns nothing.
    """
    # The target halves are not needed for this demonstration.
    x_train, x_test, _y_train, _y_test = train_test_split(
        df[['SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    x_trainT = cat_imputer.transform(x_train)
    x_testT = cat_imputer.transform(x_test)

    # Flatten the one-column imputer output before handing it to the
    # vectorizer (this is the step the pipeline question is about).
    x_trainT = x_trainT.ravel()
    # The test half is flattened too, although this snippet only vectorizes
    # and prints the training half.
    x_testT = x_testT.ravel()

    count_vect = CountVectorizer(
        analyzer="word", tokenizer=None, preprocessor=None, stop_words=None,
        max_features=5000)
    x_trainT = count_vect.fit_transform(x_trainT)

    print(x_trainT.toarray())
    print('/////////////////////////', '\n')


transform_list()
对于 `SUBSTRING_4L_V3`,我需要在应用 `CountVectorizer()` 之前通过 `ravel()` 将其展平。
但是,我不熟悉如何在下面的 ML 管道中按顺序实现它们:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
class RavelTransformer(BaseEstimator, TransformerMixin):
    """Attempted flattening step, kept as posted in the question.

    NOTE: this is the version that fails. The class defines no ``transform``
    method, which is what the quoted TypeError complains about, so its logic
    is intentionally left unchanged here.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Kept as posted: the class defines no `ravel` method of its own,
        # and `fit` would be expected to return `self`.
        return self.ravel()
# Toy table used by the pipeline example. AGE, URBAN and NAME contain missing
# values (np.nan). SUBSTRING_4L, SUBSTRING_4L_V2 and SUBSTRING_4L_V3 carry the
# same 4-letter substrings in three encodings: token lists, lists holding one
# comma-joined string, and plain comma-joined strings.
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': [
        'urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban',
    ],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [
        ['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'],
        ['just', 'usto'],
    ],
    'SUBSTRING_4L_V2': [
        ['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'],
        ['just, usto'],
    ],
    'SUBSTRING_4L_V3': [
        'jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto',
    ],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': [
        'healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart',
        'healthy', 'cancer',
    ],
}

# One row per person; DISEASE is the column later used as the target.
df = pd.DataFrame(data)
# Hold out half of the rows; random_state=3 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    df[['AGE', 'NAME', 'URBAN', 'SUBSTRING_4L_V3']],
    df['DISEASE'],
    test_size=0.5,
    random_state=3,
)

# Numeric branch: median imputation followed by standardization.
transformer_num = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with '' and one-hot encode.
transformer_cat = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='')),
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# n-gram branch: fill gaps with '', then the 'ravel' step (intended to flatten
# the single column) feeds the token counter.
transformer_ngram = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='')),
        ('ravel', RavelTransformer()),
        (
            'countvectorizer',
            CountVectorizer(
                analyzer='word',
                tokenizer=None,
                preprocessor=None,
                stop_words=None,
                max_features=5000,
            ),
        ),
    ]
)

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_num, ['AGE']),
        ('cat', transformer_cat, ['NAME', 'URBAN']),
        ('ngram', transformer_ngram, ['SUBSTRING_4L_V3']),
    ]
)

ml_algo = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=4000)
model = make_pipeline(preprocessor, ml_algo)
model.fit(x_train, y_train)
# Scoring was left disabled in the posted snippet:
#print('Model score: %.3f' % model.score(x_test, y_test))
错误:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RavelTransformer()' (type <class '__main__.RavelTranformer'>) doesn't
错误消息告诉您的是:`RavelTransformer` 类中没有 `transform` 函数。
我的假设是您想做这样的事情:
class RavelTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that flattens its input to one dimension.

    Meant to sit between a step that emits a single-column 2-D array and a
    step that expects a 1-D sequence of documents (such as a text
    vectorizer).
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` flattened to a 1-D array."""
        # np.asarray lets array-likes without their own .ravel() (for
        # example a pandas DataFrame) pass through as well.
        return np.asarray(X).ravel()
在这里,您的 `RavelTransformer` 在 `fit` 步骤中不执行任何操作,而是在 `transform` 步骤中按预期将数据展平(ravel)。
ML 预处理步骤中对 n-gram 变量(例如 `SUBSTRING_4L_V3`)的处理一直给我带来一些问题。
我能够分别转换和标准化数值、分类和 n-gram 变量:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
# Toy table used by the examples below. AGE, URBAN and NAME contain missing
# values (np.nan). SUBSTRING_4L, SUBSTRING_4L_V2 and SUBSTRING_4L_V3 carry the
# same 4-letter substrings in three encodings: token lists, lists holding one
# comma-joined string, and plain comma-joined strings.
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': [
        'urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban',
    ],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [
        ['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'],
        ['just', 'usto'],
    ],
    'SUBSTRING_4L_V2': [
        ['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'],
        ['just, usto'],
    ],
    'SUBSTRING_4L_V3': [
        'jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto',
    ],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': [
        'healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart',
        'healthy', 'cancer',
    ],
}

# One row per person; DISEASE is the column later used as the target.
df = pd.DataFrame(data)
def transform_numerical():
    """Standardize the numeric AGE column and print it before and after.

    Splits the module-level ``df`` in half (``random_state=3``), fits a
    ``StandardScaler`` on the training half, applies it to both halves and
    prints each half in raw and scaled form. Returns nothing.
    """
    # The target halves are not needed for this demonstration.
    x_train, x_test, _y_train, _y_test = train_test_split(
        df[['AGE']], df['DISEASE'], test_size=0.5, random_state=3)

    # The scaler is fitted on the training half only and reused for the test half.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_trainT = scaler.transform(x_train)
    x_testT = scaler.transform(x_test)

    print(x_train)
    print(x_trainT)
    print()
    print(x_test)
    print(x_testT)
    print('/////////////////////////', '\n')


transform_numerical()
def transform_categorical():
    """Impute and one-hot encode the URBAN and NAME columns, printing results.

    Splits the module-level ``df`` in half (``random_state=3``), replaces
    missing values with the empty string, one-hot encodes the result and
    prints the encoded matrix next to the raw frame for each half.
    Returns nothing.
    """
    # The target halves are not needed for this demonstration.
    x_train, x_test, _y_train, _y_test = train_test_split(
        df[['URBAN', 'NAME']], df['DISEASE'], test_size=0.5, random_state=3)

    # Missing categories become '' so the encoder sees only strings.
    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    x_trainT = cat_imputer.transform(x_train)
    x_testT = cat_imputer.transform(x_test)

    # handle_unknown='ignore' lets the test half contain categories that
    # were not present in the training half.
    encoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
    encoder.fit(x_trainT)
    x_trainT = encoder.transform(x_trainT)
    x_testT = encoder.transform(x_testT)

    print(x_trainT.toarray())
    print(x_train)
    print()
    print(x_testT.toarray())
    print(x_test)
    print('/////////////////////////', '\n')


transform_categorical()
def transform_list():
    """Vectorize the SUBSTRING_4L_V3 column with a bag-of-words count.

    Splits the module-level ``df`` in half (``random_state=3``), imputes
    missing values with the empty string, flattens the single-column result
    to one dimension and fits a ``CountVectorizer`` on the training half,
    printing the resulting count matrix. Returns nothing.
    """
    # The target halves are not needed for this demonstration.
    x_train, x_test, _y_train, _y_test = train_test_split(
        df[['SUBSTRING_4L_V3']], df['DISEASE'], test_size=0.5, random_state=3)

    cat_imputer = SimpleImputer(strategy='constant', fill_value='')
    cat_imputer.fit(x_train)
    x_trainT = cat_imputer.transform(x_train)
    x_testT = cat_imputer.transform(x_test)

    # Flatten the one-column imputer output before handing it to the
    # vectorizer (this is the step the pipeline question is about).
    x_trainT = x_trainT.ravel()
    # The test half is flattened too, although this snippet only vectorizes
    # and prints the training half.
    x_testT = x_testT.ravel()

    count_vect = CountVectorizer(
        analyzer="word", tokenizer=None, preprocessor=None, stop_words=None,
        max_features=5000)
    x_trainT = count_vect.fit_transform(x_trainT)

    print(x_trainT.toarray())
    print('/////////////////////////', '\n')


transform_list()
对于 `SUBSTRING_4L_V3`,我需要在应用 `CountVectorizer()` 之前通过 `ravel()` 将其展平。
但是,我不熟悉如何在下面的 ML 管道中按顺序实现它们:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
class RavelTransformer(BaseEstimator, TransformerMixin):
    """Attempted flattening step, kept as posted in the question.

    NOTE: this is the version that fails. The class defines no ``transform``
    method, which is what the quoted TypeError complains about, so its logic
    is intentionally left unchanged here.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Kept as posted: the class defines no `ravel` method of its own,
        # and `fit` would be expected to return `self`.
        return self.ravel()
# Toy table used by the pipeline example. AGE, URBAN and NAME contain missing
# values (np.nan). SUBSTRING_4L, SUBSTRING_4L_V2 and SUBSTRING_4L_V3 carry the
# same 4-letter substrings in three encodings: token lists, lists holding one
# comma-joined string, and plain comma-joined strings.
data = {
    'AGE': [39, np.nan, 21, 13, 45, 26, np.nan, 48],
    'URBAN': [
        'urban', np.nan, 'urban', 'rural', 'urban', 'rural', 'urban', 'urban',
    ],
    'NAME': ['jack', 'juste', 'ann', np.nan, 'jack', 'gil', 'phil', 'justo'],
    'SUBSTRING_4L': [
        ['jack'], ['just', 'uste'], [], [], ['jack'], [], ['phil'],
        ['just', 'usto'],
    ],
    'SUBSTRING_4L_V2': [
        ['jack'], ['just, uste'], [], [], ['jack'], [], ['phil'],
        ['just, usto'],
    ],
    'SUBSTRING_4L_V3': [
        'jack', 'just, uste', '', '', 'jack', '', 'phil', 'just, usto',
    ],
    'SUBSTRING_5L': [[], ['juste'], [], [], [], [], [], ['justo']],
    'DISEASE': [
        'healthy', 'cancer', 'cancer', 'dementia', 'cancer', 'heart',
        'healthy', 'cancer',
    ],
}

# One row per person; DISEASE is the column later used as the target.
df = pd.DataFrame(data)
# Hold out half of the rows; random_state=3 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    df[['AGE', 'NAME', 'URBAN', 'SUBSTRING_4L_V3']],
    df['DISEASE'],
    test_size=0.5,
    random_state=3,
)

# Numeric branch: median imputation followed by standardization.
transformer_num = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with '' and one-hot encode.
transformer_cat = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='')),
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# n-gram branch: fill gaps with '', then the 'ravel' step (intended to flatten
# the single column) feeds the token counter.
transformer_ngram = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='')),
        ('ravel', RavelTransformer()),
        (
            'countvectorizer',
            CountVectorizer(
                analyzer='word',
                tokenizer=None,
                preprocessor=None,
                stop_words=None,
                max_features=5000,
            ),
        ),
    ]
)

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer_num, ['AGE']),
        ('cat', transformer_cat, ['NAME', 'URBAN']),
        ('ngram', transformer_ngram, ['SUBSTRING_4L_V3']),
    ]
)

ml_algo = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=4000)
model = make_pipeline(preprocessor, ml_algo)
model.fit(x_train, y_train)
# Scoring was left disabled in the posted snippet:
#print('Model score: %.3f' % model.score(x_test, y_test))
错误:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'RavelTransformer()' (type <class '__main__.RavelTranformer'>) doesn't
错误消息告诉您的是:`RavelTransformer` 类中没有 `transform` 函数。
我的假设是您想做这样的事情:
class RavelTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that flattens its input to one dimension.

    Meant to sit between a step that emits a single-column 2-D array and a
    step that expects a 1-D sequence of documents (such as a text
    vectorizer).
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` flattened to a 1-D array."""
        # np.asarray lets array-likes without their own .ravel() (for
        # example a pandas DataFrame) pass through as well.
        return np.asarray(X).ravel()
在这里,您的 `RavelTransformer` 在 `fit` 步骤中不执行任何操作,而是在 `transform` 步骤中按预期将数据展平(ravel)。