Categorical variables in sklearn pipeline with DictVectorizer
I want to apply a pipeline to both numeric and categorical variables, like this:
import numpy as np
import pandas as pd
from sklearn import linear_model, pipeline, preprocessing
from sklearn.feature_extraction import DictVectorizer
df = pd.DataFrame({'a':range(12), 'b':[1,2,3,1,2,3,1,2,3,3,1,2], 'c':['a', 'b', 'c']*4, 'd': ['m', 'f']*6})
y = df['a']
X = df[['b', 'c', 'd']]
I create a boolean index for the numeric columns:
numeric = ['b']
numeric_indices = np.array([(column in numeric) for column in X.columns], dtype = bool)
and one for the categorical columns:
categorical = ['c', 'd']
categorical_indices = np.array([(column in categorical) for column in X.columns], dtype = bool)
Then I build the pipeline:
regressor = linear_model.SGDRegressor()
encoder = DictVectorizer(sparse = False)
estimator = pipeline.Pipeline(steps = [
    ('feature_processing', pipeline.FeatureUnion(transformer_list = [
        # numeric
        ('numeric_variables_processing', pipeline.Pipeline(steps = [
            ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, numeric_indices])),
            ('scaling', preprocessing.StandardScaler(with_mean = 0.))
        ])),
        # categorical
        ('categorical_variables_processing', pipeline.Pipeline(steps = [
            ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, categorical_indices])),
            ('DictVectorizer', encoder)
        ])),
    ])),
    ('model_fitting', regressor)
])
but I get:
estimator.fit(X, y)
ValueError: could not convert string to float: 'f'
I understand that encoder.fit() has to be applied inside the pipeline, but I don't see how to do that. The alternative would be preprocessing.OneHotEncoder(), but again that requires converting the strings to floats first.
How can this be improved?
This is how I see it:
import numpy as np
import pandas as pd
from sklearn import linear_model, metrics, pipeline, preprocessing
df = pd.DataFrame({'a':range(12), 'b':[1,2,3,1,2,3,1,2,3,3,1,2], 'c':['a', 'b', 'c']*4, 'd': ['m', 'f']*6})
y = df.a
num = df[['b']]
cat = df[['c', 'd']]
from sklearn.feature_extraction import DictVectorizer
enc = DictVectorizer(sparse = False)
enc_data = enc.fit_transform(cat.T.to_dict().values())
crat = pd.DataFrame(enc_data, columns=enc.get_feature_names())
X = pd.concat([crat, num], axis=1)
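For reference, DictVectorizer names its one-hot columns as 'column=value', which is where the hard-coded cat_columns list below comes from; the printed output here is illustrative, based on the data above:

print(enc.get_feature_names())
# ['c=a', 'c=b', 'c=c', 'd=f', 'd=m']  -- one indicator column per (column, value) pair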
cat_columns = ['c=a', 'c=b', 'c=c', 'd=f', 'd=m']
cat_indices = np.array([(column in cat_columns) for column in X.columns], dtype = bool)
numeric_col = ['b']
num_indices = np.array([(column in numeric_col) for column in X.columns], dtype = bool)
reg = linear_model.SGDRegressor()
estimator = pipeline.Pipeline(steps = [
    ('feature_processing', pipeline.FeatureUnion(transformer_list = [
        ('categorical', preprocessing.FunctionTransformer(lambda data: data[:, cat_indices])),
        # numeric
        ('numeric', pipeline.Pipeline(steps = [
            ('select', preprocessing.FunctionTransformer(lambda data: data[:, num_indices])),
            ('scale', preprocessing.StandardScaler())
        ]))
    ])),
    ('model', reg)
])
estimator.fit(X, y)
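As a follow-up, here is a minimal sketch of how the DictVectorizer could stay inside the pipeline itself, so that encoder.fit() happens during estimator.fit(): select the categorical columns as a DataFrame and convert each row to a dict right before the DictVectorizer step. The to_dict step, the column-name selectors and validate=False are my additions, not part of the original post:

import pandas as pd
from sklearn import linear_model, pipeline, preprocessing
from sklearn.feature_extraction import DictVectorizer

df = pd.DataFrame({'a':range(12), 'b':[1,2,3,1,2,3,1,2,3,3,1,2], 'c':['a', 'b', 'c']*4, 'd': ['m', 'f']*6})
y = df['a']
X = df[['b', 'c', 'd']]

estimator = pipeline.Pipeline(steps=[
    ('feature_processing', pipeline.FeatureUnion(transformer_list=[
        # numeric: keep column 'b' as a one-column frame and scale it
        ('numeric', pipeline.Pipeline(steps=[
            ('select', preprocessing.FunctionTransformer(lambda data: data[['b']], validate=False)),
            ('scale', preprocessing.StandardScaler())
        ])),
        # categorical: turn each row of 'c' and 'd' into a dict, then let DictVectorizer one-hot encode it;
        # validate=False stops FunctionTransformer from coercing the string columns to float before the lambda runs
        ('categorical', pipeline.Pipeline(steps=[
            ('to_dict', preprocessing.FunctionTransformer(
                lambda data: data[['c', 'd']].to_dict(orient='records'), validate=False)),
            ('vectorize', DictVectorizer(sparse=False))
        ]))
    ])),
    ('model', linear_model.SGDRegressor())
])

estimator.fit(X, y)

On recent scikit-learn (0.20+) the same split can be written with ColumnTransformer and OneHotEncoder, which accept string columns directly; again a sketch under that version assumption:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

estimator = pipeline.Pipeline(steps=[
    ('features', ColumnTransformer([
        ('numeric', preprocessing.StandardScaler(), ['b']),
        ('categorical', OneHotEncoder(), ['c', 'd'])
    ])),
    ('model', linear_model.SGDRegressor())
])
estimator.fit(X, y)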