如何在 Python 中结合文本特征和分类特征?
How to combine text features and categorical features in Python?
我正在尝试构建一个管道,分别对文本特征和分类特征进行转换和编码,然后将它们组合起来输入到分类器中。目前我使用下面这个类从 DataFrame 中选择数据列:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured columns of its input.

    ``attribute_names`` is stored as given and used directly as the index
    into ``X`` inside ``transform`` (``X[attribute_names]``).
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Print a preview of the selected data and return the selection."""
        chosen = X[self.attribute_names]
        # Debug preview; relies on the selection exposing a ``head`` method.
        print(chosen.head())
        return chosen
然后使用它,我将以下 FeatureUnion 与管道相结合:
# Two-branch preprocessing: each branch selects its own columns and encodes
# them, and FeatureUnion places the branch outputs side by side, so both
# branches have to produce the same number of rows.
preprocessing = FeatureUnion([
# Text branch: column selection -> token counts -> TF-IDF weighting.
("text_pipeline", Pipeline([
("select_text", DataFrameSelector(text_features)),
# NOTE(review): CountVectorizer treats each item it iterates over as one
# document; presumably text_features must name a single column (a string,
# not a list) so the selector yields a 1-D Series -- confirm.
("count_vect", CountVectorizer()),
("word_count_to_vector", TfidfTransformer()),
])),
# Categorical branch: column selection -> dense (sparse=False) one-hot encoding.
("cat_pipeline", Pipeline([
# NOTE(review): OneHotEncoder expects 2-D input; presumably cat_features
# should be a list of column names so the selector yields a DataFrame -- confirm.
("select_cat", DataFrameSelector(cat_features)),
("cat_encoder", OneHotEncoder(sparse=False)),
])),
])
执行 full_pipeline.fit_transform(X_train) 时出现以下错误:
ValueError Traceback (most recent call last)
<ipython-input-69-6927adc0ed62> in <module>()
22 ])
23
---> 24 full_pipeline.fit_transform(X_train)
/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
298 Xt, fit_params = self._fit(X, y, **fit_params)
299 if hasattr(last_step, 'fit_transform'):
--> 300 return last_step.fit_transform(Xt, y, **fit_params)
301 elif last_step is None:
302 return Xt
/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
798 self._update_transformer_list(transformers)
799 if any(sparse.issparse(f) for f in Xs):
--> 800 Xs = sparse.hstack(Xs).tocsr()
801 else:
802 Xs = np.hstack(Xs)
/anaconda3/lib/python3.6/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
462
463 """
--> 464 return bmat([blocks], format=format, dtype=dtype)
465
466
/anaconda3/lib/python3.6/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
583 exp=brow_lengths[i],
584 got=A.shape[0]))
--> 585 raise ValueError(msg)
586
587 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 1, expected 19634.
而且我不知道我做错了什么。感谢任何帮助。
最后,我改用 scipy.sparse 中的 hstack 将两个稀疏矩阵拼接起来,问题得以解决。请参阅下面的代码:
from scipy.sparse import coo_matrix, hstack
from sklearn.preprocessing import OneHotEncoder
# Run the text pipeline on the 'Text' column by itself.
with_prod_tfidf = text_pipeline.fit_transform(with_prod['Text'])
# One-hot encode the categorical columns and join the result to the text
# features column-wise with scipy.sparse.hstack.
with_prod_all = hstack([with_prod_tfidf, OneHotEncoder().fit_transform(with_prod[cat_features])])
# Show the shape of the combined feature matrix.
print(with_prod_all.shape)
我正在尝试构建一个管道,分别对文本特征和分类特征进行转换和编码,然后将它们组合起来输入到分类器中。目前我使用下面这个类从 DataFrame 中选择数据列:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured columns of its input.

    ``attribute_names`` is stored as given and used directly as the index
    into ``X`` inside ``transform`` (``X[attribute_names]``).
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Print a preview of the selected data and return the selection."""
        chosen = X[self.attribute_names]
        # Debug preview; relies on the selection exposing a ``head`` method.
        print(chosen.head())
        return chosen
然后使用它,我将以下 FeatureUnion 与管道相结合:
# Two-branch preprocessing: each branch selects its own columns and encodes
# them, and FeatureUnion places the branch outputs side by side, so both
# branches have to produce the same number of rows.
preprocessing = FeatureUnion([
# Text branch: column selection -> token counts -> TF-IDF weighting.
("text_pipeline", Pipeline([
("select_text", DataFrameSelector(text_features)),
# NOTE(review): CountVectorizer treats each item it iterates over as one
# document; presumably text_features must name a single column (a string,
# not a list) so the selector yields a 1-D Series -- confirm.
("count_vect", CountVectorizer()),
("word_count_to_vector", TfidfTransformer()),
])),
# Categorical branch: column selection -> dense (sparse=False) one-hot encoding.
("cat_pipeline", Pipeline([
# NOTE(review): OneHotEncoder expects 2-D input; presumably cat_features
# should be a list of column names so the selector yields a DataFrame -- confirm.
("select_cat", DataFrameSelector(cat_features)),
("cat_encoder", OneHotEncoder(sparse=False)),
])),
])
执行 full_pipeline.fit_transform(X_train) 时出现以下错误:
ValueError Traceback (most recent call last)
<ipython-input-69-6927adc0ed62> in <module>()
22 ])
23
---> 24 full_pipeline.fit_transform(X_train)
/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
298 Xt, fit_params = self._fit(X, y, **fit_params)
299 if hasattr(last_step, 'fit_transform'):
--> 300 return last_step.fit_transform(Xt, y, **fit_params)
301 elif last_step is None:
302 return Xt
/anaconda3/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
798 self._update_transformer_list(transformers)
799 if any(sparse.issparse(f) for f in Xs):
--> 800 Xs = sparse.hstack(Xs).tocsr()
801 else:
802 Xs = np.hstack(Xs)
/anaconda3/lib/python3.6/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
462
463 """
--> 464 return bmat([blocks], format=format, dtype=dtype)
465
466
/anaconda3/lib/python3.6/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
583 exp=brow_lengths[i],
584 got=A.shape[0]))
--> 585 raise ValueError(msg)
586
587 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 1, expected 19634.
而且我不知道我做错了什么。感谢任何帮助。
最后,我改用 scipy.sparse 中的 hstack 将两个稀疏矩阵拼接起来,问题得以解决。请参阅下面的代码:
from scipy.sparse import coo_matrix, hstack
from sklearn.preprocessing import OneHotEncoder
# Run the text pipeline on the 'Text' column by itself.
with_prod_tfidf = text_pipeline.fit_transform(with_prod['Text'])
# One-hot encode the categorical columns and join the result to the text
# features column-wise with scipy.sparse.hstack.
with_prod_all = hstack([with_prod_tfidf, OneHotEncoder().fit_transform(with_prod[cat_features])])
# Show the shape of the combined feature matrix.
print(with_prod_all.shape)