Using CountVectorizer with Pipeline and ColumnTransformer and getting AttributeError: 'numpy.ndarray' object has no attribute 'lower'
Using CountVectorizer with Pipeline and ColumnTransformer and getting AttributeError: 'numpy.ndarray' object has no attribute 'lower'
我正在尝试将 CountVectorizer()
与 Pipeline
和 ColumnTransformer
一起使用。因为 CountVectorizer()
产生稀疏矩阵,所以我使用 FunctionTransformer
来确保 ColumnTransformer
在组合结果矩阵时可以 hstack
正确。
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from typing import Callable
# Toy dataset: one categorical column, two free-text columns, one numeric column
df = pd.DataFrame({
    'col_for_ohe': ['a', 'b', 'c'],
    'col_for_countvectorizer_1': ['Hi Tom', 'How you been Tom', 'Hi you'],
    'col_for_countvectorizer_2': ['It is hot', 'hot coffee', 'I want some coffee'],
    'num_col': [1, 2, 3],
})
# Use FunctionTransformer to ensure dense matrix
def tf_text(X, vectorizer_tf: Callable):
    """Vectorize ``X`` with ``vectorizer_tf`` and return a dense array.

    Args:
        X: Input handed unchanged to ``vectorizer_tf.fit_transform``.
        vectorizer_tf: Object exposing ``fit_transform``; the value it
            returns must provide ``toarray()`` (e.g. a scipy sparse matrix).

    Returns:
        Whatever ``toarray()`` returns for the vectorized input.
    """
    # NOTE: ``fit_transform`` runs on every call, so the vectorizer is
    # re-fitted on whatever ``X`` is passed in.
    X_vect_ = vectorizer_tf.fit_transform(X)
    return X_vect_.toarray()
# Dense-output wrapper around a CountVectorizer instance
tf_transformer = FunctionTransformer(
    func=tf_text,
    kw_args={'vectorizer_tf': CountVectorizer()},
)

# Transformation Pipelines
# Text branch: impute, then vectorize.
# NOTE: both text columns are selected as a list below, so this branch
# receives 2D input; this is the setup that raises the AttributeError
# discussed in the surrounding text.
tf_transformer_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('tf', tf_transformer),
])

# Categorical branch: impute, then one-hot encode into a dense array.
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output` -- confirm against the installed version.
ohe_transformer_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Route columns to their branch; unlisted columns pass through untouched.
transformer = ColumnTransformer(
    [
        ('cat_ohe', ohe_transformer_pipe, ['col_for_ohe']),
        ('cat_tf', tf_transformer_pipe, ['col_for_countvectorizer_1', 'col_for_countvectorizer_2']),
    ],
    remainder='passthrough',
)
transformed_df = transformer.fit_transform(df)
我得到 AttributeError: 'numpy.ndarray' object has no attribute 'lower'。我已经看过这个 question,并怀疑 CountVectorizer()
是罪魁祸首但不确定如何解决它(上一个问题没有使用 ColumnTransformer
)。我偶然发现了一个 DenseTransformer
,我希望我可以使用它来代替 FunctionTransformer
,但不幸的是我的公司不支持它。
我认为你真的应该再回顾一下你的基础知识。你的问题告诉我你对功能的理解不够好,无法有效地实现它。当你自己做了足够多的研究以免让自己难堪时再问一次。
在 CountVectorizer 中,传递 lowercase=False。
Imo,首先要考虑的是CountVectorizer()
需要一维输入;您的示例不起作用,因为插补返回一个 2D numpy 数组,这意味着您需要添加自定义处理才能使其起作用。
那么你还应该考虑在 ColumnTransformer()
中使用 CountVectorizer()
实例(再次需要一维输入)作为转换器时你应该如何传递 transformers'列:
columns: str, array-like of str, int, array-like of int, array-like of bool, slice or callable
Indexes the data on its second axis. Integers are interpreted as positional columns, while strings can reference DataFrame columns by name. A scalar string or int should be used where transformer expects X to be a 1d array-like (vector), otherwise a 2d array will be passed to the transformer. [...]
这将有助于解释我作为可能的解决方案贴出的代码段。
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from typing import Callable
from sklearn.base import BaseEstimator, TransformerMixin
# Toy dataset: one categorical column, two free-text columns, one numeric column
df = pd.DataFrame({
    'col_for_ohe': ['a', 'b', 'c'],
    'col_for_countvectorizer_1': ['Hi Tom', 'How you been Tom', 'Hi you'],
    'col_for_countvectorizer_2': ['It is hot', 'hot coffee', 'I want some coffee'],
    'num_col': [1, 2, 3],
})
class DimTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that wraps its input in a pandas DataFrame."""

    def __init__(self):
        pass

    def fit(self, *_):
        """Return ``self`` unchanged; any arguments are ignored."""
        return self

    def transform(self, X, *_):
        """Return ``pd.DataFrame(X)``; extra positional arguments are ignored."""
        return pd.DataFrame(X)
# Use FunctionTransformer to ensure dense matrix
def tf_text(X, vectorizer_tf: Callable):
    """Vectorize ``X`` with ``vectorizer_tf`` and return a dense array.

    Args:
        X: Input handed unchanged to ``vectorizer_tf.fit_transform``.
        vectorizer_tf: Object exposing ``fit_transform``; the value it
            returns must provide ``toarray()`` (e.g. a scipy sparse matrix).

    Returns:
        Whatever ``toarray()`` returns for the vectorized input.
    """
    # NOTE: ``fit_transform`` runs on every call, so the vectorizer is
    # re-fitted on whatever ``X`` is passed in.
    X_vect_ = vectorizer_tf.fit_transform(X)
    return X_vect_.toarray()
# Dense-output wrapper around a CountVectorizer instance
tf_transformer = FunctionTransformer(
    func=tf_text,
    kw_args={'vectorizer_tf': CountVectorizer()},
)

# Transformation Pipelines
# Text branch: impute, wrap the result in a DataFrame, then vectorize each
# column on its own.
tf_transformer_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('dt', DimTransformer()),
    # Columns are addressed by position, each with a scalar selector.
    ('ct', ColumnTransformer(
        transformers=[
            ('tf1', tf_transformer, 0),
            ('tf2', tf_transformer, 1),
        ],
    )),
])

# Categorical branch: impute, then one-hot encode into a dense array.
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output` -- confirm against the installed version.
ohe_transformer_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Route columns to their branch; unlisted columns pass through untouched.
transformer = ColumnTransformer(
    [
        ('cat_ohe', ohe_transformer_pipe, ['col_for_ohe']),
        ('cat_tf', tf_transformer_pipe, ['col_for_countvectorizer_1', 'col_for_countvectorizer_2']),
    ],
    remainder='passthrough',
)
transformed_df = transformer.fit_transform(df)
也就是说,我添加了一个转换器,它只是把 SimpleImputer
实例返回的数组转换为 DataFrame。然后 - 最重要的是 - 因为似乎无法对前两个步骤('imputer'
和 'dt'
)产生的 2D 输入应用矢量化,所以我进一步添加 ColumnTransformer
将矢量化分为两个并行步骤(每列一个矢量化)。请注意,此时列是按位置引用的,因为列名可能已更改。当然,这是一个自定义的解决方案,但至少可以提供一些提示。
鉴于您实际上没有缺失值,您可以通过将其与以下输出进行比较来发现它确实有效:
# Sanity check: vectorize the two text columns of the raw frame directly
dt = DimTransformer().fit_transform(df)
ct = ColumnTransformer(
    transformers=[
        ('tf1', tf_transformer, 1),
        ('tf2', tf_transformer, 2),
    ],
)
ct.fit_transform(dt)
# Print the vocabulary held by each fitted transformer's vectorizer
for name in ('tf1', 'tf2'):
    print(ct.named_transformers_[name].kw_args['vectorizer_tf'].vocabulary_)
并注意到之前输出中从第四列到倒数第二列(即受 'cat_tf'
应用影响的列)确实与下面的输出重合。
这里有几个 post 重点介绍了 CountVectorizer
在 ColumnTransformer
实例中的用法,尽管它们没有考虑事先对数据集进行插补。
- Vectorize only text column and standardize numeric column using pipeline
我正在尝试将 CountVectorizer()
与 Pipeline
和 ColumnTransformer
一起使用。因为 CountVectorizer()
产生稀疏矩阵,所以我使用 FunctionTransformer
来确保 ColumnTransformer
在组合结果矩阵时可以 hstack
正确。
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from typing import Callable
# Toy dataset: one categorical column, two free-text columns, one numeric column
df = pd.DataFrame({
    'col_for_ohe': ['a', 'b', 'c'],
    'col_for_countvectorizer_1': ['Hi Tom', 'How you been Tom', 'Hi you'],
    'col_for_countvectorizer_2': ['It is hot', 'hot coffee', 'I want some coffee'],
    'num_col': [1, 2, 3],
})
# Use FunctionTransformer to ensure dense matrix
def tf_text(X, vectorizer_tf: Callable):
    """Vectorize ``X`` with ``vectorizer_tf`` and return a dense array.

    Args:
        X: Input handed unchanged to ``vectorizer_tf.fit_transform``.
        vectorizer_tf: Object exposing ``fit_transform``; the value it
            returns must provide ``toarray()`` (e.g. a scipy sparse matrix).

    Returns:
        Whatever ``toarray()`` returns for the vectorized input.
    """
    # NOTE: ``fit_transform`` runs on every call, so the vectorizer is
    # re-fitted on whatever ``X`` is passed in.
    X_vect_ = vectorizer_tf.fit_transform(X)
    return X_vect_.toarray()
# Dense-output wrapper around a CountVectorizer instance
tf_transformer = FunctionTransformer(
    func=tf_text,
    kw_args={'vectorizer_tf': CountVectorizer()},
)

# Transformation Pipelines
# Text branch: impute, then vectorize.
# NOTE: both text columns are selected as a list below, so this branch
# receives 2D input; this is the setup that raises the AttributeError
# discussed in the surrounding text.
tf_transformer_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('tf', tf_transformer),
])

# Categorical branch: impute, then one-hot encode into a dense array.
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output` -- confirm against the installed version.
ohe_transformer_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Route columns to their branch; unlisted columns pass through untouched.
transformer = ColumnTransformer(
    [
        ('cat_ohe', ohe_transformer_pipe, ['col_for_ohe']),
        ('cat_tf', tf_transformer_pipe, ['col_for_countvectorizer_1', 'col_for_countvectorizer_2']),
    ],
    remainder='passthrough',
)
transformed_df = transformer.fit_transform(df)
我得到 AttributeError: 'numpy.ndarray' object has no attribute 'lower'。我已经看过这个 question,并怀疑 CountVectorizer()
是罪魁祸首但不确定如何解决它(上一个问题没有使用 ColumnTransformer
)。我偶然发现了一个 DenseTransformer
,我希望我可以使用它来代替 FunctionTransformer
,但不幸的是我的公司不支持它。
我认为你真的应该再回顾一下你的基础知识。你的问题告诉我你对功能的理解不够好,无法有效地实现它。当你自己做了足够多的研究以免让自己难堪时再问一次。
在 CountVectorizer 中,传递 lowercase=False。
Imo,首先要考虑的是CountVectorizer()
需要一维输入;您的示例不起作用,因为插补返回一个 2D numpy 数组,这意味着您需要添加自定义处理才能使其起作用。
那么你还应该考虑在 ColumnTransformer()
中使用 CountVectorizer()
实例(再次需要一维输入)作为转换器时你应该如何传递 transformers'列:
columns: str, array-like of str, int, array-like of int, array-like of bool, slice or callable
Indexes the data on its second axis. Integers are interpreted as positional columns, while strings can reference DataFrame columns by name. A scalar string or int should be used where transformer expects X to be a 1d array-like (vector), otherwise a 2d array will be passed to the transformer. [...]
这将有助于解释我作为可能的解决方案贴出的代码段。
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from typing import Callable
from sklearn.base import BaseEstimator, TransformerMixin
# Toy dataset: one categorical column, two free-text columns, one numeric column
df = pd.DataFrame({
    'col_for_ohe': ['a', 'b', 'c'],
    'col_for_countvectorizer_1': ['Hi Tom', 'How you been Tom', 'Hi you'],
    'col_for_countvectorizer_2': ['It is hot', 'hot coffee', 'I want some coffee'],
    'num_col': [1, 2, 3],
})
class DimTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that wraps its input in a pandas DataFrame."""

    def __init__(self):
        pass

    def fit(self, *_):
        """Return ``self`` unchanged; any arguments are ignored."""
        return self

    def transform(self, X, *_):
        """Return ``pd.DataFrame(X)``; extra positional arguments are ignored."""
        return pd.DataFrame(X)
# Use FunctionTransformer to ensure dense matrix
def tf_text(X, vectorizer_tf: Callable):
    """Vectorize ``X`` with ``vectorizer_tf`` and return a dense array.

    Args:
        X: Input handed unchanged to ``vectorizer_tf.fit_transform``.
        vectorizer_tf: Object exposing ``fit_transform``; the value it
            returns must provide ``toarray()`` (e.g. a scipy sparse matrix).

    Returns:
        Whatever ``toarray()`` returns for the vectorized input.
    """
    # NOTE: ``fit_transform`` runs on every call, so the vectorizer is
    # re-fitted on whatever ``X`` is passed in.
    X_vect_ = vectorizer_tf.fit_transform(X)
    return X_vect_.toarray()
# Dense-output wrapper around a CountVectorizer instance
tf_transformer = FunctionTransformer(
    func=tf_text,
    kw_args={'vectorizer_tf': CountVectorizer()},
)

# Transformation Pipelines
# Text branch: impute, wrap the result in a DataFrame, then vectorize each
# column on its own.
tf_transformer_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('dt', DimTransformer()),
    # Columns are addressed by position, each with a scalar selector.
    ('ct', ColumnTransformer(
        transformers=[
            ('tf1', tf_transformer, 0),
            ('tf2', tf_transformer, 1),
        ],
    )),
])

# Categorical branch: impute, then one-hot encode into a dense array.
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output` -- confirm against the installed version.
ohe_transformer_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Route columns to their branch; unlisted columns pass through untouched.
transformer = ColumnTransformer(
    [
        ('cat_ohe', ohe_transformer_pipe, ['col_for_ohe']),
        ('cat_tf', tf_transformer_pipe, ['col_for_countvectorizer_1', 'col_for_countvectorizer_2']),
    ],
    remainder='passthrough',
)
transformed_df = transformer.fit_transform(df)
也就是说,我添加了一个转换器,它只是把 SimpleImputer
实例返回的数组转换为 DataFrame。然后 - 最重要的是 - 因为似乎无法对前两个步骤('imputer'
和 'dt'
)产生的 2D 输入应用矢量化,所以我进一步添加 ColumnTransformer
将矢量化分为两个并行步骤(每列一个矢量化)。请注意,此时列是按位置引用的,因为列名可能已更改。当然,这是一个自定义的解决方案,但至少可以提供一些提示。
鉴于您实际上没有缺失值,您可以通过将其与以下输出进行比较来发现它确实有效:
# Sanity check: vectorize the two text columns of the raw frame directly
dt = DimTransformer().fit_transform(df)
ct = ColumnTransformer([
    ('tf1', tf_transformer, 1),
    ('tf2', tf_transformer, 2),
])
ct.fit_transform(dt)
# Print the vocabulary held by each fitted transformer's vectorizer.
# The two print calls were fused onto a single line, which is a syntax
# error; they are issued one after the other here.
for name in ('tf1', 'tf2'):
    print(ct.named_transformers_[name].kw_args['vectorizer_tf'].vocabulary_)
并注意到之前输出中从第四列到倒数第二列(即受 'cat_tf'
应用影响的列)确实与下面的输出重合。
这里有几个 post 重点介绍了 CountVectorizer
在 ColumnTransformer
实例中的用法,尽管它们没有考虑事先对数据集进行插补。
- Vectorize only text column and standardize numeric column using pipeline