我们可以使用 numpy 数组作为输入,在 make_column_transformer() 内部对文本数据执行 TfidfVectorizer() 吗?
Can we use a numpy array as input to perform TfidfVectorizer() on text data, inside of make_column_transformer()?
我正在尝试使用 OneHotEncoder()
和 TfidfVectorizer()
对我的训练数据执行多列转换,这是一个 numpy 数组。我正在尝试使用 make_column_transformer()
一次执行所有转换。 X_train 是我的输入数据。
输入数据
print(X_train.shape)
>>> (75117, 6)
示例实例
print(X_train[5,:])
>>> ['electrical_contractor_license-electrical_contractor_license-general_contractor_license-refrigeration_contractor_lic.'
'brennan_heating_company_inc' 'instal new electr boiler'
'single_family_/_duplex' 0.0 0]
列转换代码
column_trans = make_column_transformer(
(OneHotEncoder(sparse=False, handle_unknown='ignore'), [0, 1, 3]),
(TfidfVectorizer(min_df=1, stop_words='english', lowercase=False), [2]),
remainder='passthrough')
z = column_trans.fit_transform(X_train)
使用上面的代码,OneHotEncoder()
在列 (0, 1, 3)
上工作正常,但是当我为列 2
添加 TfidfVectorizer()
时,它会抛出以下错误。
TypeError: cannot use a string pattern on a bytes-like object
完整错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1167-68498e1c856a> in <module>
4 remainder='passthrough')
5
----> 6 z = column_trans.fit_transform(X_train)
7 print(z[0,:].shape)
8 print(z[0,:])
/opt/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
516 self._validate_remainder(X)
517
--> 518 result = self._fit_transform(X, y, _fit_transform_one)
519
520 if not result:
/opt/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
455 message=self._log_message(name, idx, len(transformers)))
456 for idx, (name, trans, column, weight) in enumerate(
--> 457 self._iter(fitted=fitted, replace_strings=True), 1))
458 except ValueError as e:
459 if "Expected 2D array, got 1D array instead" in str(e):
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1005 self._iterating = self._original_iterator is not None
1006
-> 1007 while self.dispatch_one_batch(iterator):
1008 pass
1009
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
726 with _print_elapsed_time(message_clsname, message):
727 if hasattr(transformer, 'fit_transform'):
--> 728 res = transformer.fit_transform(X, y, **fit_params)
729 else:
730 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1857 """
1858 self._check_params()
-> 1859 X = super().fit_transform(raw_documents)
1860 self._tfidf.fit(X)
1861 # X is already a transformed view of raw_documents so
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1218
1219 vocabulary, X = self._count_vocab(raw_documents,
-> 1220 self.fixed_vocabulary_)
1221
1222 if self.binary:
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
1129 for doc in raw_documents:
1130 feature_counter = {}
-> 1131 for feature in analyze(doc):
1132 try:
1133 feature_idx = vocabulary[feature]
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
103 doc = preprocessor(doc)
104 if tokenizer is not None:
--> 105 doc = tokenizer(doc)
106 if ngrams is not None:
107 if stop_words is not None:
TypeError: cannot use a string pattern on a bytes-like object
当我在 make_column_transformer()
之外使用它时它确实有效,但我使用 make_column_transformer()
而不是单独使用的原因是,如果我先使用 One hot encoding
然后再使用tfidf
,那么很可能独热编码器(one-hot encoder)生成的特征数量可能会有所不同,因此对 tfidf 的列索引进行硬编码可能不是一个好主意。
tf = TfidfVectorizer(min_df=1, stop_words='english')
n = tf.fit_transform(X_train[:,2])
n.toarray()
>>> array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
我遇到了同样的问题。解决方案是将:
(TfidfVectorizer(min_df=1, stop_words='english', lowercase=False), [2]),
替换为
(TfidfVectorizer(min_df=1, stop_words='english', lowercase=False), 2),
否则,TfidfVectorizer 在调用 fit
时会收到它无法处理的 2D 数据(形状 (#documents, 1)
)。
ColumnTransformer
的文档(documentation)中对此有说明:
columns: str, array-like of str, int, array-like of int, array-like of bool, slice or callable
Indexes the data on its second axis. Integers are interpreted as positional columns, while strings can reference DataFrame columns by name. A scalar string or int should be used where transformer expects X to be a 1d array-like (vector), otherwise a 2d array will be passed to the transformer. [...]
我正在尝试使用 OneHotEncoder()
和 TfidfVectorizer()
对我的训练数据执行多列转换,这是一个 numpy 数组。我正在尝试使用 make_column_transformer()
一次执行所有转换。 X_train 是我的输入数据。
输入数据
print(X_train.shape)
>>> (75117, 6)
示例实例
print(X_train[5,:])
>>> ['electrical_contractor_license-electrical_contractor_license-general_contractor_license-refrigeration_contractor_lic.'
'brennan_heating_company_inc' 'instal new electr boiler'
'single_family_/_duplex' 0.0 0]
列转换代码
column_trans = make_column_transformer(
(OneHotEncoder(sparse=False, handle_unknown='ignore'), [0, 1, 3]),
(TfidfVectorizer(min_df=1, stop_words='english', lowercase=False), [2]),
remainder='passthrough')
z = column_trans.fit_transform(X_train)
使用上面的代码,OneHotEncoder()
在列 (0, 1, 3)
上工作正常,但是当我为列 2
添加 TfidfVectorizer()
时,它会抛出以下错误。
TypeError: cannot use a string pattern on a bytes-like object
完整错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-1167-68498e1c856a> in <module>
4 remainder='passthrough')
5
----> 6 z = column_trans.fit_transform(X_train)
7 print(z[0,:].shape)
8 print(z[0,:])
/opt/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
516 self._validate_remainder(X)
517
--> 518 result = self._fit_transform(X, y, _fit_transform_one)
519
520 if not result:
/opt/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
455 message=self._log_message(name, idx, len(transformers)))
456 for idx, (name, trans, column, weight) in enumerate(
--> 457 self._iter(fitted=fitted, replace_strings=True), 1))
458 except ValueError as e:
459 if "Expected 2D array, got 1D array instead" in str(e):
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1005 self._iterating = self._original_iterator is not None
1006
-> 1007 while self.dispatch_one_batch(iterator):
1008 pass
1009
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/opt/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
726 with _print_elapsed_time(message_clsname, message):
727 if hasattr(transformer, 'fit_transform'):
--> 728 res = transformer.fit_transform(X, y, **fit_params)
729 else:
730 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1857 """
1858 self._check_params()
-> 1859 X = super().fit_transform(raw_documents)
1860 self._tfidf.fit(X)
1861 # X is already a transformed view of raw_documents so
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1218
1219 vocabulary, X = self._count_vocab(raw_documents,
-> 1220 self.fixed_vocabulary_)
1221
1222 if self.binary:
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
1129 for doc in raw_documents:
1130 feature_counter = {}
-> 1131 for feature in analyze(doc):
1132 try:
1133 feature_idx = vocabulary[feature]
/opt/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
103 doc = preprocessor(doc)
104 if tokenizer is not None:
--> 105 doc = tokenizer(doc)
106 if ngrams is not None:
107 if stop_words is not None:
TypeError: cannot use a string pattern on a bytes-like object
当我在 make_column_transformer()
之外使用它时它确实有效,但我使用 make_column_transformer()
而不是单独使用的原因是,如果我先使用 One hot encoding
然后再使用tfidf
,那么很可能独热编码器(one-hot encoder)生成的特征数量可能会有所不同,因此对 tfidf 的列索引进行硬编码可能不是一个好主意。
tf = TfidfVectorizer(min_df=1, stop_words='english')
n = tf.fit_transform(X_train[:,2])
n.toarray()
>>> array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
我遇到了同样的问题。解决方案是将:
(TfidfVectorizer(min_df=1, stop_words='english', lowercase=False), [2]),
替换为
(TfidfVectorizer(min_df=1, stop_words='english', lowercase=False), 2),
否则,TfidfVectorizer 在调用 fit
时会收到它无法处理的 2D 数据(形状 (#documents, 1)
)。
ColumnTransformer
的文档(documentation)中对此有说明:
columns: str, array-like of str, int, array-like of int, array-like of bool, slice or callable
Indexes the data on its second axis. Integers are interpreted as positional columns, while strings can reference DataFrame columns by name. A scalar string or int should be used where transformer expects X to be a 1d array-like (vector), otherwise a 2d array will be passed to the transformer. [...]