在 sklearn 中将 DataFrameMapper() 用于 PolynomialFeature() 时出错
Error in using DataFrameMapper() for PolynomialFeature() in sklearn
对于 housing data set,我正在尝试使用 sklearn_pandas 中的 DataFrameMapper() 在所选列上应用多项式特征。
我的代码:
from sklearn.preprocessing import PolynomialFeatures
from sklearn_pandas import DataFrameMapper
mapper = DataFrameMapper([
('houseAge_income', PolynomialFeatures(2)),
('median_income', PolynomialFeatures(2)),
(['latitude', 'housing_median_age', 'total_rooms', 'population', 'median_house_value',
'ocean_proximity']], None)
])
poly_feature = mapper.fit_transform(housing)
我遇到了这个错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-44-30679ae791ae> in <module>
11
12 # fit
---> 13 poly_feature = mapper.fit_transform(df)
e:\Anaconda3\lib\site-packages\sklearn_pandas\dataframe_mapper.py in fit_transform(self, X, y)
397 y the target vector relative to X, optional
398 """
--> 399 return self._transform(X, y, True)
e:\Anaconda3\lib\site-packages\sklearn_pandas\dataframe_mapper.py in _transform(self, X, y, do_fit)
308 with add_column_names_to_exception(columns):
309 if do_fit and hasattr(transformers, 'fit_transform'):
--> 310 Xt = _call_fit(transformers.fit_transform, Xt, y)
311 else:
312 if do_fit:
e:\Anaconda3\lib\site-packages\sklearn_pandas\pipeline.py in _call_fit(fit_method, X, y, **kwargs)
22 """
23 try:
---> 24 return fit_method(X, y, **kwargs)
25 except TypeError:
26 # fit takes only one argument
e:\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
688 if y is None:
689 # fit method of arity 1 (unsupervised transformation)
--> 690 return self.fit(X, **fit_params).transform(X)
691 else:
692 # fit method of arity 2 (supervised transformation)
e:\Anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y)
1510 self : instance
1511 """
-> 1512 n_samples, n_features = self._validate_data(
1513 X, accept_sparse=True).shape
1514 combinations = self._combinations(n_features, self.degree,
e:\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
418 f"requires y to be passed, but the target y is None."
419 )
--> 420 X = check_array(X, **check_params)
421 out = X
422 else:
e:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
e:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
617 # If input is 1D raise error
618 if array.ndim == 1:
--> 619 raise ValueError(
620 "Expected 2D array, got 1D array instead:\narray={}.\n"
621 "Reshape your data either using array.reshape(-1, 1) if "
ValueError: houseAge_income: Expected 2D array, got 1D array instead:
array=[341.3332 174.3294 377.3848 ... 28.9 33.6096 38.2176].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
当我尝试在 DataFrameMapper() 中使用
houseAge_income.reshape(-1, 1)
作为列选择器时,我遇到了另一个问题:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2645 try:
-> 2646 return self._engine.get_loc(key)
2647 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'houseAge_income.reshape(-1, 1)'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
5 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2646 return self._engine.get_loc(key)
2647 except KeyError:
-> 2648 return self._engine.get_loc(self._maybe_cast_indexer(key))
2649 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2650 if indexer.ndim > 1 or indexer.size > 1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'houseAge_income.reshape(-1, 1)'
谁能告诉我,我错过了什么?
我知道形状有问题但无法弄清楚。
没有帮助。
注意:houseAge_income 是通过以下代码创建的交互项:
housing['houseAge_income'] = housing['housing_median_age']*housing['median_income']
- 来自documentation
- 将列选择器指定为
'column'
(作为简单字符串)和 ['column']
(作为具有一个元素的列表)之间的区别在于传递给转换器(transformer)的数组的形状。在第一种情况下,传递的是一维数组;而在第二种情况下,传递的是只有一列的二维数组,即列向量。
- 必须使用相同类型的列选择器传递所有列。
- 在本例中,应使用 list 形式的列选择器,因为要保留的是一个由未转换(non-transformed)列组成的 list。
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn_pandas import DataFrameMapper
# load data
df = pd.read_csv('https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv')
# create houseAge_income
df['houseAge_income'] = df.housing_median_age.mul(df.median_income)
# configure mapper with all columns passed as lists
mapper = DataFrameMapper([(['houseAge_income'], PolynomialFeatures(2)),
(['median_income'], PolynomialFeatures(2)),
(['latitude', 'housing_median_age', 'total_rooms', 'population', 'median_house_value', 'ocean_proximity'], None)])
# fit
poly_feature = mapper.fit_transform(df)
# display(pd.DataFrame(poly_feature).head())
0 1 2 3 4 5 6 7 8 9 10 11
0 1 341.33 1.1651e+05 1 8.3252 69.309 37.88 41 880 322 4.526e+05 NEAR BAY
1 1 174.33 30391 1 8.3014 68.913 37.86 21 7099 2401 3.585e+05 NEAR BAY
2 1 377.38 1.4242e+05 1 7.2574 52.67 37.85 52 1467 496 3.521e+05 NEAR BAY
3 1 293.44 86108 1 5.6431 31.845 37.85 52 1274 558 3.413e+05 NEAR BAY
4 1 200 40001 1 3.8462 14.793 37.85 52 1627 565 3.422e+05 NEAR BAY
对于 housing data set,我正在尝试使用 sklearn_pandas 中的 DataFrameMapper() 在所选列上应用多项式特征。
我的代码:
from sklearn.preprocessing import PolynomialFeatures
from sklearn_pandas import DataFrameMapper
mapper = DataFrameMapper([
('houseAge_income', PolynomialFeatures(2)),
('median_income', PolynomialFeatures(2)),
(['latitude', 'housing_median_age', 'total_rooms', 'population', 'median_house_value',
'ocean_proximity']], None)
])
poly_feature = mapper.fit_transform(housing)
我遇到了这个错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-44-30679ae791ae> in <module>
11
12 # fit
---> 13 poly_feature = mapper.fit_transform(df)
e:\Anaconda3\lib\site-packages\sklearn_pandas\dataframe_mapper.py in fit_transform(self, X, y)
397 y the target vector relative to X, optional
398 """
--> 399 return self._transform(X, y, True)
e:\Anaconda3\lib\site-packages\sklearn_pandas\dataframe_mapper.py in _transform(self, X, y, do_fit)
308 with add_column_names_to_exception(columns):
309 if do_fit and hasattr(transformers, 'fit_transform'):
--> 310 Xt = _call_fit(transformers.fit_transform, Xt, y)
311 else:
312 if do_fit:
e:\Anaconda3\lib\site-packages\sklearn_pandas\pipeline.py in _call_fit(fit_method, X, y, **kwargs)
22 """
23 try:
---> 24 return fit_method(X, y, **kwargs)
25 except TypeError:
26 # fit takes only one argument
e:\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
688 if y is None:
689 # fit method of arity 1 (unsupervised transformation)
--> 690 return self.fit(X, **fit_params).transform(X)
691 else:
692 # fit method of arity 2 (supervised transformation)
e:\Anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y)
1510 self : instance
1511 """
-> 1512 n_samples, n_features = self._validate_data(
1513 X, accept_sparse=True).shape
1514 combinations = self._combinations(n_features, self.degree,
e:\Anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
418 f"requires y to be passed, but the target y is None."
419 )
--> 420 X = check_array(X, **check_params)
421 out = X
422 else:
e:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
e:\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
617 # If input is 1D raise error
618 if array.ndim == 1:
--> 619 raise ValueError(
620 "Expected 2D array, got 1D array instead:\narray={}.\n"
621 "Reshape your data either using array.reshape(-1, 1) if "
ValueError: houseAge_income: Expected 2D array, got 1D array instead:
array=[341.3332 174.3294 377.3848 ... 28.9 33.6096 38.2176].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
当我尝试在 DataFrameMapper() 中使用
houseAge_income.reshape(-1, 1)
作为列选择器时,我遇到了另一个问题:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2645 try:
-> 2646 return self._engine.get_loc(key)
2647 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'houseAge_income.reshape(-1, 1)'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
5 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2646 return self._engine.get_loc(key)
2647 except KeyError:
-> 2648 return self._engine.get_loc(self._maybe_cast_indexer(key))
2649 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2650 if indexer.ndim > 1 or indexer.size > 1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'houseAge_income.reshape(-1, 1)'
谁能告诉我,我错过了什么?
我知道形状有问题但无法弄清楚。
注意:houseAge_income 是通过以下代码创建的交互项:
housing['houseAge_income'] = housing['housing_median_age']*housing['median_income']
- 来自documentation
- 将列选择器指定为
'column'
(作为简单字符串)和 ['column']
(作为具有一个元素的列表)之间的区别在于传递给转换器(transformer)的数组的形状。在第一种情况下,传递的是一维数组;而在第二种情况下,传递的是只有一列的二维数组,即列向量。
- 必须使用相同类型的列选择器传递所有列。
- 在本例中,应使用 list 形式的列选择器,因为要保留的是一个由未转换(non-transformed)列组成的 list。
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn_pandas import DataFrameMapper
# load data
df = pd.read_csv('https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv')
# create houseAge_income
df['houseAge_income'] = df.housing_median_age.mul(df.median_income)
# configure mapper with all columns passed as lists
mapper = DataFrameMapper([(['houseAge_income'], PolynomialFeatures(2)),
(['median_income'], PolynomialFeatures(2)),
(['latitude', 'housing_median_age', 'total_rooms', 'population', 'median_house_value', 'ocean_proximity'], None)])
# fit
poly_feature = mapper.fit_transform(df)
# display(pd.DataFrame(poly_feature).head())
0 1 2 3 4 5 6 7 8 9 10 11
0 1 341.33 1.1651e+05 1 8.3252 69.309 37.88 41 880 322 4.526e+05 NEAR BAY
1 1 174.33 30391 1 8.3014 68.913 37.86 21 7099 2401 3.585e+05 NEAR BAY
2 1 377.38 1.4242e+05 1 7.2574 52.67 37.85 52 1467 496 3.521e+05 NEAR BAY
3 1 293.44 86108 1 5.6431 31.845 37.85 52 1274 558 3.413e+05 NEAR BAY
4 1 200 40001 1 3.8462 14.793 37.85 52 1627 565 3.422e+05 NEAR BAY