RandomForestRegressor: Input contains NaN, infinity or a value too large for dtype('float32') on kaggle learn
While working through Step 5 of the exercise "Categorical Variables" on Kaggle Learn, I get ValueError: Input contains NaN, infinity or a value too large for dtype('float32') during the prediction phase on the test set.
The complete Jupyter notebook is available here. The full code is shown at the end of this post.
The code is meant to prepare the submission dataset for the "Housing Prices Competition for Kaggle Learn Users".
The problem lies in the preprocessing of the datasets, test set included. I first used a SimpleImputer with the most_frequent strategy, then one-hot encoded the categorical variables of the datasets.
I noticed that some features have different data types between the X_train (and X_valid) datasets and X_test. Specifically, the columns ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea'] are of type int64 in the training data (X_train and X_valid) but of type float64 in the test data (X_test). I suspected the problem might lie there, but I could not fix it. I tried to convert the values with the following block:
# normalize datatypes columns
#for colName in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
# OH_X_train[colName] = OH_X_train[colName].astype('float64')
# OH_X_valid[colName] = OH_X_train[colName].astype('float64')
but it did not work. Any suggestions?
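For reference, a check along these lines lists the mismatched columns (a minimal sketch, assuming the X_train and X_test frames built in the code below, which hold the same columns after the column drops):
# List the columns whose dtypes differ between the training and test frames
mismatched = [col for col in X_train.columns
              if X_train[col].dtype != X_test[col].dtype]
print(mismatched)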
#### DATASETS LOAD ####
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')
# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)
# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)
#### IMPUTATION OF MISSING VALUES FOR X_TEST ####
from sklearn.impute import SimpleImputer
# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# Fill in the lines below: imputation
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test))
# Fill in the lines below: imputation removed column names; put them back
imputed_X_test.columns = X_test.columns
#### ONEHOT ENCODING FOR DATA #####
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(imputed_X_test[low_cardinality_cols]))
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
OH_cols_test.index = X_test.index
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
num_X_test = X_test.drop(object_cols, axis=1)
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)
##### BUILD MODEL AND CREATE SUBMISSION ####
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# normalize datatypes columns
#for colName in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
# OH_X_train[colName] = OH_X_train[colName].astype('float64')
# OH_X_valid[colName] = OH_X_train[colName].astype('float64')
# Build model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(OH_X_train, y_train)
preds_test = model.predict(OH_X_test)
# Save test predictions to file
#output = pd.DataFrame({'Id': OH_X_test.index,
#                       'SalePrice': preds_test})
#output.to_csv('submission.csv', index=False)
Here is the full error log:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-2d85be0f6b26> in <module>
74 model = RandomForestRegressor(n_estimators=100, random_state=0)
75 model.fit(OH_X_train, y_train)
---> 76 preds_test = model.predict(OH_X_test)
77
78 # Save test predictions to file
/opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in predict(self, X)
691 check_is_fitted(self, 'estimators_')
692 # Check data
--> 693 X = self._validate_X_predict(X)
694
695 # Assign chunk of trees to jobs
/opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _validate_X_predict(self, X)
357 "call `fit` before exploiting the model.")
358
--> 359 return self.estimators_[0]._validate_X_predict(X, check_input=True)
360
361 @property
/opt/conda/lib/python3.6/site-packages/sklearn/tree/tree.py in _validate_X_predict(self, X, check_input)
389 """Validate X whenever one tries to predict, apply, predict_proba"""
390 if check_input:
--> 391 X = check_array(X, dtype=DTYPE, accept_sparse="csr")
392 if issparse(X) and (X.indices.dtype != np.intc or
393 X.indptr.dtype != np.intc):
/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
540 if force_all_finite:
541 _assert_all_finite(array,
--> 542 allow_nan=force_all_finite == 'allow-nan')
543
544 if ensure_min_samples > 0:
/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan)
54 not allow_nan and not np.isfinite(X).all()):
55 type_err = 'infinity' if allow_nan else 'NaN, infinity'
---> 56 raise ValueError(msg_err.format(type_err, X.dtype))
57 # for object dtype data, we only check for NaNs (GH-13254)
58 elif X.dtype == np.dtype('object') and not allow_nan:
ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
As the error message says, the problem is caused by the NaN values in OH_X_test. These values are introduced by the concat statement, because the indexes of the dataframes being concatenated are misaligned.
I therefore added 3 fixes to the code below: look for the ###FIX markers.
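To see the mechanism in isolation, here is a minimal, self-contained sketch (toy frames, not the real data) of how pd.concat(axis=1) introduces NaN rows when the indexes do not line up:
import pandas as pd
# Stand-ins: 'left' keeps the original Id index from test.csv, while 'right'
# (like the imputer/encoder output) got a fresh 0..N-1 positional index.
left = pd.DataFrame({'num_feat': [10, 20]}, index=[1461, 1462])
right = pd.DataFrame({'oh_feat': [1.0, 0.0]}, index=[0, 1])
# concat aligns rows by index label, so the non-overlapping labels yield
# four rows, each half-filled with NaN.
print(pd.concat([left, right], axis=1))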
#### DATASETS LOAD ####
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the data
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')
# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)
# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)
#### IMPUTATION OF MISSING VALUES FOR X_TEST ####
from sklearn.impute import SimpleImputer
# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]
# Columns that will be one-hot encoded
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
# Fill in the lines below: imputation
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test))
# Fill in the lines below: imputation removed column names; put them back
imputed_X_test.columns = X_test.columns
imputed_X_test.index = X_test.index ###FIX
#### ONEHOT ENCODING FOR DATA #####
from sklearn.preprocessing import OneHotEncoder
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(imputed_X_test[low_cardinality_cols]))
# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index
OH_cols_test.index = imputed_X_test.index ####FIX
# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)
num_X_test = imputed_X_test.drop(object_cols, axis=1) ####FIX
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)
##### BUILD MODEL AND CREATE SUBMISSION ####
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# normalize datatypes columns
#for colName in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']:
# OH_X_train[colName] = OH_X_train[colName].astype('float64')
# OH_X_valid[colName] = OH_X_train[colName].astype('float64')
# Build model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(OH_X_train, y_train)
preds_test = model.predict(OH_X_test)
# Save test predictions to file
output = pd.DataFrame({'Id': OH_X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)
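As a final sanity check (a sketch assuming the variables from the fixed code above are in scope), you can verify that the test frame is NaN-free and row-aligned before calling predict:
# The fixed OH_X_test should contain no NaN and keep one row per test Id
assert OH_X_test.isnull().sum().sum() == 0
assert len(OH_X_test) == len(X_test)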