Jupyter Notebook 中的逻辑回归;输入包含 NaN、无穷大或对于 dtype('float64') 来说太大的值
Logistic Regression in Jupyter Notebook; Input contains NaN, infinity or a value too large for dtype('float64')
我想创建一个逻辑回归模型来预测关系是已知还是未知,我在数据集中将已知值设置为 1,将未知值设置为 0。我还添加了几个特征来训练数据和预测关系。
我运行了下面这段代码:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn.datasets
from sklearn.model_selection import train_test_split
import pandas as pd
df = pd.read_csv('boo.csv')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
y = pd.get_dummies(df.Relationship, prefix='Relationship')
X = pd.get_dummies(df, columns=['Relationship', 'Month','Year','Victim Age', 'Perpetrator Age', 'Victim Sex', 'Victim Race', 'Perpetrator Sex', 'Crime Type', 'Perpetrator Race'], drop_first = True )
X_train, X_test, y_train, y_test = train_test_split(df[['Month','Year','Victim Age', 'Perpetrator Age', 'Victim Sex', 'Victim Race', 'Perpetrator Sex', 'Crime Type', 'Perpetrator Race']], df.Relationship, test_size=0.1)
model = LogisticRegression()
np.isnan(X)
np.where(np.isnan(X))
np.nan_to_num(X)
model.fit(X, y)
我遇到了这个错误:
ValueError Traceback (most recent call last)
<ipython-input-101-68355fc70ed4> in <module>
15 np.where(np.isnan(X))
16 np.nan_to_num(X)
---> 17 model.fit(X, y)
~\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py in fit(self, X, y, sample_weight)
1342 _dtype = [np.float64, np.float32]
1343
-> 1344 X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,
1345 order="C",
1346 accept_large_sparse=solver != 'liblinear')
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
431 y = check_array(y, **check_y_params)
432 else:
--> 433 X, y = check_X_y(X, y, **check_params)
434 out = X, y
435
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
812 raise ValueError("y cannot be None")
813
--> 814 X = check_array(X, accept_sparse=accept_sparse,
815 accept_large_sparse=accept_large_sparse,
816 dtype=dtype, order=order, copy=copy,
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
661
662 if force_all_finite:
--> 663 _assert_all_finite(array,
664 allow_nan=force_all_finite == 'allow-nan')
665
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
101 not allow_nan and not np.isfinite(X).all()):
102 type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103 raise ValueError(
104 msg_err.format
105 (type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
我查看了几个 Stack Overflow 上的问题并尝试了其中的解决方案,但似乎都没有效果。只有在我尝试用 X 和 y 拟合模型时才会出现此错误。
您需要处理数据集中的 NaN 值。您的代码只是检查了 NaN 值,却没有真正修复它们。下面用一组示例数据来说明。
X = np.concatenate((np.arange(1,15),[np.NaN,np.NaN]))
print(np.isnan(X), np.where(np.isnan(X)), np.nan_to_num(X))
输出:
[False False False False False False False False False False False False
False False True True]
(array([14, 15]),)
[ 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 14. 0. 0.]
您还需要再多做一步:把结果重新赋值给变量。`np.nan_to_num` 默认不会就地(in-place)修改数组,而是返回一个新数组;如果不重新赋值,原数组不会有任何变化。
X = np.nan_to_num(X) # assign it
现在如果再次检查 X,其中就不会再有 NaN 值了,然后就可以继续训练模型。
我想创建一个逻辑回归模型来预测关系是已知还是未知,我在数据集中将已知值设置为 1,将未知值设置为 0。我还添加了几个特征来训练数据和预测关系。
我运行了下面这段代码:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn.datasets
from sklearn.model_selection import train_test_split
import pandas as pd
df = pd.read_csv('boo.csv')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
y = pd.get_dummies(df.Relationship, prefix='Relationship')
X = pd.get_dummies(df, columns=['Relationship', 'Month','Year','Victim Age', 'Perpetrator Age', 'Victim Sex', 'Victim Race', 'Perpetrator Sex', 'Crime Type', 'Perpetrator Race'], drop_first = True )
X_train, X_test, y_train, y_test = train_test_split(df[['Month','Year','Victim Age', 'Perpetrator Age', 'Victim Sex', 'Victim Race', 'Perpetrator Sex', 'Crime Type', 'Perpetrator Race']], df.Relationship, test_size=0.1)
model = LogisticRegression()
np.isnan(X)
np.where(np.isnan(X))
np.nan_to_num(X)
model.fit(X, y)
我遇到了这个错误:
ValueError Traceback (most recent call last)
<ipython-input-101-68355fc70ed4> in <module>
15 np.where(np.isnan(X))
16 np.nan_to_num(X)
---> 17 model.fit(X, y)
~\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py in fit(self, X, y, sample_weight)
1342 _dtype = [np.float64, np.float32]
1343
-> 1344 X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,
1345 order="C",
1346 accept_large_sparse=solver != 'liblinear')
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
431 y = check_array(y, **check_y_params)
432 else:
--> 433 X, y = check_X_y(X, y, **check_params)
434 out = X, y
435
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
812 raise ValueError("y cannot be None")
813
--> 814 X = check_array(X, accept_sparse=accept_sparse,
815 accept_large_sparse=accept_large_sparse,
816 dtype=dtype, order=order, copy=copy,
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
661
662 if force_all_finite:
--> 663 _assert_all_finite(array,
664 allow_nan=force_all_finite == 'allow-nan')
665
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X, allow_nan, msg_dtype)
101 not allow_nan and not np.isfinite(X).all()):
102 type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103 raise ValueError(
104 msg_err.format
105 (type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
我查看了几个 Stack Overflow 上的问题并尝试了其中的解决方案,但似乎都没有效果。只有在我尝试用 X 和 y 拟合模型时才会出现此错误。
您需要处理数据集中的 NaN 值。您的代码只是检查了 NaN 值,却没有真正修复它们。下面用一组示例数据来说明。
X = np.concatenate((np.arange(1,15),[np.NaN,np.NaN]))
print(np.isnan(X), np.where(np.isnan(X)), np.nan_to_num(X))
输出:
[False False False False False False False False False False False False
False False True True]
(array([14, 15]),)
[ 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. 11. 12. 13. 14. 0. 0.]
您还需要再多做一步:把结果重新赋值给变量。`np.nan_to_num` 默认不会就地(in-place)修改数组,而是返回一个新数组;如果不重新赋值,原数组不会有任何变化。
X = np.nan_to_num(X) # assign it
现在如果再次检查 X,其中就不会再有 NaN 值了,然后就可以继续训练模型。