SVM ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
SVM ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
请帮帮我!我不知道为什么当我尝试输入一些文本来检测分类时会发生这个错误。
这是我训练数据的代码。
如何解决?
# Training script from the question: split the data, vectorize the text,
# and fit a linear SVM. X and y are defined outside this snippet.
from sklearn.model_selection import train_test_split
# Hold out 33% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# First feature pipeline: raw token counts followed by TF-IDF weighting.
# NOTE(review): the X_train_tfidf produced here is overwritten by the
# TfidfVectorizer result further down, so the classifier never uses it.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Second feature pipeline: TfidfVectorizer fitted directly on the raw
# training text. This assignment replaces the X_train_tfidf computed above.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
# Fit the classifier on the TfidfVectorizer features and the training labels.
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)
if request.method == 'POST':
message = request.form['message']
data = [message]
vect = vectorizer.transform(data).toarray()
my_prediction = clf.predict(vect)
return render_template('result.html',prediction = my_prediction)
使用 `your_data.isnull().any()` 检查数据中是否有空值。如果有空值,请使用 `your_data = your_data.dropna()` 将其删除。
使用 `np.isfinite(your_data)` 检查您的数据是否包含 inf。如果有 inf 值,可以先用 `your_data = your_data.replace([np.inf, -np.inf], np.nan)` 将其替换为 NaN(`replace` 不会原地修改,需要把结果重新赋值),然后用 `your_data = your_data.dropna()` 删除它们。
将 `your_data` 替换为您实际使用的数据对象的名称,例如 `X`、`y` 或 `X_train_tfidf`。
此外,请参阅这个回答(this answer),以及问题评论中被标记为可能重复的那个帖子。
编辑:应要求添加示例。下面的示例直接对 X 和 y 做上述清理处理。
# Training script with NaN/inf cleaning applied before the split.
# X and y are defined outside this snippet.
import numpy as np
from sklearn.model_selection import train_test_split

# Treat +/-inf as missing values so a single NaN check covers both cases.
X = X.replace([np.inf, -np.inf], np.nan)
y = y.replace([np.inf, -np.inf], np.nan)
# Drop a row when EITHER its text or its label is missing. Calling
# X.dropna() and y.dropna() independently can remove different rows from
# each, leaving X and y with different lengths (train_test_split then
# raises) or with samples paired to the wrong labels.
# assumes X and y are pandas Series sharing the same index -- TODO confirm
valid_rows = X.notna() & y.notna()
X = X[valid_rows]
y = y[valid_rows]

# Hold out 33% of the cleaned samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# First feature pipeline: raw token counts followed by TF-IDF weighting.
# NOTE(review): the X_train_tfidf produced here is overwritten by the
# TfidfVectorizer result below, so the classifier never uses it; it is
# kept only so the names from the original snippet still exist.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Second feature pipeline: TfidfVectorizer fitted directly on the raw
# training text. This is the representation the classifier is trained on.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# Fit the linear SVM on the TF-IDF features and the aligned labels.
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)
if request.method == 'POST':
message = request.form['message']
data = [message]
vect = vectorizer.transform(data).toarray()
my_prediction = clf.predict(vect)
return render_template('result.html',prediction = my_prediction)
请帮帮我!我不知道为什么当我尝试输入一些文本来检测分类时会发生这个错误。
这是我训练数据的代码。 如何解决?
# Training script from the question: split the data, vectorize the text,
# and fit a linear SVM. X and y are defined outside this snippet.
from sklearn.model_selection import train_test_split
# Hold out 33% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# First feature pipeline: raw token counts followed by TF-IDF weighting.
# NOTE(review): the X_train_tfidf produced here is overwritten by the
# TfidfVectorizer result further down, so the classifier never uses it.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Second feature pipeline: TfidfVectorizer fitted directly on the raw
# training text. This assignment replaces the X_train_tfidf computed above.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
# Fit the classifier on the TfidfVectorizer features and the training labels.
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)
if request.method == 'POST':
message = request.form['message']
data = [message]
vect = vectorizer.transform(data).toarray()
my_prediction = clf.predict(vect)
return render_template('result.html',prediction = my_prediction)
使用 `your_data.isnull().any()` 检查数据中是否有空值。如果有空值,请使用 `your_data = your_data.dropna()` 将其删除。
使用 `np.isfinite(your_data)` 检查您的数据是否包含 inf。如果有 inf 值,可以先用 `your_data = your_data.replace([np.inf, -np.inf], np.nan)` 将其替换为 NaN(`replace` 不会原地修改,需要把结果重新赋值),然后用 `your_data = your_data.dropna()` 删除它们。
将 `your_data` 替换为您实际使用的数据对象的名称,例如 `X`、`y` 或 `X_train_tfidf`。
此外,请参阅这个回答(this answer),以及问题评论中被标记为可能重复的那个帖子。
编辑:应要求添加示例。下面的示例直接对 X 和 y 做上述清理处理。
# Training script with NaN/inf cleaning applied before the split.
# X and y are defined outside this snippet.
import numpy as np
from sklearn.model_selection import train_test_split

# Treat +/-inf as missing values so a single NaN check covers both cases.
X = X.replace([np.inf, -np.inf], np.nan)
y = y.replace([np.inf, -np.inf], np.nan)
# Drop a row when EITHER its text or its label is missing. Calling
# X.dropna() and y.dropna() independently can remove different rows from
# each, leaving X and y with different lengths (train_test_split then
# raises) or with samples paired to the wrong labels.
# assumes X and y are pandas Series sharing the same index -- TODO confirm
valid_rows = X.notna() & y.notna()
X = X[valid_rows]
y = y[valid_rows]

# Hold out 33% of the cleaned samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# First feature pipeline: raw token counts followed by TF-IDF weighting.
# NOTE(review): the X_train_tfidf produced here is overwritten by the
# TfidfVectorizer result below, so the classifier never uses it; it is
# kept only so the names from the original snippet still exist.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Second feature pipeline: TfidfVectorizer fitted directly on the raw
# training text. This is the representation the classifier is trained on.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# Fit the linear SVM on the TF-IDF features and the aligned labels.
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)
if request.method == 'POST':
message = request.form['message']
data = [message]
vect = vectorizer.transform(data).toarray()
my_prediction = clf.predict(vect)
return render_template('result.html',prediction = my_prediction)