情感分析如何得到结果的概率?
Sentiment Analysis how to get the probability of the result?
所以我这里有这个简单的情绪分析应用程序
到目前为止,我只能打印出 Positive/negative/neutral 这样的分类结果
但我希望它也能打印出该句子属于正面(positive)情绪的概率。
像这样:
Positive
85.2%
谁能帮帮我?
这是我的代码
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score
# Single lemmatizer instance shared by every call to normalizer().
wordnet_lemmatizer = WordNetLemmatizer()
# Load the labelled tweets; the script below reads the 'text' and
# 'airline_sentiment' columns from this frame.
df = pd.read_csv('Tweets.csv')
_STOP_WORDS = None  # lazily-built cache of the English stop-word set


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # stopwords.words() re-reads the corpus and returns a list; caching it
        # as a frozenset avoids repeating that work and gives O(1) membership.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def normalizer(comment):
    """Clean one raw tweet into a space-joined string of lemmas.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, English stop words are dropped, and each remaining
    token is lemmatized with the module-level ``wordnet_lemmatizer``.

    Args:
        comment: Raw text of a single tweet.

    Returns:
        The normalized tokens joined by single spaces (an empty string when
        no token survives the filtering).
    """
    stop_words = _english_stop_words()
    tokens = re.sub("[^a-zA-Z]", " ", comment).lower().split()
    lemmas = [
        wordnet_lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(lemmas)
# Shuffle the rows, then separate the label column from the cleaned text.
df = shuffle(df)
y = df['airline_sentiment']
x = df.text.apply(normalizer)

# Turn the cleaned tweets into bag-of-words count vectors and hold out a
# validation split.
vectorizer = CountVectorizer()
x_vectorized = vectorizer.fit_transform(x)
train_x, val_x, train_y, val_y = train_test_split(x_vectorized, y)

# GridSearchCV clones the estimator and fits it itself for every candidate C,
# so fitting `regressor` beforehand would only be wasted work; the unfitted
# estimator is passed straight to the search.
regressor = LogisticRegression(multi_class='multinomial', solver='newton-cg')
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
gs_clf = GridSearchCV(regressor, params, n_jobs=1, cv=5)
gs_clf = gs_clf.fit(train_x, train_y)
# Best model found by the search, used for all predictions below.
model = gs_clf.best_estimator_

y_pred = model.predict(val_x)
print(accuracy_score(val_y, y_pred))

# Optional extra metrics; they need y_pred, so they belong after the
# prediction above if they are ever enabled.
#_f1 = f1_score(val_y, y_pred, average='micro')
#_confusion = confusion_matrix(val_y, y_pred)
#__precision = precision_score(val_y, y_pred, average='micro')
#_recall = recall_score(val_y, y_pred, average='micro')
#_statistics = {'f1_score': _f1,
# 'confusion_matrix': _confusion,
# 'precision': __precision,
# 'recall': _recall
# }

# Run the sample sentence through the same normalizer as the training text so
# that inference-time features match what the model was trained on.
test_feature = vectorizer.transform([normalizer('The Movie is good')])
print(model.predict(test_feature))
我当前的输出是:
0.7846994535519126
['positive']
# predict_proba returns one row per sample with one probability per class;
# the columns follow the order of model.classes_.
# For a single sentence, call it on that sentence's features instead, e.g.
# model.predict_proba(test_feature).max() * 100 gives the percentage of the
# predicted class.
prediction_probablities = model.predict_proba(val_x)
更多信息:
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
所以我这里有这个简单的情绪分析应用程序
到目前为止,我只能打印出 Positive/negative/neutral 这样的分类结果,但我希望它也能打印出该句子属于正面(positive)情绪的概率。
像这样:
Positive
85.2%
谁能帮帮我?
这是我的代码
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score
# Single lemmatizer instance shared by every call to normalizer().
wordnet_lemmatizer = WordNetLemmatizer()
# Load the labelled tweets; the script below reads the 'text' and
# 'airline_sentiment' columns from this frame.
df = pd.read_csv('Tweets.csv')
_STOP_WORDS = None  # lazily-built cache of the English stop-word set


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # stopwords.words() re-reads the corpus and returns a list; caching it
        # as a frozenset avoids repeating that work and gives O(1) membership.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def normalizer(comment):
    """Clean one raw tweet into a space-joined string of lemmas.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, English stop words are dropped, and each remaining
    token is lemmatized with the module-level ``wordnet_lemmatizer``.

    Args:
        comment: Raw text of a single tweet.

    Returns:
        The normalized tokens joined by single spaces (an empty string when
        no token survives the filtering).
    """
    stop_words = _english_stop_words()
    tokens = re.sub("[^a-zA-Z]", " ", comment).lower().split()
    lemmas = [
        wordnet_lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]
    return ' '.join(lemmas)
# Shuffle the rows, then separate the label column from the cleaned text.
df = shuffle(df)
y = df['airline_sentiment']
x = df.text.apply(normalizer)

# Turn the cleaned tweets into bag-of-words count vectors and hold out a
# validation split.
vectorizer = CountVectorizer()
x_vectorized = vectorizer.fit_transform(x)
train_x, val_x, train_y, val_y = train_test_split(x_vectorized, y)

# GridSearchCV clones the estimator and fits it itself for every candidate C,
# so fitting `regressor` beforehand would only be wasted work; the unfitted
# estimator is passed straight to the search.
regressor = LogisticRegression(multi_class='multinomial', solver='newton-cg')
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
gs_clf = GridSearchCV(regressor, params, n_jobs=1, cv=5)
gs_clf = gs_clf.fit(train_x, train_y)
# Best model found by the search, used for all predictions below.
model = gs_clf.best_estimator_

y_pred = model.predict(val_x)
print(accuracy_score(val_y, y_pred))

# Optional extra metrics; they need y_pred, so they belong after the
# prediction above if they are ever enabled.
#_f1 = f1_score(val_y, y_pred, average='micro')
#_confusion = confusion_matrix(val_y, y_pred)
#__precision = precision_score(val_y, y_pred, average='micro')
#_recall = recall_score(val_y, y_pred, average='micro')
#_statistics = {'f1_score': _f1,
# 'confusion_matrix': _confusion,
# 'precision': __precision,
# 'recall': _recall
# }

# Run the sample sentence through the same normalizer as the training text so
# that inference-time features match what the model was trained on.
test_feature = vectorizer.transform([normalizer('The Movie is good')])
print(model.predict(test_feature))
我当前的输出是:
0.7846994535519126
['positive']
# predict_proba returns one row per sample with one probability per class;
# the columns follow the order of model.classes_.
# For a single sentence, call it on that sentence's features instead, e.g.
# model.predict_proba(test_feature).max() * 100 gives the percentage of the
# predicted class.
prediction_probablities = model.predict_proba(val_x)
更多信息:
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html