如何将字符串值传递给情感分析 RNN 序列模型并取回预测
How to pass a string value to a Sentiment Analysis RNN Sequential Model and get back a prediction
我使用自己的数据集重新创建了一个情感分析机器学习项目,并做了一些小的修改以缩短训练时间。我可以顺利地创建模型、编译、拟合并测试它,但问题在于:如何向模型传入一个新的字符串/文章,并让它返回该文本是正面还是负面的预测?希望有人能帮助我。
我在下面发布了我的代码供您查看。
class tensor_rnn():
    """Bidirectional-LSTM sentiment classifier over one-line articles.

    The constructor builds (or reloads) a two-column table of ``Article``
    text and ``Polarity`` label (1 for the first corpus directory, -1 for
    the second), tokenizes the text, and splits it into train/test arrays.

    Attributes set by ``__init__``:
        h_layers: the ``hidden_layers`` argument (stored; not read by ``RNN``).
        num_words: per-article word counts.
        features: the shuffled article/polarity DataFrame.
        max_len: character length of the longest article.
        tok: the fitted Keras ``Tokenizer``.
        vocab_size: number of distinct tokens + 1 (index 0 is the pad value).
        X, Y: padded integer sequences and one-hot labels.
        X_train, X_test, Y_train, Y_test: an 80/20 split of ``X``/``Y``.
    """

    def __init__(self, corp_paths, hidden_layers=3, loadfile=True):
        """Prepare the dataset and tokenizer.

        Args:
            corp_paths: two directories, ``[positive_dir, negative_dir]``.
                Only read when ``loadfile`` is false.
            hidden_layers: stored on the instance as ``h_layers``.
            loadfile: when true, read the cached ``Headlines.csv`` instead
                of walking ``corp_paths``; when false, rebuild the table
                from the directories and rewrite ``Headlines.csv``.
        """
        self.h_layers = hidden_layers
        self.num_words = []
        columns = ['Article', 'Polarity']
        if not loadfile:
            good = pd.DataFrame(self._read_corpus(corp_paths[0], 1), columns=columns)
            bad = pd.DataFrame(self._read_corpus(corp_paths[1], -1), columns=columns)
            for frame in (good, bad):
                self.num_words.extend(len(line.split()) for line in frame['Article'])
            self.features = pd.concat([good, bad]).reset_index(drop=True)
            self.features.to_csv('Headlines.csv', sep='|')
        else:
            self.features = pd.read_csv('Headlines.csv', sep='|')
            self.features['totalwords'] = self.features['Article'].str.count(' ') + 1
            self.num_words.extend(self.features['totalwords'].tolist())
        self.features = shuffle(self.features)

        articles = self.features['Article'].tolist()
        # max() on a list of strings picks the lexicographically greatest
        # one, not the longest, so the lengths are compared explicitly.
        self.max_len = max((len(article) for article in articles), default=0)

        # Fit on the whole vocabulary; capping the tokenizer at the number
        # of articles (as before) has no relation to the vocabulary size.
        self.tok = preprocessing.text.Tokenizer(split=' ')
        self.tok.fit_on_texts(self.features['Article'].values)
        # Token indices start at 1; index 0 is reserved for padding.
        self.vocab_size = len(self.tok.word_index) + 1

        sequences = self.tok.texts_to_sequences(self.features['Article'].values)
        self.X = preprocessing.sequence.pad_sequences(sequences)
        # Cast explicitly: newer pandas returns boolean dummy columns.
        self.Y = pd.get_dummies(self.features['Polarity']).values.astype('float32')
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            self.X, self.Y, test_size=0.20, random_state=36)

    @staticmethod
    def _read_corpus(directory, polarity):
        """Return ``[first_line, polarity]`` rows for the files in *directory*.

        Only the first line of each file is read. ``|`` is removed because
        it is the CSV separator, lines with three or fewer space-separated
        words are skipped, and non-ASCII characters become spaces.
        """
        rows = []
        for name in os.listdir(directory):
            with open(os.path.join(directory, name), 'r', errors='replace') as text_file:
                line = text_file.readline().replace('|', '')
            if len(line.split(' ')) > 3:
                line = ''.join(ch if ord(ch) < 128 else ' ' for ch in line)
                rows.append([line, polarity])
        return rows

    def RNN(self):
        """Build and compile the Embedding -> BiLSTM -> Dropout -> softmax model."""
        embed_dim = 128
        lstm_out = 128
        model = Sequential()
        model.add(Embedding(self.vocab_size, embed_dim, input_length=self.X.shape[1]))
        model.add(Bidirectional(CuDNNLSTM(lstm_out)))
        model.add(Dropout(0.2))
        model.add(Dense(2, activation='softmax'))
        opt = Adam(lr=0.0001, decay=1e-4)  # 1e-3
        # Labels are one-hot over two classes and the output is a softmax,
        # so the matching loss is categorical cross-entropy.
        model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
        return model

    def model_train(self):
        """Create the compiled model and store it on ``self.model``.

        Despite the name, no fitting happens here; see ``model_test``.
        """
        self.model = self.RNN()

    def model_test(self):
        """Fit ``self.model`` on the training split.

        Despite the name, this performs the training; the held-out
        ``X_test``/``Y_test`` arrays are not used here.
        """
        batch_size = 128
        # NOTE(review): patience (5) exceeds epochs (4), so early stopping
        # cannot trigger with these settings.
        self.model.fit(self.X_train, self.Y_train, epochs=4, batch_size=batch_size, verbose=2,
                       callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001,
                                                patience=5, verbose=2, mode='auto')],
                       validation_split=0.2)
if __name__ == "__main__":
    # Build the corpus directories with os.path.join: the literal '\neg'
    # contains a newline escape ('\n'), so the negative path was wrong.
    # With the constructor's default loadfile=True these directories are not
    # read; the data comes from Headlines.csv instead.
    paths = 'PATHS TO ARTICLES'
    a = tensor_rnn([os.path.join(paths, 'pos'), os.path.join(paths, 'neg')])
    a.model_train()  # builds and compiles the model
    a.model_test()   # fits the model on the training split
    a.model.save('RNNModelArticles.h5', include_optimizer=True)
您只需要像预处理训练文本那样,对要输入模型的新文本做同样的预处理(使用同一个已拟合的 tokenizer,并填充到相同的序列长度)。之后,再写一个预测方法,让它以与训练时相同的格式输出预测结果。因此,predict 方法可以这样写:
def predict(self, sequence):
    """Return the model's class probabilities for new text.

    Intended to be added as a method of ``tensor_rnn``. The text goes
    through the same steps as the training articles: ``|`` removed,
    non-ASCII characters replaced by spaces, tokenized with the fitted
    ``self.tok``, and padded to the training width ``self.X.shape[1]``.

    Args:
        sequence: one string, or an iterable of strings.

    Returns:
        The array from ``self.model.predict``, one row per input text with
        two softmax scores. The columns follow the ``pd.get_dummies``
        encoding of the training labels -- presumably column 0 is the -1
        (negative) class and column 1 the +1 (positive) class; confirm
        against ``pd.get_dummies(self.features['Polarity']).columns``.
    """
    texts = [sequence] if isinstance(sequence, str) else list(sequence)
    cleaned = [
        ''.join(ch if ord(ch) < 128 else ' ' for ch in text.replace('|', ''))
        for text in texts
    ]
    encoded = self.tok.texts_to_sequences(cleaned)
    # Pad/truncate to the width the Embedding layer was built with.
    preprocessed = preprocessing.sequence.pad_sequences(encoded, maxlen=self.X.shape[1])
    return self.model.predict(preprocessed, batch_size=None, verbose=0, steps=None)
这样是否解答了您的疑问?
我使用自己的数据集重新创建了一个情感分析机器学习项目,并做了一些小的修改以缩短训练时间。我可以顺利地创建模型、编译、拟合并测试它,但问题在于:如何向模型传入一个新的字符串/文章,并让它返回该文本是正面还是负面的预测?希望有人能帮助我。
我在下面发布了我的代码供您查看。
class tensor_rnn():
    """Bidirectional-LSTM sentiment classifier over one-line articles.

    The constructor builds (or reloads) a two-column table of ``Article``
    text and ``Polarity`` label (1 for the first corpus directory, -1 for
    the second), tokenizes the text, and splits it into train/test arrays.

    Attributes set by ``__init__``:
        h_layers: the ``hidden_layers`` argument (stored; not read by ``RNN``).
        num_words: per-article word counts.
        features: the shuffled article/polarity DataFrame.
        max_len: character length of the longest article.
        tok: the fitted Keras ``Tokenizer``.
        vocab_size: number of distinct tokens + 1 (index 0 is the pad value).
        X, Y: padded integer sequences and one-hot labels.
        X_train, X_test, Y_train, Y_test: an 80/20 split of ``X``/``Y``.
    """

    def __init__(self, corp_paths, hidden_layers=3, loadfile=True):
        """Prepare the dataset and tokenizer.

        Args:
            corp_paths: two directories, ``[positive_dir, negative_dir]``.
                Only read when ``loadfile`` is false.
            hidden_layers: stored on the instance as ``h_layers``.
            loadfile: when true, read the cached ``Headlines.csv`` instead
                of walking ``corp_paths``; when false, rebuild the table
                from the directories and rewrite ``Headlines.csv``.
        """
        self.h_layers = hidden_layers
        self.num_words = []
        columns = ['Article', 'Polarity']
        if not loadfile:
            good = pd.DataFrame(self._read_corpus(corp_paths[0], 1), columns=columns)
            bad = pd.DataFrame(self._read_corpus(corp_paths[1], -1), columns=columns)
            for frame in (good, bad):
                self.num_words.extend(len(line.split()) for line in frame['Article'])
            self.features = pd.concat([good, bad]).reset_index(drop=True)
            self.features.to_csv('Headlines.csv', sep='|')
        else:
            self.features = pd.read_csv('Headlines.csv', sep='|')
            self.features['totalwords'] = self.features['Article'].str.count(' ') + 1
            self.num_words.extend(self.features['totalwords'].tolist())
        self.features = shuffle(self.features)

        articles = self.features['Article'].tolist()
        # max() on a list of strings picks the lexicographically greatest
        # one, not the longest, so the lengths are compared explicitly.
        self.max_len = max((len(article) for article in articles), default=0)

        # Fit on the whole vocabulary; capping the tokenizer at the number
        # of articles (as before) has no relation to the vocabulary size.
        self.tok = preprocessing.text.Tokenizer(split=' ')
        self.tok.fit_on_texts(self.features['Article'].values)
        # Token indices start at 1; index 0 is reserved for padding.
        self.vocab_size = len(self.tok.word_index) + 1

        sequences = self.tok.texts_to_sequences(self.features['Article'].values)
        self.X = preprocessing.sequence.pad_sequences(sequences)
        # Cast explicitly: newer pandas returns boolean dummy columns.
        self.Y = pd.get_dummies(self.features['Polarity']).values.astype('float32')
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            self.X, self.Y, test_size=0.20, random_state=36)

    @staticmethod
    def _read_corpus(directory, polarity):
        """Return ``[first_line, polarity]`` rows for the files in *directory*.

        Only the first line of each file is read. ``|`` is removed because
        it is the CSV separator, lines with three or fewer space-separated
        words are skipped, and non-ASCII characters become spaces.
        """
        rows = []
        for name in os.listdir(directory):
            with open(os.path.join(directory, name), 'r', errors='replace') as text_file:
                line = text_file.readline().replace('|', '')
            if len(line.split(' ')) > 3:
                line = ''.join(ch if ord(ch) < 128 else ' ' for ch in line)
                rows.append([line, polarity])
        return rows

    def RNN(self):
        """Build and compile the Embedding -> BiLSTM -> Dropout -> softmax model."""
        embed_dim = 128
        lstm_out = 128
        model = Sequential()
        model.add(Embedding(self.vocab_size, embed_dim, input_length=self.X.shape[1]))
        model.add(Bidirectional(CuDNNLSTM(lstm_out)))
        model.add(Dropout(0.2))
        model.add(Dense(2, activation='softmax'))
        opt = Adam(lr=0.0001, decay=1e-4)  # 1e-3
        # Labels are one-hot over two classes and the output is a softmax,
        # so the matching loss is categorical cross-entropy.
        model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
        return model

    def model_train(self):
        """Create the compiled model and store it on ``self.model``.

        Despite the name, no fitting happens here; see ``model_test``.
        """
        self.model = self.RNN()

    def model_test(self):
        """Fit ``self.model`` on the training split.

        Despite the name, this performs the training; the held-out
        ``X_test``/``Y_test`` arrays are not used here.
        """
        batch_size = 128
        # NOTE(review): patience (5) exceeds epochs (4), so early stopping
        # cannot trigger with these settings.
        self.model.fit(self.X_train, self.Y_train, epochs=4, batch_size=batch_size, verbose=2,
                       callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001,
                                                patience=5, verbose=2, mode='auto')],
                       validation_split=0.2)
if __name__ == "__main__":
    # Build the corpus directories with os.path.join: the literal '\neg'
    # contains a newline escape ('\n'), so the negative path was wrong.
    # With the constructor's default loadfile=True these directories are not
    # read; the data comes from Headlines.csv instead.
    paths = 'PATHS TO ARTICLES'
    a = tensor_rnn([os.path.join(paths, 'pos'), os.path.join(paths, 'neg')])
    a.model_train()  # builds and compiles the model
    a.model_test()   # fits the model on the training split
    a.model.save('RNNModelArticles.h5', include_optimizer=True)
您只需要像预处理训练文本那样,对要输入模型的新文本做同样的预处理(使用同一个已拟合的 tokenizer,并填充到相同的序列长度)。之后,再写一个预测方法,让它以与训练时相同的格式输出预测结果。因此,predict 方法可以这样写:
def predict(self, sequence):
    """Return the model's class probabilities for new text.

    Intended to be added as a method of ``tensor_rnn``. The text goes
    through the same steps as the training articles: ``|`` removed,
    non-ASCII characters replaced by spaces, tokenized with the fitted
    ``self.tok``, and padded to the training width ``self.X.shape[1]``.

    Args:
        sequence: one string, or an iterable of strings.

    Returns:
        The array from ``self.model.predict``, one row per input text with
        two softmax scores. The columns follow the ``pd.get_dummies``
        encoding of the training labels -- presumably column 0 is the -1
        (negative) class and column 1 the +1 (positive) class; confirm
        against ``pd.get_dummies(self.features['Polarity']).columns``.
    """
    texts = [sequence] if isinstance(sequence, str) else list(sequence)
    cleaned = [
        ''.join(ch if ord(ch) < 128 else ' ' for ch in text.replace('|', ''))
        for text in texts
    ]
    encoded = self.tok.texts_to_sequences(cleaned)
    # Pad/truncate to the width the Embedding layer was built with.
    preprocessed = preprocessing.sequence.pad_sequences(encoded, maxlen=self.X.shape[1])
    return self.model.predict(preprocessed, batch_size=None, verbose=0, steps=None)
这样是否解答了您的疑问?