使用 LSTM 节点训练 RNN

Question

这是我用 LSTM 节点训练 RNN 的代码：

# LSTM RNN with dropout for sequence classification
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pickle, numpy, pandas as pd

###################################### CONSTANTS #############################################

SEED = 7                        # Fixes random seed for reproducibility.
URL = 'ibcData.tsv'             # Specified dataset to gather data from.
SEPERATOR = '\t'                # Separator the dataset uses to divide data.
RANDOM_STATE = 1                # Pseudo-random number generator state used for random sampling.
TOP_WORDS = 5000                # Most used words in the dataset.
MAX_REVIEW_LENGTH = 500         # Length of each sentence being sent in (necessary).
EMBEDDING_VECTOR_LENGTH = 32    # The Embedding layer will have 32-length vectors to
                                # represent each word.
BATCH_SIZE = 64                 # Takes 64 sentences at a time and continually retrains RNN.
NUMBER_OF_EPOCHS = 3            # Fits RNN to more accurately guess the data's political bias.
DROPOUT = 0.2                   # Helps slow down overfitting of data (slower convergence rate)
RECURRENT_DROPOUT = 0.2         # Helps slow down overfitting of data when recurrently training

##############################################################################################

# fix random seed for reproducibility
numpy.random.seed(SEED)


# read the tab-separated dataset into two columns: the label and the sentence
readData = pd.read_csv(URL, header=None, names=['label', 'message'], sep=SEPERATOR)

# convert label to a numerical variable
readData['label_num'] = readData.label.map({'Liberal' : 0, 'Neutral': 0.5, 'Conservative' : 1})
X = readData.message    # Contains the dataset's actual sentences that were labeled
Y = readData.label_num  # Either 0.0, 0.5, or 1.0 depending on label mapped to

# load the dataset into training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=RANDOM_STATE)

# truncate and pad input sequences
# NOTE(review): str.zfill returns a new string and the result is discarded here,
# so X_train / X_test are left unchanged (still raw text) after these loops.
for sentence in X_train:
    sentence.zfill(MAX_REVIEW_LENGTH)
for sentence in X_test:
    sentence.zfill(MAX_REVIEW_LENGTH)

# create the model
model = Sequential()
model.add(Embedding(TOP_WORDS, EMBEDDING_VECTOR_LENGTH, input_length=MAX_REVIEW_LENGTH))
model.add(LSTM(100, recurrent_dropout=RECURRENT_DROPOUT, dropout=DROPOUT))   # Dropouts help prevent overfitting

model.add(Dense(2, activation='sigmoid'))                   # Layers deal with a 2D tensor, and output a 2D tensor
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=NUMBER_OF_EPOCHS, batch_size=BATCH_SIZE)

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

它使用一个 .tsv 文件进行训练，文件中的数据如下：

"Liberal","Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most ."

"Liberal", "Because it would not be worthwhile to bring a case for $ 30.22 , the arbitration clause would , as a practical matter , deny the Concepcions any relief and , more important , eliminate a class action that might punish AT&T for its pattern of fraudulent behavior ."

我尝试运行它，控制台输出了下面的内容。我不知道该如何修复，我的教授也没有在这项研究上给我提供帮助：

Layer (type)                 Output Shape              Param #
=================================================================
embedding_1 (Embedding)      (None, 500, 32)           160000
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202
=================================================================
Total params: 213,402
Trainable params: 213,402
Non-trainable params: 0
_________________________________________________________________
None
Traceback (most recent call last):
  File "LSTM-RNN.py", line 55, in <module>
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=NUMBER_OF_EPOCHS
, batch_size=BATCH_SIZE)
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\models.py", line 871, in f
it
    initial_epoch=initial_epoch)
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\engine\training.py", line
1525, in fit
    batch_size=batch_size)
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\engine\training.py", line
1379, in _standardize_user_data
    exception_prefix='input')
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\engine\training.py", line
144, in _standardize_input_data
    str(array.shape))
ValueError: Error when checking input: expected embedding_1_input to have shape (None, 50
0) but got array with shape (3244, 1)

Answer 1

主要问题似乎是 X 包含的是原始字符串，而嵌入层（Embedding）期望数据已经被编码为数字。Keras 的文本预处理（text preprocessing）工具可以完成这一步：

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

# Build a vocabulary limited to the TOP_WORDS most frequent words; this matches
# the input dimension given to the Embedding layer.
tokenizer = Tokenizer(num_words=TOP_WORDS)
tokenizer.fit_on_texts(readData.message)
# Turn every message into a list of integer word indices, then pad/truncate each
# list to exactly MAX_REVIEW_LENGTH entries so the array fits the Embedding input.
X = sequence.pad_sequences(tokenizer.texts_to_sequences(readData.message),
                           maxlen=MAX_REVIEW_LENGTH)  # shape (None, 500)

这会把每条消息编码为 500 个整数，其中每个单词对应一个唯一的整数。

修复这一点之后，"dense_1" 层又会报错。您网络的最后一层被指定为两个输出节点，但您使用的损失函数（binary_crossentropy）需要的是单独一列、编码为 0/1 的标签。我把该层改成只有一个输出节点，这样程序可以运行完成，但我怀疑用 0、0.5、1 配合二元交叉熵能否达到您想要的效果。我认为您更可能需要的是三个类别的独热（one-hot）编码加上 categorical_crossentropy，不过这超出了本问题的范围。

下面是在我这里可以运行的完整脚本。我只能用您提供的两条样本来运行它，但它确实运行完成了。

from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import os, pickle, numpy, pandas as pd
from keras.preprocessing.text import Tokenizer
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

################################### CONSTANTS ################################################
SEED = 7                        # Fixes random seed for reproducibility.
URL = 'ibcData.tsv'             # Specified dataset to gather data from.
SEPERATOR = '\t'                # Separator the dataset uses to divide data.
RANDOM_STATE = 1                # Pseudo-random number generator state used for random sampling.
TOP_WORDS = 5000                # Vocabulary size: only the most used words are kept.
MAX_REVIEW_LENGTH = 500         # Length of each sentence being sent in (necessary).
EMBEDDING_VECTOR_LENGTH = 32    # The Embedding layer will have 32-length vectors to
                                # represent each word.
BATCH_SIZE = 64                 # Takes 64 sentences at a time and continually retrains RNN.
NUMBER_OF_EPOCHS = 3            # Fits RNN to more accurately guess the data's political bias.

# fix random seed for reproducibility
numpy.random.seed(SEED)


# read the tab-separated dataset into two columns: the label and the sentence
readData = pd.read_csv(URL, header=None, names=['label', 'message'], sep=SEPERATOR)

# encode every message as a sequence of integer word indices; the vocabulary is
# capped at TOP_WORDS so the indices stay inside the Embedding layer's input range
tokenizer = Tokenizer(num_words=TOP_WORDS)
tokenizer.fit_on_texts(readData.message)
# pad/truncate each sequence to MAX_REVIEW_LENGTH so it matches input_length below
X = sequence.pad_sequences(tokenizer.texts_to_sequences(readData.message),
                           maxlen=MAX_REVIEW_LENGTH)  # shape (None, 500)

# convert label to a numerical variable
readData['label_num'] = readData.label.map({'Liberal' : 0, 'Neutral': 0.5, 'Conservative' : 1})
Y = numpy.array(readData.label_num)  # Either 0.0, 0.5, or 1.0 depending on label mapped to


# load the dataset into training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=RANDOM_STATE)

# create the model
model = Sequential()
model.add(Embedding(TOP_WORDS, EMBEDDING_VECTOR_LENGTH, input_length=MAX_REVIEW_LENGTH))
model.add(LSTM(100))
# single sigmoid output node so the layer matches the one-column target that
# binary_crossentropy is given here
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=NUMBER_OF_EPOCHS, batch_size=BATCH_SIZE)

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

然后我收到以下输出：

Using TensorFlow backend.
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding_1 (Embedding)      (None, 500, 32)           160000
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101
=================================================================
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None
Train on 1 samples, validate on 1 samples
Epoch 1/3
1/1 [==============================] - 0s - loss: 0.6953 - acc: 0.0000e+00 - val_loss: 0.6814 - val_acc: 1.0000
Epoch 2/3
1/1 [==============================] - 0s - loss: 0.6814 - acc: 1.0000 - val_loss: 0.6670 - val_acc: 1.0000
Epoch 3/3
1/1 [==============================] - 0s - loss: 0.6670 - acc: 1.0000 - val_loss: 0.6516 - val_acc: 1.0000

希望对您有所帮助。

使用 LSTM 节点训练 RNN

Training RNN with LSTM Nodes

python

scikit-learn

keras

keras-layer