How to set up an LSTM to use n-grams instead of sequence length?

I currently have an LSTM that takes a fixed sequence length as input, but that only lets the LSTM make a prediction when the input length equals the sequence length used. I would like the LSTM to work with n-grams instead, so that I can predict complete words.

Example:

So for this input (sequence length = 10):

Input: "no sweet t" 
Output (5 options): "['he ', 'o ', 'aste ', 'ime ', 'errible ']" 

What I would like is for the input to be:

Input: "No sweet"
Output: ['tea', 'taste', 'but', 'the', 'and']

That way I can predict complete words and am not limited by the sequence length.

My current code:

#Setup
import numpy as np
import tensorflow as tf
from numpy.core.multiarray import dtype
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import LSTM, Dropout
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Dense, Activation, Dropout, RepeatVector
from tensorflow.keras.optimizers import RMSprop
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pickle
import sys
import heapq
import seaborn as sns
from pylab import rcParams



#Loading the data
path = 'text_2.txt'
text = open(path, encoding='utf8').read().lower()
# print ('Corpus length: ',len(text))

#Preprocessing
#Finding all the unique characters in the corpus
chars = sorted(list(set(text)))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

print ("unique chars: ",len(chars))

#Cutting the corpus into chunks of 10 chars, spacing the sequences by 3 characters
#We will additionally store the next character (the one we need to predict) for every sequence

SEQUENCE_LENGTH = 10
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - SEQUENCE_LENGTH, step):
    sentences.append(text[i:i+SEQUENCE_LENGTH])
    next_chars.append(text[i+SEQUENCE_LENGTH])
print ('num training examples: ',len(sentences))

#Generating features and labels.
#Using previously generated sequences and characters that need to be predicted to create one-hot encoded vectors

X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1



#Building the model

model = Sequential()
model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))


#Training
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

history = model.fit(X, y, validation_split=0.05, batch_size=128, epochs=1, shuffle=True).history

#Predicting

#Testing
def prepare_input(text):
    x = np.zeros((1, SEQUENCE_LENGTH, len(chars)))
    for t, char in enumerate(text):
        x[0, t, char_indices[char]] = 1
    return x
#Each input sequence is SEQUENCE_LENGTH (10) chars long, so the tensor has shape (1, SEQUENCE_LENGTH, len(chars))



#The sample function
#This function allows us to ask our model what are the next probable characters (The heap simplifies the job)
def sample(preds, top_n = 3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return heapq.nlargest(top_n, range(len(preds)), preds.take)


#Prediction function
def predict_completion(text):
    original_text = text
    generalised = text
    completion = ''
    while True:
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_char = indices_char[next_index]

        text = text[1:] + next_char
        completion += next_char

        if len(original_text + completion) + 2 > len(original_text) and next_char == ' ':
            return completion

#This method wraps everything and allows us to predict multiple completions
def predict_completions(text, n = 3):
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    return [indices_char[idx] + predict_completion(text[1:] + indices_char[idx]) for idx in next_indices]







To feed the LSTM sequences of words rather than sequences of characters, and to have it predict words, the text has to be tokenized differently. Instead of splitting the document into sequences of n characters, it is split into sequences of m words, and instead of a vocabulary of distinct characters (the index mappings), you build a vocabulary of the distinct words in the corpus. I have modified the code you provided to illustrate this idea for your review. A few parameters were changed for testing purposes (I created my own text file for evaluation), but they can be adjusted for your document.
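
For a rough illustration of the difference, here is a minimal sketch of the two tokenizations side by side (the corpus string below is only a toy example, not from your data):

corpus = "no sweet tea"  # toy corpus, for illustration only
chars = sorted(set(corpus))          # character vocabulary
words = sorted(set(corpus.split()))  # word vocabulary
char_indices = {c: i for i, c in enumerate(chars)}
word_indices = {w: i for i, w in enumerate(words)}
print(char_indices)  # {' ': 0, 'a': 1, 'e': 2, 'n': 3, 'o': 4, 's': 5, 't': 6, 'w': 7}
print(word_indices)  # {'no': 0, 'sweet': 1, 'tea': 2}

The full modified listing follows: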

# Setup
import re

import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import LSTM, Dropout
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import RMSprop
import matplotlib


matplotlib.use('agg')

import heapq

# Loading the data
path = 'text_2.txt'
text = open(path, encoding='utf8').read().lower()
# print ('Corpus length: ',len(text))

# Preprocessing
# Split the corpus into words and build the vocabulary of distinct words
text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
words = text.split()  # the corpus as a running sequence of words
vocab = sorted(set(words))  # distinct words
# define vocabulary
word_indices = dict((c, i) for i, c in enumerate(vocab))
indices_words = dict((i, c) for i, c in enumerate(vocab))


print("unique words: ", len(vocab))

# Cutting the corpus into chunks of SEQUENCE_LENGTH words, spacing the sequences by `step` words
# We will additionally store the next word (the one we need to predict) for every sequence

SEQUENCE_LENGTH = 3
step = 3
sentences = []
next_words = []
for i in range(0, len(words) - SEQUENCE_LENGTH, step):
    sentences.append(words[i:i + SEQUENCE_LENGTH])
    next_words.append(words[i + SEQUENCE_LENGTH])
print('num training examples: ', len(sentences))

# Generating features and labels.
# Using the previously generated sequences and the words that need to be predicted to create one-hot encoded vectors

X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(vocab)), dtype=bool)
y = np.zeros((len(sentences), len(vocab)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        X[i, t, word_indices[word]] = 1
    y[i, word_indices[next_words[i]]] = 1

# Building the model

model = Sequential()
model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, len(vocab))))
model.add(Dense(len(vocab)))
model.add(Activation('softmax'))

# Training
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

history = model.fit(X, y, validation_split=0.05, batch_size=128, epochs=1, shuffle=True).history


# Predicting

# Testing
def prepare_input(text):
    x = np.zeros((1, SEQUENCE_LENGTH, len(vocab)))
    # only the most recent SEQUENCE_LENGTH words are used; shorter inputs stay zero-padded
    for t, word in enumerate(text.split()[-SEQUENCE_LENGTH:]):
        x[0, t, word_indices[word]] = 1
    return x


# The input tensor has shape (1, SEQUENCE_LENGTH, len(vocab))


# The sample function
# This function allows us to ask our model what the most probable next words are (the heap simplifies the job)
def sample(preds, top_n=3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return heapq.nlargest(top_n, range(len(preds)), preds.take)


# Prediction function
def predict_completion(text):
    prediction = []
    while len(prediction) < SEQUENCE_LENGTH:
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_word = indices_words[next_index]

        text = " ".join([text, next_word])
        prediction += [next_word]

    return " ".join(prediction)


# This method wraps everything and allows us to predict multiple completions
def predict_completions(text, n=3):
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    # append each candidate word to the input, then keep generating from there
    return [indices_words[idx] + " " + predict_completion(" ".join(text.split() + [indices_words[idx]])) for idx in next_indices]

print(predict_completion("hello"))

The text file (text_2.txt) used to evaluate the tokenization step and to verify that it feeds into the model correctly is:

hello. how are you today? i am doing well thank you for asking. i like sweet tea.
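
As a quick sanity check of the preprocessing on this file, the snippet below should report the counts shown in the comments (illustrative only):

cleaned = "hello how are you today i am doing well thank you for asking i like sweet tea"
tokens = cleaned.split()
print(len(tokens))               # 17 words after removing punctuation
print(len(sorted(set(tokens))))  # 15 distinct words in the vocabulary
print(tokens[0:3], "->", tokens[3])  # ['hello', 'how', 'are'] -> you (one training pair with SEQUENCE_LENGTH = 3)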

Update

I may have misread part of the question. As for how to train on n-grams, you can compute the n-grams (e.g. bigrams and trigrams) and add them to the training data, using the word that follows each n-gram as the label. The tokenization update stays the same, but the preprocessing of the corpus is modified:

def compute_n_gram(words, n):
    # compute n-grams
    return [words[i:i+n] for i in range(len(words)-n+1)]


def compute_n_gram_with_next_word(words, n):
    n_gram = compute_n_gram(words, n)[:-1]
    next_words = [words[i+n] for i in range(len(n_gram))]
    return n_gram, next_words

words = text.split()
vocab = sorted(list(set(words)))  # determine distinct words from corpus
vocab_size = len(vocab)

bigrams, next_word_bigrams = compute_n_gram_with_next_word(words, 2)
trigrams, next_word_trigrams = compute_n_gram_with_next_word(words, 3)
sentences = bigrams + trigrams
next_words = next_word_bigrams + next_word_trigrams
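
For example, on a short word list the helpers behave like this (expected output shown in the comments):

sample_words = "i like sweet tea".split()
print(compute_n_gram(sample_words, 2))
# [['i', 'like'], ['like', 'sweet'], ['sweet', 'tea']]
bigrams_ex, next_words_ex = compute_n_gram_with_next_word(sample_words, 2)
print(bigrams_ex, next_words_ex)
# [['i', 'like'], ['like', 'sweet']] ['sweet', 'tea']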

So now, instead of using every three-word subsequence to predict the fourth word, we use bigrams and trigrams to predict the third or fourth word respectively. The same n-gram logic applies if you decide to go to higher orders (four-grams, five-grams, and so on). SEQUENCE_LENGTH must be greater than or equal to the largest n-gram size you use, since it controls the size of the input (any shorter input is padded with zeros). With the current prediction code it also controls the size of the output, since the generated text cannot be longer than that (unless you switch to a windowed approach). The fully updated code is similar to the previous snippet, but for completeness:

# Setup
import re

import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import LSTM, Dropout
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import RMSprop
import matplotlib

matplotlib.use('agg')

import heapq

def compute_n_gram(words, n):
    # compute n-grams
    return [words[i:i+n] for i in range(len(words)-n+1)]


def compute_n_gram_with_next_word(words, n):
    n_gram = compute_n_gram(words, n)[:-1]
    next_words = [words[i+n] for i in range(len(n_gram))]
    return n_gram, next_words

# Loading the data
path = 'text_2.txt'
text = open(path, encoding='utf8').read().lower()
# print ('Corpus length: ',len(text))

# Preprocessing
# Split the corpus into words and build the vocabulary of distinct words
text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
words = text.split()
vocab = sorted(list(set(words)))  # determine distinct words from corpus
vocab_size = len(vocab)
# define vocabulary
word_indices = dict((c, i) for i, c in enumerate(vocab))
indices_words = dict((i, c) for i, c in enumerate(vocab))


print("unique chars: ", len(words))

# Computing the bigrams and trigrams of the corpus
# We will additionally store the next word (the one we need to predict) for every n-gram

SEQUENCE_LENGTH = 10
bigrams, next_word_bigrams = compute_n_gram_with_next_word(words, 2)
trigrams, next_word_trigrams = compute_n_gram_with_next_word(words, 3)
sentences = bigrams + trigrams
next_words = next_word_bigrams + next_word_trigrams
print('num training examples: ', len(sentences))

# Generating features and labels.
# Using the previously generated n-grams and the words that need to be predicted to create one-hot encoded vectors

X = np.zeros((len(sentences), SEQUENCE_LENGTH, len(vocab)), dtype=bool)
y = np.zeros((len(sentences), len(vocab)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        X[i, t, word_indices[word]] = 1
    y[i, word_indices[next_words[i]]] = 1

# Building the model

model = Sequential()
model.add(LSTM(128, input_shape=(SEQUENCE_LENGTH, vocab_size)))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))

# Training
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

history = model.fit(X, y, validation_split=0.05, batch_size=128, epochs=1, shuffle=True).history


# Predicting

# Testing
def prepare_input(text):
    x = np.zeros((1, SEQUENCE_LENGTH, vocab_size))
    # only the most recent SEQUENCE_LENGTH words are used; shorter inputs stay zero-padded
    for t, word in enumerate(text.split()[-SEQUENCE_LENGTH:]):
        x[0, t, word_indices[word]] = 1
    return x


# The input tensor has shape (1, SEQUENCE_LENGTH, vocab_size)


# The sample function
# This function allows us to ask our model what the most probable next words are (the heap simplifies the job)
def sample(preds, top_n=3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return heapq.nlargest(top_n, range(len(preds)), preds.take)


# Prediction function
def predict_completion(text):
    prediction = []
    while len(prediction) < SEQUENCE_LENGTH:
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_word = indices_words[next_index]

        text = " ".join([text, next_word])
        prediction += [next_word]

    return " ".join(prediction)


# This method wraps everything and allows us to predict multiple completions
def predict_completions(text, n=3):
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    # append each candidate word to the input, then keep generating from there
    return [indices_words[idx] + " " + predict_completion(" ".join(text.split() + [indices_words[idx]])) for idx in next_indices]

print(predict_completion("hello"))
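
To see the zero-padding mentioned above, a quick check (assuming the input words appear in the vocabulary, as "sweet" and "tea" do for text_2.txt):

x = prepare_input("sweet tea")
print(x.shape)           # (1, SEQUENCE_LENGTH, vocab_size)
print(x[0].sum(axis=1))  # the first two timesteps are 1, the remaining eight are all zeros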