Create a representation of questions using an LSTM via a pre-trained word embedding such as GloVe

I am new to LSTMs and Python. My goal is to represent a sentence using an LSTM. Can you tell me whether I am doing this right, and how to fix the following error when running the code below?

"TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not str"

import torch
import torch.nn as nn
import numpy as np
from torch import optim
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import torchvision.datasets as datasets  # Standard datasets
import torchvision.transforms as transforms
import json

class RNN_LSTM(nn.Module):
    
    def __init__(self, input_size, hidden_size, num_layers, num_classes, vocab_size,
                 lstm_dropout, device, word_emb_file):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm_dropout = lstm_dropout
        self.lstm_drop = nn.Dropout(p=self.lstm_dropout)
        self.word_emb_file = word_emb_file
        self.device = device
        
        # initialize text embeddings
        self.word_embeddings = nn.Embedding(vocab_size, input_size)
        # copy the pre-trained vectors (plus one extra padding row) into the embedding matrix
        self.word_embeddings.weight = nn.Parameter(
            torch.from_numpy(
                np.pad(np.load(self.word_emb_file), ((0, 1), (0, 0)), 'constant')
            ).float())

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)   
    
    def forward(self, sentence, question_len):
        # sentence must be a LongTensor of word indices, shape (batch, max_len)
        embeds = self.word_embeddings(sentence)
        packed_output = pack_padded_sequence(embeds, question_len, batch_first=True,
                                             enforce_sorted=False)
        outputs, (hidden, cell_state) = self.lstm(packed_output)
        outputs, outputs_length = pad_packed_sequence(outputs, batch_first=True)
        # with a single unidirectional layer, hidden has shape (1, batch, hidden_size);
        # use the last layer's final hidden state as the sentence representation
        return hidden[-1]

lstm_dropout = 0.3
input_size = 300
hidden_size = 256
num_classes = 10
num_layers = 1
device = 'cpu'
vocab_size = 2000
word_emb_file = "/home/project/word_emb_300d.npy"

model = RNN_LSTM(input_size, hidden_size, num_layers, num_classes, vocab_size, lstm_dropout, device, word_emb_file)

model.word_embeddings('Blue Skye')  # this call raises the TypeError: a tensor of indices is expected, not a str

For background on word embeddings, see the torch embedding tutorial and use embedding with keras.

Basically, nn.Embedding(vocab_size, embed_size) is a vocab_size x embed_size matrix in which each row holds the representation of one word.
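As a quick illustrative check (the numbers here are made up for the example, not taken from the original post):

import torch
import torch.nn as nn

emb = nn.Embedding(5, 3)        # a 5 x 3 lookup matrix: 5 words, 3-d vectors
indices = torch.tensor([0, 2])  # rows are selected by integer index, never by string
print(emb(indices).shape)       # torch.Size([2, 3]) -- one row per index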

To know which word corresponds to which row, you should define a vocabulary that maps words to indices (e.g. a Python dictionary {'hello': 0, 'word': 1}).

Before that, to split the sentences into words (and thus be able to compute the vocabulary size), you need to tokenize them (e.g. with nltk.word_tokenize or str.split()).
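The choice of tokenizer matters; here is a small illustrative comparison (it assumes nltk is installed and its punkt data has been downloaded):

from nltk.tokenize import word_tokenize

print("Don't panic!".split(' '))      # ["Don't", 'panic!'] -- punctuation sticks to the words
print(word_tokenize("Don't panic!"))  # ['Do', "n't", 'panic', '!']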

However, torch.nn.Embedding expects a Tensor, so if you want to process several sentences of different lengths in one batch, you need to pad the sentences with a dummy token so they fit into a single Tensor. The pseudo code below walks through these steps.

# This is pseudo code

# Preprocess data
documents = ['first sentence', 'the second sentence']
tokenized_documents = [d.split(' ') for d in documents]
# Create vocabulary and add a special token for padding
words = [w for d in tokenized_documents for w in d]
vocabulary = {w: i+1 for i, w in enumerate(set(words))}
vocabulary['PAD'] = 0
indexed_documents = [[vocabulary[w] for w in d] for d in tokenized_documents]
# indexed_documents will look like: [[1, 2], [3, 4, 2]] (exact values depend on set ordering)
# pad_sequence expects a list of Tensors, so convert each index list first
padded_documents = torch.nn.utils.rnn.pad_sequence(
    [torch.tensor(d) for d in indexed_documents],
    batch_first=True,
    padding_value=vocabulary['PAD'])

# Data can be fed to the neural network
model.word_embeddings(padded_documents)
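From there, the padded batch can also be fed through the whole model to get one vector per sentence. This is only a minimal sketch, assuming the model above was constructed successfully (i.e. the .npy file exists and its row count matches the embedding shape) and that the toy indices stay below vocab_size:

# lengths let pack_padded_sequence skip the padded positions
lengths = torch.tensor([len(d) for d in indexed_documents])
representations = model(padded_documents, lengths)  # shape: (batch, hidden_size) = (2, 256)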