Create a representation of questions using an LSTM via a pre-trained word embedding such as GloVe
I am new to LSTMs and Python. My goal is to represent a sentence using an LSTM.
Can you tell me whether I am doing this correctly, and how to fix the error I get when running the code below?
"TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not str"
import torch
import torch.nn as nn
import numpy as np
from torch import optim
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import torchvision.datasets as datasets  # Standard datasets
import torchvision.transforms as transforms
import json

class RNN_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, vocab_size,
                 lstm_dropout, device, word_emb_file):
        super(RNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm_dropout = lstm_dropout
        self.lstm_drop = nn.Dropout(p=self.lstm_dropout)
        self.word_emb_file = word_emb_file
        self.device = device

        # initialize text embeddings
        self.word_embeddings = nn.Embedding(vocab_size, input_size)
        self.word_embeddings.weight = nn.Parameter(
            torch.from_numpy(
                np.pad(np.load(self.word_emb_file), ((0, 1), (0, 0)), 'constant')).type(
                'torch.FloatTensor'))

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, sentence, question_len):
        embeds = self.word_embeddings(sentence)
        packed_output = pack_padded_sequence(embeds, question_len, batch_first=True)
        outputs, (hidden, cell_state) = self.lstm(packed_output)
        outputs, outputs_length = pad_packed_sequence(outputs, batch_first=True)
        outputs = torch.cat([hidden[0,:,:], hidden[1,:,:]], dim=-1)
        return outputs

lstm_dropout = 0.3
input_size = 300
hidden_size = 256
num_classes = 10
num_layers = 1
device = 'cpu'
vocab_size = 2000
word_emb_file = "/home/project/word_emb_300d.npy"

model = RNN_LSTM(input_size, hidden_size, num_layers, num_classes, vocab_size, lstm_dropout, device, word_emb_file)
model.word_embeddings('Blue Skye')
For background on word embeddings, see the torch embedding tutorial and use embedding with keras.
Essentially, nn.Embedding(vocab_size, embed_size) is a vocab_size x embed_size matrix in which each row holds the representation of one word. To know which word corresponds to which row, you should define a vocabulary that maps words to indices (for example a Python dictionary such as {'hello': 0, 'word': 1}).
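As a minimal sketch of that lookup (the toy vocabulary and the sizes below are made up for illustration), each index selects one row of the embedding matrix:

import torch
import torch.nn as nn

vocabulary = {'hello': 0, 'word': 1}                           # toy word-to-index mapping
embedding = nn.Embedding(num_embeddings=2, embedding_dim=4)    # a 2 x 4 matrix, one row per word

indices = torch.tensor([vocabulary['hello'], vocabulary['word']])  # a LongTensor of indices, not strings
vectors = embedding(indices)                                   # shape (2, 4): one embedding row per index
print(vectors.shape)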
Before that, to turn sentences into words (so that you can build the vocabulary), you need to tokenize them (for example with nltk.word_tokenize or str.split()).
However, torch.nn.Embedding expects a Tensor, so if you want to process several sentences of different lengths in one batch, you have to pad them with a padding token so that they fit into a single Tensor.
# This is pseudo code
# Preprocess data
documents = ['first sentence', 'the second sentence']
tokenized_documents = [d.split(' ') for d in documents]

# Create vocabulary and add a special token for padding
words = [w for d in tokenized_documents for w in d]
vocabulary = {w: i + 1 for i, w in enumerate(set(words))}
vocabulary['PAD'] = 0
indexed_documents = [torch.tensor([vocabulary[w] for w in d]) for d in tokenized_documents]
# indexed_documents is a list of index tensors, one per document,
# e.g. [tensor([3, 1]), tensor([4, 2, 1])] (exact indices depend on set() ordering)

# Pad to a common length so the batch fits into a single Tensor
padded_documents = torch.nn.utils.rnn.pad_sequence(
    indexed_documents,
    batch_first=True,
    padding_value=vocabulary['PAD'])

# Data can be fed to the neural network
model.word_embeddings(padded_documents)
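Applied to the question's code, the immediate fix for the TypeError is to convert the string into a tensor of vocabulary indices before calling the embedding layer. A sketch under the assumption that both words are in your vocabulary; the dictionary below is hypothetical and in practice must match the rows of word_emb_300d.npy:

# Hypothetical vocabulary for illustration only
vocabulary = {'blue': 0, 'skye': 1}

question = 'Blue Skye'
indices = torch.tensor([[vocabulary[w] for w in question.lower().split(' ')]])  # shape (1, 2)
question_len = torch.tensor([indices.shape[1]])   # per-sentence lengths, what forward() expects

embeds = model.word_embeddings(indices)           # works: indices is a LongTensor, not a str
print(embeds.shape)                               # (1, 2, 300)

Note that calling model(indices, question_len) would still fail further down: with num_layers=1 and a unidirectional LSTM, hidden has shape (1, batch, hidden_size), so hidden[1,:,:] in forward raises an IndexError.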