Mismatching target size in criterion
I am trying to use PyTorch on the IMDB dataset to predict positive and negative reviews. When I reach the training stage, the criterion function raises the following error:
ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([1136, 64, 1]))
After some research, I found that the error occurs because the model's output is a tensor of size [1136, 64, 1], while the criterion expects only one prediction per example in the batch.
However, I don't know how to fix this error.
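As a minimal standalone check (independent of my model), the criterion raises the same error whenever the prediction and target shapes differ:

import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()
target = torch.rand(64)                      # one label per example, like batch.label

criterion(torch.randn(64), target)           # works: shapes match
criterion(torch.randn(1136, 64, 1), target)  # ValueError: Target size must be the same as input size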
My code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import re
import sys
import csv
import spacy
import nltk
import pandas as pd
from nltk.corpus import stopwords
from torchtext import vocab
from torchtext.legacy import data, datasets
from torchtext.legacy.data import Field

SEED = 1234
torch.manual_seed(SEED)  # For reproducibility
torch.backends.cudnn.deterministic = True
class MLP(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text = [sent len, batch size]
        embedded = self.embedding(text)
        h_1 = F.relu(self.hidden_fc(embedded))
        # assert torch.equal(output[-1,:,:], h_1.squeeze(0))
        # [batch size, output dim]
        return self.fc(h_1.squeeze(0))
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))  # 0.75 --> 1, 0.4 --> 0
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()  # Train mode is on
    for batch in iterator:
        optimizer.zero_grad()  # Reset the gradients
        predictions = model(batch.text)  # forward propagation
        print(predictions.shape)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()  # backward propagation / calculate gradients
        optimizer.step()  # update parameters
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()  # Evaluation mode is on
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
TEXT = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm',
                  lower=True)
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)  # IMDB reviews dataset
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)  # Build the vocabulary from the 25K most frequent words
LABEL.build_vocab(train_data)

BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE)

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = MLP(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
To summarize your problem: you have reviews that you want to classify as positive or negative. To do so, you train an embedding space to map each word to a vector, then output a probability per sentence and supervise it against the corresponding label with the binary cross-entropy loss nn.BCEWithLogitsLoss.
Your current model consists of the following (a quick shape trace follows the list):

- nn.Embedding: embeds each word of the sequence independently, turning the input tensor shape from (seq_len, batch_size) into (seq_len, batch_size, embedding_dim), where seq_len is the number of tokens in your input sequences.
- An nn.Linear layer that reduces the dimensionality by projecting the features, taking the tensor shape from (seq_len, batch_size, embedding_dim) to (seq_len, batch_size, hidden_dim).
- A non-linearity applied to the sequence of word vectors. Notice how the sentence structure is preserved. Finally, a second linear layer maps from (seq_len, batch_size, hidden_dim) to (seq_len, batch_size, output_dim), still keeping the sentence structure (cf. dim=0 holding seq_len).
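To make the shape bookkeeping concrete, here is a small standalone trace of the original forward pass; the vocabulary size is a hypothetical placeholder, everything else mirrors the dimensions above:

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size = 25_002                        # hypothetical; len(TEXT.vocab) in the real script
embedding = nn.Embedding(vocab_size, 100)  # EMBEDDING_DIM = 100
hidden_fc = nn.Linear(100, 256)            # HIDDEN_DIM = 256
fc = nn.Linear(256, 1)                     # OUTPUT_DIM = 1

text = torch.randint(0, vocab_size, (1136, 64))  # (seq_len, batch_size)
embedded = embedding(text)                       # (1136, 64, 100)
h_1 = F.relu(hidden_fc(embedded))                # (1136, 64, 256)
# squeeze(0) does nothing here because seq_len != 1, so the sequence dim survives
out = fc(h_1.squeeze(0))                         # (1136, 64, 1)
print(out.shape)                                 # torch.Size([1136, 64, 1])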
This is why you get (1136, 64, 1) as the shape of predictions: 1136 must be your sequence length, 64 is BATCH_SIZE, and 1 is OUTPUT_DIM.
However, you are trying to classify each sequence as a whole, so what you need is a single scalar value per sentence, i.e. a prediction tensor of shape (batch_size, 1). This means reducing the first dimension, the one corresponding to the sequence, to a single value.
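As a quick shape check (illustrative only, not part of the original answer), averaging over dim=0 collapses the sequence dimension:

import torch

h_1 = torch.randn(1136, 64, 256)  # (seq_len, batch_size, hidden_dim)
pooled = h_1.mean(dim=0)          # average over the sequence dimension
print(pooled.shape)               # torch.Size([64, 256]): one vector per sentence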
An easy and straightforward way of reducing the dimensionality, so that the whole sentence is represented by a single vector, is to apply average pooling over the sentence: the average of the word vectors in each sentence should give you a sense of the overall positiveness/negativeness of that sentence. You can apply this operator before the final projection, so as to stay in a relatively high dimension, using either nn.AdaptiveAvgPool1d with an output size of 1, or simply torch.Tensor.mean.
Here is a possible implementation with nn.AdaptiveAvgPool1d:
class MLP(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
        self.avg = nn.AdaptiveAvgPool1d(1)  # reduces (N, C, L_in) to (N, C, 1)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # (seq_len, batch_size) = (1136, 64)
        embedded = self.embedding(text)
        # (seq_len, batch_size, embedding_dim) = (1136, 64, 100)
        h_1 = F.relu(self.hidden_fc(embedded))
        # (seq_len, batch_size, hidden_dim) = (1136, 64, 256)
        avg = self.avg(h_1.permute(1, 2, 0))
        # (batch_size, hidden_dim, 1) = (64, 256, 1)
        out = self.fc(avg.squeeze(-1))
        # (batch_size, output_dim) = (64, 1)
        return out
Or with torch.Tensor.mean:
class MLP(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # (seq_len, batch_size) = (1136, 64)
        embedded = self.embedding(text)
        # (seq_len, batch_size, embedding_dim) = (1136, 64, 100)
        h_1 = F.relu(self.hidden_fc(embedded))
        # (seq_len, batch_size, hidden_dim) = (1136, 64, 256)
        avg = h_1.mean(0)
        # (batch_size, hidden_dim) = (64, 256)
        out = self.fc(avg)
        # (batch_size, output_dim) = (64, 1)
        return out
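With either version the model now returns one logit per review, shaped (64, 1), while batch.label is shaped (64,). nn.BCEWithLogitsLoss still requires identical sizes, so the train loop should squeeze the last dimension, just as evaluate already does. The squeeze(1) below is my addition rather than part of the original answer:

# inside train(), with the pooled MLP above
predictions = model(batch.text).squeeze(1)    # (64, 1) -> (64,), now matches batch.label
loss = criterion(predictions, batch.label)    # sizes agree, no ValueError
acc = binary_accuracy(predictions, batch.label)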