Convolutional encoder error - 'RuntimeError: input and target shapes do not match'
In the code below, three images are created and saved, and a convolutional autoencoder tries to encode them into a low-dimensional representation.
%reset -f
import torch.utils.data as data_utils
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
import datetime
from sklearn.preprocessing import MultiLabelBinarizer, scale
import seaborn as sns
sns.set_style("darkgrid")
from ast import literal_eval
import torch
import torchvision
import torch.nn as nn
from torch.autograd import Variable
from os import listdir
import cv2
import torch.nn.functional as F
from numpy.polynomial.polynomial import polyfit
number_channels = 3
%matplotlib inline
x = np.arange(10)
y = x * x                      # quadratic, increasing curve
plt.plot(x, y)
plt.axis('off')
plt.savefig('1-increasing.jpg')
plt.clf()                      # clear the figure so curves don't accumulate
x = np.arange(10)
y = x * x * x                  # cubic, increasing curve
plt.plot(x, y)
plt.axis('off')
plt.savefig('2-increasing.jpg')
plt.clf()
x = np.arange(10)
m = 0
b = 2
y = (m * x) + b                # flat (constant) line
plt.plot(x, y)
plt.axis('off')
plt.savefig('constant.jpg')
plt.clf()
batch_size_value = 2
train_image = []
# cv2 loads images as (H, W, C); transpose to (C, H, W) for PyTorch
# (reshape would keep the right shape but scramble the channel layout)
train_image.append(cv2.imread('1-increasing.jpg', cv2.IMREAD_UNCHANGED).transpose(2, 0, 1))
train_image.append(cv2.imread('2-increasing.jpg', cv2.IMREAD_UNCHANGED).transpose(2, 0, 1))
train_image.append(cv2.imread('decreasing.jpg', cv2.IMREAD_UNCHANGED).transpose(2, 0, 1))  # assumes this file exists from an earlier run
train_image.append(cv2.imread('constant.jpg', cv2.IMREAD_UNCHANGED).transpose(2, 0, 1))
data_loader = data_utils.DataLoader(train_image, batch_size=batch_size_value, shuffle=False, drop_last=True)
import os
from torchvision import transforms
from torchvision.utils import save_image
from torchvision.datasets import MNIST
def to_img(x):
    x = 0.5 * (x + 1)
    x = x.clamp(0, 1)
    x = x.view(x.size(0), 1, 28, 28)
    return x
num_epochs = 100
# batch_size = 128
batch_size = 2
learning_rate = 1e-3
dataloader = data_loader
class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True)
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=3, padding=1),  # b, 16, 96, 144
            nn.ReLU(True),
            nn.MaxPool2d(2, stride=2),                  # b, 16, 48, 72
            nn.Conv2d(16, 8, 3, stride=2, padding=1),   # b, 8, 24, 36
            nn.ReLU(True),
            nn.MaxPool2d(3, stride=1)                   # b, 8, 22, 34
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(8, 16, 2, stride=1),             # b, 16, 23, 35
            nn.ReLU(True),
            nn.ConvTranspose2d(16, 8, 3, stride=3, padding=1),  # b, 8, 67, 103
            nn.ReLU(True),
            nn.ConvTranspose2d(8, 3, 2, stride=2, padding=1),   # b, 3, 132, 204
            nn.Tanh()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
model = autoencoder().cuda().double()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
weight_decay=1e-5)
for epoch in range(num_epochs):
    for data in dataloader:
        img, _ = data
        img = img.double()
        img = Variable(img).cuda()
        img = img.unsqueeze_(0)
        # ===================forward=====================
        output = model(img)
        loss = criterion(output, img)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log=========================
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch + 1, num_epochs, loss.data[0]))
torch.save(model.state_dict(), './conv_autoencoder.pth')
But it returns this error:
RuntimeError: input and target shapes do not match: input [1 x 3 x 132 x 204], target [1 x 3 x 288 x 432] at /pytorch/aten/src/THCUNN/generic/MSECriterion.cu:15
The shape of each image is (3, 288, 432). How should the model's configuration be changed so that the output is [1 x 3 x 288 x 432] instead of [1 x 3 x 132 x 204]?
Update:
I changed
nn.ConvTranspose2d(8, 3, 2, stride=2, padding=1)
to:
nn.ConvTranspose2d(8, 3, 3, stride=4, padding=2)
which produces output closer to the target dimensions, but still not exact, so the error is now:
RuntimeError: input and target shapes do not match: input [1 x 3 x 263 x 407], target [1 x 3 x 288 x 432] at /pytorch/aten/src/THCUNN/generic/MSECriterion.cu:12
How should the decoder's output dimensions be calculated so that it produces the correct size?
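A quick way to see exactly where the sizes drift is to push a dummy tensor through the layers one at a time and print each output shape (a diagnostic sketch, assuming a PyTorch version where plain tensors work without Variable; the layer stack is copied from the model above):

import torch
import torch.nn as nn

layers = [
    nn.Conv2d(3, 16, 3, stride=3, padding=1),
    nn.MaxPool2d(2, stride=2),
    nn.Conv2d(16, 8, 3, stride=2, padding=1),
    nn.MaxPool2d(3, stride=1),
    nn.ConvTranspose2d(8, 16, 2, stride=1),
    nn.ConvTranspose2d(16, 8, 3, stride=3, padding=1),
    nn.ConvTranspose2d(8, 3, 2, stride=2, padding=1),
]
x = torch.randn(1, 3, 288, 432)
for layer in layers:
    x = layer(x)
    print(type(layer).__name__, tuple(x.shape))
# the last line prints ConvTranspose2d (1, 3, 132, 204),
# which is exactly the mismatched input size from the error message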
There are several ways to do it; here is one solution:
class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True)
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=3, padding=1),  # b, 16, 96, 144
            nn.ReLU(True),
            nn.MaxPool2d(2, stride=2),                  # b, 16, 48, 72
            nn.Conv2d(16, 8, 3, stride=2, padding=1),   # b, 8, 24, 36
            nn.ReLU(True),
            nn.MaxPool2d(3, stride=1)                   # b, 8, 22, 34
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(8, 16, 2, stride=1),             # b, 16, 23, 35
            nn.ReLU(True),
            nn.ConvTranspose2d(16, 8, 3, stride=3, padding=1),  # b, 8, 67, 103
            nn.ReLU(True),
            nn.ConvTranspose2d(8, 3, 2, stride=2, padding=1),   # b, 3, 132, 204
            nn.ReLU(True),
            nn.ConvTranspose2d(3, 3, 2, stride=2, padding=1),   # b, 3, 262, 406
            nn.ReLU(True),
            nn.ConvTranspose2d(3, 3, 25, stride=1),             # b, 3, 286, 430
            nn.ReLU(True),
            nn.ConvTranspose2d(3, 3, 3, stride=1),              # b, 3, 288, 432
            nn.Tanh()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
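As a sanity check, a dummy forward pass through this model should give back the original spatial size (a minimal sketch, run on CPU in single precision):

model = autoencoder()
dummy = torch.randn(1, 3, 288, 432)
print(model(dummy).shape)  # torch.Size([1, 3, 288, 432]) -- matches the target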
Here are the formulas,
where N --> input size, F --> filter (kernel) size, stride --> stride size, pdg --> padding size:
ConvTranspose2d:
OutputSize = N*stride + F - stride - pdg*2
Conv2d (and MaxPool2d):
OutputSize = floor((N - F + pdg*2) / stride) + 1   [integer division, i.e. drop the remainder: e.g. 32/3 = 10]
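The same bookkeeping can be scripted with two small helper functions implementing these formulas (a sketch; conv_out and convt_out are names made up here), walking the height and width of a 288 x 432 input through every size-changing layer of the fixed model:

def conv_out(n, f, stride, pdg):
    # Conv2d / MaxPool2d: floor((N - F + pdg*2) / stride) + 1
    return (n - f + pdg * 2) // stride + 1

def convt_out(n, f, stride, pdg):
    # ConvTranspose2d: N*stride + F - stride - pdg*2
    return n * stride + f - stride - pdg * 2

# (function, kernel F, stride, padding) for each layer, in order
stack = [
    (conv_out, 3, 3, 1), (conv_out, 2, 2, 0),    # Conv2d, MaxPool2d
    (conv_out, 3, 2, 1), (conv_out, 3, 1, 0),    # Conv2d, MaxPool2d
    (convt_out, 2, 1, 0), (convt_out, 3, 3, 1),  # decoder ConvTranspose2d layers
    (convt_out, 2, 2, 1), (convt_out, 2, 2, 1),
    (convt_out, 25, 1, 0), (convt_out, 3, 1, 0),
]
h, w = 288, 432
for fn, f, s, p in stack:
    h, w = fn(h, f, s, p), fn(w, f, s, p)
    print(h, w)
# the last line prints: 288 432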