RuntimeError: shape '[128, 3, 5, 4, 5, 4]' is invalid for input of size 185856
I am using the following code to extract descriptors from images with a Vision Transformer (vit_b_16), but I get the error: RuntimeError: shape '[128, 3, 5, 4, 5, 4]' is invalid for input of size 185856. Does anyone know what I am doing wrong and how to fix it?
import torch
import torch.nn as nn
import pytorch_lightning as pl

def img_to_patch(x, patch_size, flatten_channels=True):
    B, C, H, W = x.shape
    x = x.reshape(B, C, H // patch_size, patch_size, W // patch_size, patch_size)
    x = x.permute(0, 2, 4, 1, 3, 5)  # [B, H', W', C, p_H, p_W]
    x = x.flatten(1, 2)              # [B, H'*W', C, p_H, p_W]
    if flatten_channels:
        x = x.flatten(2, 4)          # [B, H'*W', C*p_H*p_W]
    return x
class AttentionBlock(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_heads, dropout=0.0):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.layer_norm_2 = nn.LayerNorm(embed_dim)
        self.linear = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        inp_x = self.layer_norm_1(x)
        x = x + self.attn(inp_x, inp_x, inp_x)[0]
        x = x + self.linear(self.layer_norm_2(x))
        return x
class VisionTransformer(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_channels, num_heads, num_layers, num_classes, patch_size, num_patches, dropout=0.0):
        super().__init__()
        self.patch_size = patch_size
        # Layers/Networks
        self.input_layer = nn.Linear(num_channels * (patch_size ** 2), embed_dim)
        self.transformer = nn.Sequential(*[AttentionBlock(embed_dim, hidden_dim, num_heads, dropout=dropout) for _ in range(num_layers)])
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, num_classes)
        )
        self.dropout = nn.Dropout(dropout)
        # Parameters/Embeddings
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.pos_embedding = nn.Parameter(torch.randn(1, 1 + num_patches, embed_dim))

    def forward(self, x):
        # Preprocess input
        print(x.shape)
        x = img_to_patch(x, self.patch_size)
        print(x.shape)
        B, T, _ = x.shape
        x = self.input_layer(x)
        # Add CLS token and positional encoding
        cls_token = self.cls_token.repeat(B, 1, 1)
        x = torch.cat([cls_token, x], dim=1)
        x = x + self.pos_embedding[:, :T+1]
        # Apply Transformer
        x = self.dropout(x)
        x = x.transpose(0, 1)
        x = self.transformer(x)
        # Perform classification prediction
        cls = x[0]
        out = self.mlp_head(cls)
        return out
class ViT(pl.LightningModule):
    def __init__(self, model_kwargs, lr):
        super().__init__()
        self.save_hyperparameters()
        self.model = VisionTransformer(**model_kwargs)
        # self.example_input_array = next(iter(train_loader))[0]

    def forward(self, x):
        return self.model(x)
I am initializing the transformer like this:
if network_variant == 'vb16':
    net = ViT(model_kwargs={
                  'embed_dim': 256,
                  'hidden_dim': 512,
                  'num_heads': 8,
                  'num_layers': 6,
                  'patch_size': 4,
                  'num_channels': 3,
                  'num_patches': 64,
                  'num_classes': num_classes,
                  'dropout': 0.2
              },
              lr=3e-4)
This is my first time working with PyTorch and Vision Transformers, so I am really not sure what I am doing wrong.
The error says that PyTorch is trying to fill a tensor of shape 128 × 3 × 5 × 4 × 5 × 4, which requires 153,600 elements. However, the data going into the reshape has 185,856 elements, so you have most likely miscalculated one of the dimensions. The difference is 32,256 = 128 × 4 × 3 × 21, which gives a hint about which dimension is off... The only place you call reshape() is the second line of img_to_patch(), so I assume that is where the error comes from. (Including the full traceback in your question would help confirm this.)
Check the sizes of the variables leading up to that line and confirm they are what you expect.
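For what it's worth, the numbers in the error message are consistent with 22 × 22 inputs: 185,856 = 128 × 3 × 22 × 22, and 22 is not a multiple of patch_size = 4. Here is a minimal sketch that reproduces the error under that assumption (the 22 × 22 size is inferred from the error, not something visible in your code):

import torch

# Hypothetical batch matching the numbers in the error message:
# 128 * 3 * 22 * 22 = 185,856 elements.
x = torch.randn(128, 3, 22, 22)
patch_size = 4

print(x.numel())                   # 185856
print(x.shape[-2] % patch_size,    # 2 -> H is not a multiple of patch_size
      x.shape[-1] % patch_size)    # 2 -> W is not a multiple of patch_size

# This is exactly the reshape done in img_to_patch(); uncommenting it raises
# RuntimeError: shape '[128, 3, 5, 4, 5, 4]' is invalid for input of size 185856
# x.reshape(128, 3, 22 // patch_size, patch_size, 22 // patch_size, patch_size)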
Looking at your code, the most likely problem is the division of H and W by patch_size. If H and W are not multiples of patch_size, you need something that decides which pixels to drop: reshape() will not make that decision for you.
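One possible way to handle it, assuming that dropping a few border pixels is acceptable for your use case (this is only a sketch; resizing in your data pipeline to a size that is a multiple of patch_size works just as well):

# Crop H and W down to the nearest multiple of patch_size (drops border pixels).
def crop_to_patch_multiple(x, patch_size):
    B, C, H, W = x.shape
    H_new = (H // patch_size) * patch_size
    W_new = (W // patch_size) * patch_size
    return x[:, :, :H_new, :W_new]

# Inside VisionTransformer.forward, before patching:
# x = crop_to_patch_multiple(x, self.patch_size)
# x = img_to_patch(x, self.patch_size)

Also note that num_patches=64 with patch_size=4 corresponds to 32 × 32 inputs; whatever input size you settle on, (H // patch_size) * (W // patch_size) must not exceed num_patches, because self.pos_embedding only has 1 + num_patches positions to slice from.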