Is it possible to run scatter matmul in pytorch?
Edit: apparently DGL is already working on this: https://github.com/dmlc/dgl/pull/3641
I have several types of embeddings, and each type needs its own linear projection. I can solve the problem with a for loop like this:
emb_out = dict()
for ntype in ntypes:
    emb_out[ntype] = self.lin_layer[ntype](emb[ntype])
But ideally, I would like to run it in parallel with some kind of scatter operation, something like:
pytorch_scatter(lin_layers, embeddings, layer_map, reduce='matmul')
where layer_map tells which embedding should go through which layer. If I have 2 types of linear layers and batch_size = 5, then layer_map would be something like [1, 0, 1, 1, 0].
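For concreteness, here is a minimal sketch of what such a call would have to compute for that toy case (the pytorch_scatter call with reduce='matmul' above is hypothetical; the sizes and names below are just for illustration):

import torch

inp_size, out_size, batch_size = 4, 3, 5
layer_map = torch.tensor([1, 0, 1, 1, 0])        # which layer each embedding goes through
lin_layers = torch.randn(2, out_size, inp_size)  # one projection matrix per embedding type
embeddings = torch.randn(batch_size, inp_size)

# expected semantics: row i is projected by the layer that layer_map[i] points to
expected = torch.stack([lin_layers[layer_map[i]] @ embeddings[i] for i in range(batch_size)])
print(expected.shape)  # torch.Size([5, 3])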
Is it possible to vectorize the for loop in an efficient way, like pytorch_scatter does? Please check the minimal example below.
import torch
import random
import numpy as np
seed = 42
torch.manual_seed(seed)
random.seed(seed)
def matmul_single_embtype(lin_layers, embeddings, layer_map):
    # run a single linear layer over all embeddings, irrespective of type
    output_embeddings = torch.matmul(lin_layers[0], embeddings.T).T
    return output_embeddings

def matmul_for_loop(lin_layers, embeddings, layer_map):
    # let each embedding type have its own projection, looping over emb types
    output_embeddings = dict()
    for emb_type in np.unique(layer_map):
        output_embeddings[emb_type] = torch.matmul(lin_layers[emb_type], embeddings[layer_map == emb_type].T).T
    return output_embeddings

def matmul_scatter(lin_layers, embeddings, layer_map):
    # parallelize the for loop by creating a block-diagonal matrix of lin layers
    # this is very inefficient, because it copies the layer for each embedding instead of broadcasting
    mapped_lin_layers = [lin_layers[i] for i in layer_map]
    mapped_lin_layers = torch.block_diag(*mapped_lin_layers)  # batch_size*inp_size x batch_size*output_size
    embeddings_stacked = embeddings.view(-1, 1)  # stack all embeddings to multiply by the block-diagonal matrix
    output_embeddings = torch.matmul(mapped_lin_layers, embeddings_stacked).view(embeddings.shape)
    return output_embeddings

"""
GENERATE DATA
lin_layers:
List of matrices of size n_layer x inp_size x output_size
embeddings:
Matrix of size batch_size x inp_size
layer_map:
Vector of size batch_size stating which layer each embedding should go through
"""
emb_size = 32
batch_size = 500
emb_types = 20
layer_map = [random.choice(list(range(emb_types))) for i in range(batch_size)]
lin_layers = [torch.arange(emb_size*emb_size, dtype=torch.float32).view(emb_size,emb_size) for i in range(emb_types)]
embeddings = torch.arange(batch_size*emb_size, dtype=torch.float32).view(batch_size,emb_size)
grouped_emb = {i: embeddings[layer_map==i] for i in np.unique(layer_map)} #separate embeddings by embedding type
#Run experiments
%timeit matmul_scatter(lin_layers, embeddings, layer_map)
%timeit matmul_for_loop(lin_layers, embeddings, layer_map)
%timeit matmul_single_embtype(lin_layers, embeddings, layer_map)
>>>>>133 ms ± 2.47 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
>>>>>1.64 ms ± 14 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
>>>>>31.4 µs ± 805 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
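For reference, one possible way to vectorize the loop without a dedicated scatter-matmul kernel is to gather one projection matrix per row and run a single batched matmul. This is only a sketch on top of the data generated above (lin_layers_t, layer_map_t, gathered and out are names introduced here, and the layers are assumed square as in the example); it materializes a batch_size x emb_size x emb_size weight tensor, so it trades memory for one dense bmm rather than being the fused scatter kernel asked about:

# sketch of a bmm-based alternative, reusing lin_layers, embeddings and layer_map from above
lin_layers_t = torch.stack(lin_layers)                   # emb_types x emb_size x emb_size
layer_map_t = torch.tensor(layer_map)                    # batch_size
gathered = lin_layers_t[layer_map_t]                     # batch_size x emb_size x emb_size (one copy per row)
out = torch.bmm(gathered, embeddings.unsqueeze(-1)).squeeze(-1)  # batch_size x emb_size

# sanity check against the for-loop version (loose tolerance for float32)
layer_map_np = np.array(layer_map)
ref = matmul_for_loop(lin_layers, embeddings, layer_map_np)
for emb_type in np.unique(layer_map_np):
    assert torch.allclose(out[layer_map_np == emb_type], ref[emb_type], rtol=1e-4)

Whether this beats the per-type loop will depend on batch_size, emb_size and the device, so it is worth timing with %timeit alongside the three variants above.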
Related StackOverflow questions:
Related issue in pytorch: https://github.com/pytorch/pytorch/issues/31942
Just found out that DGL is already working on this feature: https://github.com/dmlc/dgl/pull/3641