可重现的 PyTorch 结果与随机种子
Reproducible PyTorch Results & Random Seeds
我有一个用 PyTorch 写的简单玩具神经网络。我设置了在文档中能找到的所有随机种子,以及 numpy 的随机种子。
如果我把下面的代码从上到下完整运行,结果似乎是可重现的。
但是,如果我只运行一次区块 1,然后反复运行区块 2,每次的结果都会不同(有时差异很大)。我不确定为什么会这样,因为每次都会重新初始化网络并重置优化器。
我使用的是 0.4.0 版本。
区块 #1
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.utils.data as utils_data
from torch.autograd import Variable
from torch import optim, nn
from torch.utils.data import Dataset
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, xavier_normal_,uniform_
torch.manual_seed(123)
import random
random.seed(123)
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
%matplotlib inline
# Toggle GPU execution; when enabled, the CUDA RNG is seeded as well.
cuda = True
if cuda:
    torch.cuda.manual_seed(123)

# Load the Boston data set from scikit-learn.
boston = load_boston()
x = boston.data
y = boston.target
y = y.reshape(y.shape[0], 1)  # targets as a single column

# Train/test split; shuffle=False keeps the original row order.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=123, shuffle=False)

# Training set: convert to tensors, wrap in a dataset, expose via a loader.
x_train = torch.from_numpy(x_train)
y_train = torch.from_numpy(y_train)
training_samples = utils_data.TensorDataset(x_train, y_train)
data_loader_trn = utils_data.DataLoader(
    training_samples, batch_size=64, drop_last=False)

# Test set: same treatment as the training set.
x_test = torch.from_numpy(x_test)
y_test = torch.from_numpy(y_test)
testing_samples = utils_data.TensorDataset(x_test, y_test)
data_loader_test = utils_data.DataLoader(
    testing_samples, batch_size=64, drop_last=False)
# Simple model: one hidden layer followed by dropout.
class Net(nn.Module):
    """Small fully connected network: fc1 -> ReLU -> dropout -> fc2."""

    def __init__(self):
        super(Net, self).__init__()
        # The input width is read from the module-level feature matrix `x`.
        self.fc1 = nn.Linear(x.shape[1], 20)
        # Override the default weight init of fc1 with Xavier-uniform.
        xavier_uniform_(self.fc1.weight.data)
        self.drop = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(20, 1)

    def forward(self, x):
        """Run the forward pass on `x` and return the output of fc2."""
        x = F.relu(self.fc1(x))
        x = self.drop(x)
        x = self.fc2(x)
        return x
区块 #2
# NOTE: no RNG is re-seeded in this block; the seeds are only set in
# block #1, so re-running this block alone continues the same random stream.
net = Net()
if cuda:
    net.cuda()

# Create the optimizer (Adam with its default settings).
optimizer = optim.Adam(net.parameters())
# Create the loss function (MSE); size_average=False is intended to give a
# per-batch sum, which is divided by the record count further below.
loss = nn.MSELoss(size_average=False)

# Run the main training loop.
epochs = 20
hold_loss = []
for epoch in range(epochs):
    cum_loss = 0.
    cum_records_epoch = 0
    for batch_idx, (data, target) in enumerate(data_loader_trn):
        tr_x, tr_y = data.float(), target.float()
        if cuda:
            tr_x, tr_y = tr_x.cuda(), tr_y.cuda()

        # Reset gradient
        optimizer.zero_grad()

        # Forward pass
        fx = net(tr_x)
        output = loss(fx, tr_y)  # loss for this batch
        cum_loss += output.item()  # accumulate the loss

        # Backward
        output.backward()

        # Update parameters based on backprop
        optimizer.step()
        cum_records_epoch += len(tr_x)
        # Log interval of 1: report every batch.
        if batch_idx % 1 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, cum_records_epoch, len(data_loader_trn.dataset),
                100. * (batch_idx+1) / len(data_loader_trn), output.item()))
    print('Epoch average loss: {:.6f}'.format(cum_loss/cum_records_epoch))
    hold_loss.append(cum_loss/cum_records_epoch)

# Plot the per-epoch average training loss.
plt.plot(np.array(hold_loss))
plt.show()
可能的原因
由于不清楚所说的“有时差异很大”(sometimes dramatic differences)具体指什么,很难给出确切的回答;但考虑到伪随机数生成器 (PRNG) 和种子的工作方式,运行 [block_1 x1; block_2 x1] xN(即“运行一次 block_1 再运行一次 block_2,并将这两步重复 N 次”)与运行 [block_1 x1; block_2 xN] x1 得到不同的结果是合理的。
在第一种情况下,每次运行 block_2 之前都会由 block_1 重新初始化 PRNG,因此 block_2 的 N 次运行都会得到同一个伪随机数序列——该序列由紧接在它之前运行的 block_1 播种。
在第二种情况下,PRNG 只在 block_1 的那一次运行中被初始化一次,因此 block_2 的每次运行都会得到不同的随机值。
(有关 PRNG 和种子的更多信息,您可以查看:random.seed(): What does it do?)
简化示例
让我们假设 numpy/CUDA/pytorch 实际上使用了一个非常差的 PRNG,它只会返回递增的值(即 PRNG(x_n) = PRNG(x_(n-1)) + 1,其中 x_0 = seed)。如果用 0 作为这个生成器的种子,那么第一次调用 random() 将返回 1,第二次调用返回 2,依此类推。
为了示例,现在让我们也简化您的块:
def block_1():
    """Seed the toy PRNG with 0 and print the seed that was used."""
    seed = 0
    print("seed: {}".format(seed))
    prng.seed(seed)
--
def block_2():
    """Draw four values from the toy PRNG and print them on one line."""
    res = "random results:"
    for _ in range(4):
        res += " {}".format(prng.random())
    print(res)
让我们比较一下 N=3 时的 [block_1 x1; block_2 x1] xN 和 [block_1 x1; block_2 xN] x1:
# [block_1 x1; block_2 x1] x3: the PRNG is re-seeded before every block_2
# run, so each run prints the same sequence.
for i in range(3):
    block_1()
    block_2()
# > seed: 0
# > random results: 1 2 3 4
# > seed: 0
# > random results: 1 2 3 4
# > seed: 0
# > random results: 1 2 3 4
# [block_1 x1; block_2 x3] x1: the PRNG is seeded once, so every block_2
# run continues the sequence where the previous run stopped.
block_1()
for i in range(3):
    block_2()
# > seed: 0
# > random results: 1 2 3 4
# > random results: 5 6 7 8
# > random results: 9 10 11 12
我有一个用 PyTorch 写的简单玩具神经网络。我设置了在文档中能找到的所有随机种子,以及 numpy 的随机种子。
如果我把下面的代码从上到下完整运行,结果似乎是可重现的。
但是,如果我只运行一次区块 1,然后反复运行区块 2,每次的结果都会不同(有时差异很大)。我不确定为什么会这样,因为每次都会重新初始化网络并重置优化器。
我使用的是 0.4.0 版本。
区块 #1
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.utils.data as utils_data
from torch.autograd import Variable
from torch import optim, nn
from torch.utils.data import Dataset
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, xavier_normal_,uniform_
torch.manual_seed(123)
import random
random.seed(123)
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
%matplotlib inline
# Toggle GPU execution; when enabled, the CUDA RNG is seeded as well.
cuda = True
if cuda:
    torch.cuda.manual_seed(123)

# Load the Boston data set from scikit-learn.
boston = load_boston()
x = boston.data
y = boston.target
y = y.reshape(y.shape[0], 1)  # targets as a single column

# Train/test split; shuffle=False keeps the original row order.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=123, shuffle=False)

# Training set: convert to tensors, wrap in a dataset, expose via a loader.
x_train = torch.from_numpy(x_train)
y_train = torch.from_numpy(y_train)
training_samples = utils_data.TensorDataset(x_train, y_train)
data_loader_trn = utils_data.DataLoader(
    training_samples, batch_size=64, drop_last=False)

# Test set: same treatment as the training set.
x_test = torch.from_numpy(x_test)
y_test = torch.from_numpy(y_test)
testing_samples = utils_data.TensorDataset(x_test, y_test)
data_loader_test = utils_data.DataLoader(
    testing_samples, batch_size=64, drop_last=False)
# Simple model: one hidden layer followed by dropout.
class Net(nn.Module):
    """Small fully connected network: fc1 -> ReLU -> dropout -> fc2."""

    def __init__(self):
        super(Net, self).__init__()
        # The input width is read from the module-level feature matrix `x`.
        self.fc1 = nn.Linear(x.shape[1], 20)
        # Override the default weight init of fc1 with Xavier-uniform.
        xavier_uniform_(self.fc1.weight.data)
        self.drop = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(20, 1)

    def forward(self, x):
        """Run the forward pass on `x` and return the output of fc2."""
        x = F.relu(self.fc1(x))
        x = self.drop(x)
        x = self.fc2(x)
        return x
区块 #2
# NOTE: no RNG is re-seeded in this block; the seeds are only set in
# block #1, so re-running this block alone continues the same random stream.
net = Net()
if cuda:
    net.cuda()

# Create the optimizer (Adam with its default settings).
optimizer = optim.Adam(net.parameters())
# Create the loss function (MSE); size_average=False is intended to give a
# per-batch sum, which is divided by the record count further below.
loss = nn.MSELoss(size_average=False)

# Run the main training loop.
epochs = 20
hold_loss = []
for epoch in range(epochs):
    cum_loss = 0.
    cum_records_epoch = 0
    for batch_idx, (data, target) in enumerate(data_loader_trn):
        tr_x, tr_y = data.float(), target.float()
        if cuda:
            tr_x, tr_y = tr_x.cuda(), tr_y.cuda()

        # Reset gradient
        optimizer.zero_grad()

        # Forward pass
        fx = net(tr_x)
        output = loss(fx, tr_y)  # loss for this batch
        cum_loss += output.item()  # accumulate the loss

        # Backward
        output.backward()

        # Update parameters based on backprop
        optimizer.step()
        cum_records_epoch += len(tr_x)
        # Log interval of 1: report every batch.
        if batch_idx % 1 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, cum_records_epoch, len(data_loader_trn.dataset),
                100. * (batch_idx+1) / len(data_loader_trn), output.item()))
    print('Epoch average loss: {:.6f}'.format(cum_loss/cum_records_epoch))
    hold_loss.append(cum_loss/cum_records_epoch)

# Plot the per-epoch average training loss.
plt.plot(np.array(hold_loss))
plt.show()
可能的原因
由于不清楚所说的“有时差异很大”(sometimes dramatic differences)具体指什么,很难给出确切的回答;但考虑到伪随机数生成器 (PRNG) 和种子的工作方式,运行 [block_1 x1; block_2 x1] xN(即“运行一次 block_1 再运行一次 block_2,并将这两步重复 N 次”)与运行 [block_1 x1; block_2 xN] x1 得到不同的结果是合理的。
在第一种情况下,每次运行 block_2 之前都会由 block_1 重新初始化 PRNG,因此 block_2 的 N 次运行都会得到同一个伪随机数序列——该序列由紧接在它之前运行的 block_1 播种。
在第二种情况下,PRNG 只在 block_1 的那一次运行中被初始化一次,因此 block_2 的每次运行都会得到不同的随机值。
(有关 PRNG 和种子的更多信息,您可以查看:random.seed(): What does it do?)
简化示例
让我们假设 numpy/CUDA/pytorch 实际上使用了一个非常差的 PRNG,它只会返回递增的值(即 PRNG(x_n) = PRNG(x_(n-1)) + 1,其中 x_0 = seed)。如果用 0 作为这个生成器的种子,那么第一次调用 random() 将返回 1,第二次调用返回 2,依此类推。
为了示例,现在让我们也简化您的块:
def block_1():
    """Seed the toy PRNG with 0 and print the seed that was used."""
    seed = 0
    print("seed: {}".format(seed))
    prng.seed(seed)
--
def block_2():
    """Draw four values from the toy PRNG and print them on one line."""
    res = "random results:"
    for _ in range(4):
        res += " {}".format(prng.random())
    print(res)
让我们比较一下 N=3 时的 [block_1 x1; block_2 x1] xN 和 [block_1 x1; block_2 xN] x1:
# [block_1 x1; block_2 x1] x3: the PRNG is re-seeded before every block_2
# run, so each run prints the same sequence.
for i in range(3):
    block_1()
    block_2()
# > seed: 0
# > random results: 1 2 3 4
# > seed: 0
# > random results: 1 2 3 4
# > seed: 0
# > random results: 1 2 3 4
# [block_1 x1; block_2 x3] x1: the PRNG is seeded once, so every block_2
# run continues the sequence where the previous run stopped.
block_1()
for i in range(3):
    block_2()
# > seed: 0
# > random results: 1 2 3 4
# > random results: 5 6 7 8
# > random results: 9 10 11 12