PyTorch LSTM for Daily Stock Return Prediction - Train loss is consistently higher than test loss
I was wondering if anyone could share some ideas on why my training loss starts off at a higher level than my test loss?
I'm running an LSTM with daily stock return data as the only input, using the previous 10 days to predict the next day's price. The training/test/validation sets don't overlap, so there is no leakage, and I'm not using any regularisation that would only affect the training data.
I'm really confused at the moment because I can't seem to find the mistake.
I'll include the code below, but it's quite long.
# Defining the LSTM class
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
class LSTM(nn.Module):
    def __init__(self, n_inputs, n_hidden, num_layers, n_outputs):
        super(LSTM, self).__init__()
        self.D = n_inputs
        self.M = n_hidden
        self.K = n_outputs
        self.L = num_layers
        self.rnn = nn.LSTM(
            input_size=self.D,
            hidden_size=self.M,
            num_layers=self.L,
            batch_first=True)
        self.fc = nn.Linear(self.M, self.K)

    def forward(self, X):
        # initial hidden states
        h0 = torch.zeros(self.L, X.size(0), self.M).to(device)
        c0 = torch.zeros(self.L, X.size(0), self.M).to(device)
        # get RNN unit output
        out, _ = self.rnn(X, (h0, c0))
        # we only want h(T) at the final time step
        out = self.fc(out[:, -1, :])
        return out
# Defining a function to train the LSTM
def full_gd(model,
            loss_function,
            optimizer,
            X_train,
            y_train,
            X_test,
            y_test,
            no_epochs):
    # Stuff to store
    train_losses = np.zeros(no_epochs)
    test_losses = np.zeros(no_epochs)
    for it in range(no_epochs):
        # zero the parameter gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(X_train)
        loss = loss_function(outputs, y_train)
        # Backward and optimize
        loss.backward()
        optimizer.step()
        # Save losses
        train_losses[it] = loss.item()
        # Test loss
        test_outputs = model(X_test)
        test_loss = loss_function(test_outputs, y_test)
        test_losses[it] = test_loss.item()
        if (it + 1) % 10 == 0:
            print(f'Epoch {it+1}/{no_epochs}, Train Loss: {loss.item():.4f}, Test Loss: {test_loss.item():.4f}')
    return train_losses, test_losses
# Use sklearn's StandardScaler to scale the returns data (fitted on the training portion only)
scaler = StandardScaler()
scaler.fit(data[:3*len(data)//5])
historical_returns = scaler.transform(data)

# Creating the dataset to train the LSTM. D is the number of input features. T is the number of data points used in forecasting
T = 10
D = 1
X = []
Y = []
for t in range(len(historical_returns) - T):
    x = historical_returns[t:t+T]
    X.append(x)
    y = historical_returns[t+T]
    Y.append(y)
X_historical = np.array(X).reshape(-1, T, 1)
Y_historical = np.array(Y).reshape(-1, 1)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Splitting the data into a 60/20/20 train/validation/test split. No random split is used here as this is a time series dataset
x_train1 = torch.from_numpy(X_historical[:3*len(historical_returns)//5].astype(np.float32))
y_train1 = torch.from_numpy(Y_historical[:3*len(historical_returns)//5].astype(np.float32))
x_val1 = torch.from_numpy(X_historical[-2*len(historical_returns)//5: -1*len(historical_returns)//5].astype(np.float32))
y_val1 = torch.from_numpy(Y_historical[-2*len(historical_returns)//5: -1*len(historical_returns)//5].astype(np.float32))
x_test1 = torch.from_numpy(X_historical[-1*len(historical_returns)//5:].astype(np.float32))
y_test1 = torch.from_numpy(Y_historical[-1*len(historical_returns)//5:].astype(np.float32))
# move data to GPU
x_train1, y_train1 = x_train1.to(device), y_train1.to(device)
x_val1, y_val1 = x_val1.to(device), y_val1.to(device)
x_test1, y_test1 = x_test1.to(device), y_test1.to(device)
x_train1 = x_train1.reshape(-1, T, 1)
x_test1 = x_test1.reshape(-1, T, 1)
x_val1 = x_val1.reshape(-1, T, 1)
# Define the model parameters
Hidden = 10
model = LSTM(1, Hidden, 1, 1)
model.to(device)
loss_function = nn.MSELoss()
learning_rate = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Train the model
no_epochs = 200
train_losses, validation_losses = full_gd(model,
                                          loss_function,
                                          optimizer,
                                          x_train1,
                                          y_train1,
                                          x_val1,
                                          y_val1,
                                          no_epochs)
# Plot training and validation loss
plt.figure(figsize=(12,8))
plt.plot(train_losses, label='train loss')
plt.plot(validation_losses, label='validation loss')
plt.legend()
plt.show()
Well, there could be several reasons:
- Your task is hard, or the data you have is hard.
- Your validation split happens to contain comparatively easy examples.
Another natural cause of this is the dataset size, since the validation split is much smaller than the training split. In theory, with random guessing (which is roughly the initial state of the model), you are more likely to rack up failures over a large number of guesses.
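A quick way to test whether the validation period really is "easier" is to score a trivial baseline on both splits. The snippet below is only a sketch against the y_train1 and y_val1 tensors defined in the question's code; because the targets are standardised on the training period, a constant zero prediction gives each split's variance as its MSE:

# MSE of a constant zero prediction (the training mean after standardisation).
# A much lower value on the validation split would mean that period simply has
# lower-variance returns, so a gap between the two loss curves is expected even
# before the model has learned anything.
with torch.no_grad():
    baseline_train_mse = torch.mean(y_train1 ** 2).item()
    baseline_val_mse = torch.mean(y_val1 ** 2).item()
print(f'Zero-prediction MSE - train: {baseline_train_mse:.4f}, validation: {baseline_val_mse:.4f}')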
Your model does not seem to be learning: it performs poorly even on the training data, which is not what you want. Keep in mind that RNNs are hard to train. There are a few things you can try that may help, such as increasing the number of epochs or making the model more complex, as sketched below. If you can compare your results with other work on the same data, you should do so; that will tell you how good or bad your experiments are.
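As a concrete starting point for those suggestions, you could reuse the LSTM class and full_gd function from the question with a larger configuration and a longer run; the numbers below are only illustrative, not tuned values:

# Hypothetical larger configuration - the exact numbers are arbitrary and worth tuning.
Hidden = 64      # more hidden units than the original 10
Layers = 2       # stacked LSTM layers
model = LSTM(1, Hidden, Layers, 1)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # a smaller learning rate is often more stable
no_epochs = 1000
train_losses, validation_losses = full_gd(model, loss_function, optimizer,
                                          x_train1, y_train1, x_val1, y_val1,
                                          no_epochs)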