在 Python 中使用 LSTM 预测未来值
Forecast future values with LSTM in Python
此代码预测指定股票截至当前日期的值,但不预测超出训练数据集的日期。这段代码来自我之前提出的一个问题,所以我对它的理解很低。我假设解决方案是一个简单的变量更改以增加额外的时间,但我不知道需要操纵哪个值。
import pandas as pd
import numpy as np
import yfinance as yf
import os
import matplotlib.pyplot as plt
from IPython.display import display
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
pd.options.mode.chained_assignment = None
# download the data
df = yf.download(tickers=['AAPL'], period='2y')
# split the data
train_data = df[['Close']].iloc[: - 200, :]
valid_data = df[['Close']].iloc[- 200:, :]
# scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train_data)
train_data = scaler.transform(train_data)
valid_data = scaler.transform(valid_data)
# extract the training sequences
x_train, y_train = [], []
for i in range(60, train_data.shape[0]):
x_train.append(train_data[i - 60: i, 0])
y_train.append(train_data[i, 0])
x_train = np.array(x_train)
y_train = np.array(y_train)
# extract the validation sequences
x_valid = []
for i in range(60, valid_data.shape[0]):
x_valid.append(valid_data[i - 60: i, 0])
x_valid = np.array(x_valid)
# reshape the sequences
x_train = x_train.reshape(x_train.shape[0],
x_train.shape[1], 1)
x_valid = x_valid.reshape(x_valid.shape[0],
x_valid.shape[1], 1)
# train the model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True,
input_shape=x_train.shape[1:]))
model.add(LSTM(units=50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=50, batch_size=128, verbose=1)
# generate the model predictions
y_pred = model.predict(x_valid)
y_pred = scaler.inverse_transform(y_pred)
y_pred = y_pred.flatten()
# plot the model predictions
df.rename(columns={'Close': 'Actual'}, inplace=True)
df['Predicted'] = np.nan
df['Predicted'].iloc[- y_pred.shape[0]:] = y_pred
df[['Actual', 'Predicted']].plot(title='AAPL')
display(df)
plt.show()
您可以训练您的模型来预测未来的序列(例如接下来的 30 天),而不是像现在这样预测下一个值(第二天)。
为此,您需要将输出定义为 y[t: t + H]
(而不是当前代码中的 y[t]
),其中 y
是时间序列,H
是预测期的长度(即您要预测的未来天数)。您还需要将最后一层的输出数量设置为 H
(而不是当前代码中的 1
)。
您仍然可以将输入定义为 y[t - T: t]
,其中 T
是回顾期的长度(或时间步数),因此模型的输入形状仍然是 (T, 1)
.回溯期 T
通常长于预测期 H
(即 T > H
),并且通常设置为 H
的倍数(即 T = m * H
,其中m > 1
是一个整数。)。
import numpy as np
import pandas as pd
import yfinance as yf
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
pd.options.mode.chained_assignment = None
tf.random.set_seed(0)
# download the data
df = yf.download(tickers=['AAPL'], period='1y')
y = df['Close'].fillna(method='ffill')
y = y.values.reshape(-1, 1)
# scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(y)
y = scaler.transform(y)
# generate the input and output sequences
n_lookback = 60 # length of input sequences (lookback period)
n_forecast = 30 # length of output sequences (forecast period)
X = []
Y = []
for i in range(n_lookback, len(y) - n_forecast + 1):
X.append(y[i - n_lookback: i])
Y.append(y[i: i + n_forecast])
X = np.array(X)
Y = np.array(Y)
# fit the model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(n_lookback, 1)))
model.add(LSTM(units=50))
model.add(Dense(n_forecast))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X, Y, epochs=100, batch_size=32, verbose=0)
# generate the forecasts
X_ = y[- n_lookback:] # last available input sequence
X_ = X_.reshape(1, n_lookback, 1)
Y_ = model.predict(X_).reshape(-1, 1)
Y_ = scaler.inverse_transform(Y_)
# organize the results in a data frame
df_past = df[['Close']].reset_index()
df_past.rename(columns={'index': 'Date', 'Close': 'Actual'}, inplace=True)
df_past['Date'] = pd.to_datetime(df_past['Date'])
df_past['Forecast'] = np.nan
df_past['Forecast'].iloc[-1] = df_past['Actual'].iloc[-1]
df_future = pd.DataFrame(columns=['Date', 'Actual', 'Forecast'])
df_future['Date'] = pd.date_range(start=df_past['Date'].iloc[-1] + pd.Timedelta(days=1), periods=n_forecast)
df_future['Forecast'] = Y_.flatten()
df_future['Actual'] = np.nan
results = df_past.append(df_future).set_index('Date')
# plot the results
results.plot(title='AAPL')
此代码预测指定股票截至当前日期的值,但不预测超出训练数据集的日期。这段代码来自我之前提出的一个问题,所以我对它的理解很低。我假设解决方案是一个简单的变量更改以增加额外的时间,但我不知道需要操纵哪个值。
import pandas as pd
import numpy as np
import yfinance as yf
import os
import matplotlib.pyplot as plt
from IPython.display import display
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
pd.options.mode.chained_assignment = None
# download the data
df = yf.download(tickers=['AAPL'], period='2y')
# split the data
train_data = df[['Close']].iloc[: - 200, :]
valid_data = df[['Close']].iloc[- 200:, :]
# scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train_data)
train_data = scaler.transform(train_data)
valid_data = scaler.transform(valid_data)
# extract the training sequences
x_train, y_train = [], []
for i in range(60, train_data.shape[0]):
x_train.append(train_data[i - 60: i, 0])
y_train.append(train_data[i, 0])
x_train = np.array(x_train)
y_train = np.array(y_train)
# extract the validation sequences
x_valid = []
for i in range(60, valid_data.shape[0]):
x_valid.append(valid_data[i - 60: i, 0])
x_valid = np.array(x_valid)
# reshape the sequences
x_train = x_train.reshape(x_train.shape[0],
x_train.shape[1], 1)
x_valid = x_valid.reshape(x_valid.shape[0],
x_valid.shape[1], 1)
# train the model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True,
input_shape=x_train.shape[1:]))
model.add(LSTM(units=50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=50, batch_size=128, verbose=1)
# generate the model predictions
y_pred = model.predict(x_valid)
y_pred = scaler.inverse_transform(y_pred)
y_pred = y_pred.flatten()
# plot the model predictions
df.rename(columns={'Close': 'Actual'}, inplace=True)
df['Predicted'] = np.nan
df['Predicted'].iloc[- y_pred.shape[0]:] = y_pred
df[['Actual', 'Predicted']].plot(title='AAPL')
display(df)
plt.show()
您可以训练您的模型来预测未来的序列(例如接下来的 30 天),而不是像现在这样预测下一个值(第二天)。
为此,您需要将输出定义为 y[t: t + H]
(而不是当前代码中的 y[t]
),其中 y
是时间序列,H
是预测期的长度(即您要预测的未来天数)。您还需要将最后一层的输出数量设置为 H
(而不是当前代码中的 1
)。
您仍然可以将输入定义为 y[t - T: t]
,其中 T
是回顾期的长度(或时间步数),因此模型的输入形状仍然是 (T, 1)
.回溯期 T
通常长于预测期 H
(即 T > H
),并且通常设置为 H
的倍数(即 T = m * H
,其中m > 1
是一个整数。)。
import numpy as np
import pandas as pd
import yfinance as yf
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
pd.options.mode.chained_assignment = None
tf.random.set_seed(0)
# download the data
df = yf.download(tickers=['AAPL'], period='1y')
y = df['Close'].fillna(method='ffill')
y = y.values.reshape(-1, 1)
# scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(y)
y = scaler.transform(y)
# generate the input and output sequences
n_lookback = 60 # length of input sequences (lookback period)
n_forecast = 30 # length of output sequences (forecast period)
X = []
Y = []
for i in range(n_lookback, len(y) - n_forecast + 1):
X.append(y[i - n_lookback: i])
Y.append(y[i: i + n_forecast])
X = np.array(X)
Y = np.array(Y)
# fit the model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(n_lookback, 1)))
model.add(LSTM(units=50))
model.add(Dense(n_forecast))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X, Y, epochs=100, batch_size=32, verbose=0)
# generate the forecasts
X_ = y[- n_lookback:] # last available input sequence
X_ = X_.reshape(1, n_lookback, 1)
Y_ = model.predict(X_).reshape(-1, 1)
Y_ = scaler.inverse_transform(Y_)
# organize the results in a data frame
df_past = df[['Close']].reset_index()
df_past.rename(columns={'index': 'Date', 'Close': 'Actual'}, inplace=True)
df_past['Date'] = pd.to_datetime(df_past['Date'])
df_past['Forecast'] = np.nan
df_past['Forecast'].iloc[-1] = df_past['Actual'].iloc[-1]
df_future = pd.DataFrame(columns=['Date', 'Actual', 'Forecast'])
df_future['Date'] = pd.date_range(start=df_past['Date'].iloc[-1] + pd.Timedelta(days=1), periods=n_forecast)
df_future['Forecast'] = Y_.flatten()
df_future['Actual'] = np.nan
results = df_past.append(df_future).set_index('Date')
# plot the results
results.plot(title='AAPL')