带有 Numpy 的 LSTM,找不到确定的算法

LSTM with Numpy, can't find a definitive algorithm

我正在从头开始做 LSTM 并遵循本指南,但损失没有减少反而增加了。这是迄今为止我找到的最好的指南,但这并不能说明什么,因为即使是这个指南也不完整。除了发现特定于我的代码的问题外,我将感谢任何显示 LSTM 完整序列的来源。

https://wiseodd.github.io/techblog/2016/08/12/lstm-backprop/

模型定义:

import numpy as np


H = 128 # Number of LSTM layer's neurons
D = ... # Number of input dimension == number of items in vocabulary
Z = H + D # Because we will concatenate LSTM state with the input

model = dict(
    Wf=np.random.randn(Z, H) / np.sqrt(Z / 2.),
    Wi=np.random.randn(Z, H) / np.sqrt(Z / 2.),
    Wc=np.random.randn(Z, H) / np.sqrt(Z / 2.),
    Wo=np.random.randn(Z, H) / np.sqrt(Z / 2.),
    Wy=np.random.randn(H, D) / np.sqrt(D / 2.),
    bf=np.zeros((1, H)),
    bi=np.zeros((1, H)),
    bc=np.zeros((1, H)),
    bo=np.zeros((1, H)),
    by=np.zeros((1, D))
)

我的模型:

# RNN class
class RNN:

    def __init__(self, n, d, RL, LR):
        """Pass input size (n), number of memory cells (d), recurrence length (RL), and learning rate (LR)"""
        self.n, self.d, self.z, z = n, d, n + d, n + d
        self.d = d
        self.z, z = n + d, n + d
        self.RL = RL
        self.LR = LR

        self.x = []

        self.Cells = [Cell(n, d, self)]

        self.Wi, self.Wf, self.Wo, self.Wc, self.Wy = randn(z, d) / sqrt(z / 2), randn(z, d) / sqrt(z / 2), randn(z, d) / sqrt(z / 2), randn(z, d) / sqrt(z / 2), randn(d, n) / sqrt(d / 2)
        self.bi, self.bf, self.bo, self.bc, self.by = randn(d, 1), randn(d, 1), randn(d, 1), randn(d, 1), randn(n, 1)
        self.dWi, self.dWf, self.dWo, self.dWc, self.dWy = zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((d, n))
        self.dbi, self.dbf, self.dbo, self.dbc, self.dby = zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((n, 1))

我的手机:

class Cell:

def __init__(self, n, d, rnn):
    """Pass the input size (n) and memory cell size (d), create hidden state of size d, pass rnn (self)"""
    self.n, self.d, self.h, self.z, z = n, d, zeros((d, 1)), n + d, n + d
    self.rnn = rnn

他们的前馈:

def lstm_forward(X, state):
    m = model
    Wf, Wi, Wc, Wo, Wy = m['Wf'], m['Wi'], m['Wc'], m['Wo'], m['Wy']
    bf, bi, bc, bo, by = m['bf'], m['bi'], m['bc'], m['bo'], m['by']

    h_old, c_old = state

    # One-hot encode
    X_one_hot = np.zeros(D)
    X_one_hot[X] = 1.
    X_one_hot = X_one_hot.reshape(1, -1)

    # Concatenate old state with current input
    X = np.column_stack((h_old, X_one_hot))

    hf = sigmoid(X @ Wf + bf)
    hi = sigmoid(X @ Wi + bi)
    ho = sigmoid(X @ Wo + bo)
    hc = tanh(X @ Wc + bc)

    c = hf * c_old + hi * hc
    h = ho * tanh(c)

    y = h @ Wy + by
    prob = softmax(y)

    state = (h, c) # Cache the states of current h & c for next iter
    cache = ... # Add all intermediate variables to this cache

    return prob, state, cache

我的前馈:

def feedforward(self, x, c_, h_):
    """Pass an input of size n, the previous hidden state(ht), and the previous cell state(c)"""
    n, d = self.n, self.d
    Wi, Wf, Wo, Wc, Wy = self.rnn.Wi, self.rnn.Wf, self.rnn.Wo, self.rnn.Wc, self.rnn.Wy
    bi, bf, bo, bc, by = self.rnn.bi, self.rnn.bf, self.rnn.bo, self.rnn.bc, self.rnn.by

    index = x       # one hot encoding
    x = zeros((n, 1))
    x[index] = 1
    g = concat((x, h_))         # input g is input x + previous hidden state

    it = sigmoid(dot(Wi.T, g) + bi)     # gate activations
    ft = sigmoid(dot(Wf.T, g) + bf)
    ot = sigmoid(dot(Wo.T, g) + bo)
    ct = tanh(dot(Wc.T, g) + bc)        # non linearity activation
    c = ft * c_ + it * ct       # cell state

    ht = ot * tanh(c)       # squashed hidden state
    yt = dot(Wy.T, ht) + by     # output state
    p = softmax(yt)     # call softmax, get probability

    self.c_, self.h_ = c_, h_
    self.it, self.ft, self.ot, self.ct = it, ft, ot, ct
    self.c, self.ht, self.yt, self.p, self.g = c, ht, yt, p, g

    return ht, c

他们的反向传播:

def lstm_backward(prob, y_train, d_next, cache):
    # Unpack the cache variable to get the intermediate variables used in forward step
    ... = cache
    dh_next, dc_next = d_next

    # Softmax loss gradient
    dy = prob.copy()
    dy[1, y_train] -= 1.

    # Hidden to output gradient
    dWy = h.T @ dy
    dby = dy
    # Note we're adding dh_next here
    dh = dy @ Wy.T + dh_next

    # Gradient for ho in h = ho * tanh(c)
    dho = tanh(c) * dh
    dho = dsigmoid(ho) * dho

    # Gradient for c in h = ho * tanh(c), note we're adding dc_next here
    dc = ho * dh * dtanh(c)
    dc = dc + dc_next

    # Gradient for hf in c = hf * c_old + hi * hc
    dhf = c_old * dc
    dhf = dsigmoid(hf) * dhf

    # Gradient for hi in c = hf * c_old + hi * hc
    dhi = hc * dc
    dhi = dsigmoid(hi) * dhi

    # Gradient for hc in c = hf * c_old + hi * hc
    dhc = hi * dc
    dhc = dtanh(hc) * dhc

    # Gate gradients, just a normal fully connected layer gradient
    dWf = X.T @ dhf
    dbf = dhf
    dXf = dhf @ Wf.T

    dWi = X.T @ dhi
    dbi = dhi
    dXi = dhi @ Wi.T

    dWo = X.T @ dho
    dbo = dho
    dXo = dho @ Wo.T

    dWc = X.T @ dhc
    dbc = dhc
    dXc = dhc @ Wc.T

    # As X was used in multiple gates, the gradient must be accumulated here
    dX = dXo + dXc + dXi + dXf
    # Split the concatenated X, so that we get our gradient of h_old
    dh_next = dX[:, :H]
    # Gradient for c_old in c = hf * c_old + hi * hc
    dc_next = hf * dc

    grad = dict(Wf=dWf, Wi=dWi, Wc=dWc, Wo=dWo, Wy=dWy, bf=dbf, bi=dbi, bc=dbc, bo=dbo, by=dby)
    state = (dh_next, dc_next)

    return grad, state

我的反向传播:

def backpropagate(self, y, ht1, ct1):

    n, d = self.n, self.d
    Wi, Wf, Wo, Wc, Wy = self.rnn.Wi, self.rnn.Wf, self.rnn.Wo, self.rnn.Wc, self.rnn.Wy
    dWi, dWf, dWo, dWc, dWy = self.rnn.dWi, self.rnn.dWf, self.rnn.dWo, self.rnn.dWc, self.rnn.dWy
    dbi, dbf, dbo, dbc, dby = self.rnn.dbi, self.rnn.dbf, self.rnn.dbo, self.rnn.dbc, self.rnn.dby
    c_, h_ = self.c_, self.h_
    it, ft, ot, ct = self.it, self.ft, self.ot, self.ct
    c, ht, yt, p = self.c, self.ht, self.yt, self.p
    g = self.g

    dy = copy(p)
    dy[y] -= 1

    loss = cross_ent(p, y)

    dh = dot(Wy, dy) + ht1
    dh = clip(dh, -6, 6)

    do = tanh(c) * dh
    do = dsigmoid(ot) * do

    dc = ot * dh * dtanh(c)
    dc = dc + ct1
    dc = clip(dc, -6, 6)

    df = c_ * dc
    df = dsigmoid(ft) * df

    di = ct * dc
    di = dsigmoid(it) * di

    dct = it * dc
    dct = dtanh(ct) * dct

    dWf += dot(g, df.T)
    dWi += dot(g, di.T)
    dWo += dot(g, do.T)
    dWc += dot(g, dc.T)
    dWy += dot(ht, dy.T)

    dbf += df
    dbi += di
    dbo += do
    dbc += dc
    dby += dy

    dxi = dot(Wi, di)
    dxf = dot(Wf, df)
    dxo = dot(Wo, do)
    dxc = dot(Wc, dct)

    dx = dxf + dxi + dxo + dxc

    dht1 = dx[n:]
    dct1 = ft * dc

    return loss, dht1, dct1

他们的训练步骤:

def train_step(X_train, y_train, state):
    probs = []
    caches = []
    loss = 0.
    h, c = state

    # Forward Step

    for x, y_true in zip(X_train, y_train):
        prob, state, cache = lstm_forward(x, state, train=True)
        loss += cross_entropy(prob, y_true)

        # Store forward step result to be used in backward step
        probs.append(prob)
        caches.append(cache)

    # The loss is the average cross entropy
    loss /= X_train.shape[0]

    # Backward Step

    # Gradient for dh_next and dc_next is zero for the last timestep
    d_next = (np.zeros_like(h), np.zeros_like(c))
    grads = {k: np.zeros_like(v) for k, v in model.items()}

    # Go backward from the last timestep to the first
    for prob, y_true, cache in reversed(list(zip(probs, y_train, caches))):
        grad, d_next = lstm_backward(prob, y_true, d_next, cache)

        # Accumulate gradients from all timesteps
        for k in grads.keys():
            grads[k] += grad[k]

    return grads, loss, state

我的训练步骤:

def FeedForward(self, inputs, ht_, ct_):

    n, d, rl, Cells = self.n, self.d, self.RL, self.Cells

    while len(Cells) < rl:
        Cells.append(Cell(n, d, self))

    for cell, x in zip(Cells, range(len(inputs))):
        ht_, ct_ = cell.feedforward(x, ht_, ct_)

    return ht_, ct_



def BPTT(self, outputs, ht1, ct1):

    n, d, z, rl = self.n, self.d, self.n + self.d, self.RL
    Cells = self.Cells

    avg_loss = 0

    for i in reversed(range(rl)):
        loss, ht1, ct1 = Cells[i].backpropagate(outputs[i], ht1, ct1)
        avg_loss += loss

    avg_loss /= rl

    return avg_loss, ht1, ct1


def train(self, inputs, outputs):

    n, d, z, rl = self.n, self.d, self.n + self.d, self.RL
    index = 0
    LR = 0.1
    loss = 0

    ht_, ct_ = zeros((d, 1)), zeros((d, 1))
    ht1, ct1 = zeros((d, 1)), zeros((d, 1))

    while index < len(outputs):
        xlist = inputs[index:index + rl]
        ylist = outputs[index:index + rl]
        ht_, ct_ = self.FeedForward(xlist, ht_, ct_)
        loss, ht1, ct1 = self.BPTT(ylist, ht1, ct1)
        #print(loss)
        self.update(LR)
        index += rl

def update(self, LR):

    n, d, z = self.n, self.d, self.n + self.d

    self.Wi -= LR * self.dWi
    self.Wf -= LR * self.dWf
    self.Wo -= LR * self.dWo
    self.Wc -= LR * self.dWc
    self.Wy -= LR * self.dWy
    self.bi -= LR * self.dbi
    self.bf -= LR * self.dbf
    self.bo -= LR * self.dbo
    self.bc -= LR * self.dbc
    self.by -= LR * self.dby

    self.dWi, self.dWf, self.dWo, self.dWc, self.dWy = zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((d, n))
    self.dbi, self.dbf, self.dbo, self.dbc, self.dby = zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((n, 1))

我的代码总数:

# Import logistic function that doesn't explode outside a 64 bit float
from scipy.special import expit as sigmoid
from numpy import zeros, zeros_like, tanh, exp, sum, dot, sqrt, log, argmax, concatenate as concat, copy
from numpy.random import randn


# derivative of sigmoid function
def dsigmoid(z):
    return sigmoid(z) * (1 - sigmoid(z))

# derivative of hyperbolic tangent
def dtanh(z):
    return 1 - tanh(z) ** 2

# probability function
def softmax(z):
    return exp(z) / sum(exp(z))

# cross entropy loss
def cross_ent(p, y):
    return -log(p[y])


# RNN class
class RNN:
def __init__(self, n, d, RL, LR):
    """Pass input size (n), number of memory cells (d), recurrence length (RL), and learning rate (LR)"""
    self.n, self.d, self.z, z = n, d, n + d, n + d
    self.d = d
    self.z, z = n + d, n + d
    self.RL = RL
    self.LR = LR

    self.x = []

    self.Cells = [Cell(n, d, self)]

    self.Wi, self.Wf, self.Wo, self.Wc, self.Wy = randn(z, d) / sqrt(z / 2), randn(z, d) / sqrt(z / 2), randn(z, d) / sqrt(z / 2), randn(z, d) / sqrt(z / 2), randn(d, n) / sqrt(d / 2)
    self.bi, self.bf, self.bo, self.bc, self.by = randn(d, 1), randn(d, 1), randn(d, 1), randn(d, 1), randn(n, 1)
    self.dWi, self.dWf, self.dWo, self.dWc, self.dWy = zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((d, n))
    self.dbi, self.dbf, self.dbo, self.dbc, self.dby = zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((n, 1))

def FeedForward(self, inputs, ht_, ct_):

    n, d, rl, Cells = self.n, self.d, self.RL, self.Cells

    while len(Cells) < rl:
        Cells.append(Cell(n, d, self))

    for cell, x in zip(Cells, range(len(inputs))):
        ht_, ct_ = cell.feedforward(x, ht_, ct_)

    return ht_, ct_



def BPTT(self, outputs, ht1, ct1):

    n, d, z, rl = self.n, self.d, self.n + self.d, self.RL
    Cells = self.Cells

    avg_loss = 0

    for i in reversed(range(rl)):
        loss, ht1, ct1 = Cells[i].backpropagate(outputs[i], ht1, ct1)
        avg_loss += loss

    avg_loss /= rl

    return avg_loss, ht1, ct1


def train(self, inputs, outputs):

    n, d, z, rl = self.n, self.d, self.n + self.d, self.RL
    index = 0
    LR = 0.1
    loss = 0

    ht_, ct_ = zeros((d, 1)), zeros((d, 1))
    ht1, ct1 = zeros((d, 1)), zeros((d, 1))

    while index < len(outputs):
        xlist = inputs[index:index + rl]
        ylist = outputs[index:index + rl]
        ht_, ct_ = self.FeedForward(xlist, ht_, ct_)
        loss, ht1, ct1 = self.BPTT(ylist, ht1, ct1)
        #print(loss)
        self.update(LR)
        index += rl

def update(self, LR):

    n, d, z = self.n, self.d, self.n + self.d

    self.Wi -= LR * self.dWi
    self.Wf -= LR * self.dWf
    self.Wo -= LR * self.dWo
    self.Wc -= LR * self.dWc
    self.Wy -= LR * self.dWy
    self.bi -= LR * self.dbi
    self.bf -= LR * self.dbf
    self.bo -= LR * self.dbo
    self.bc -= LR * self.dbc
    self.by -= LR * self.dby

    self.dWi, self.dWf, self.dWo, self.dWc, self.dWy = zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((d, n))
    self.dbi, self.dbf, self.dbo, self.dbc, self.dby = zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((n, 1))

class 单元格:

def __init__(self, n, d, rnn):
    """Pass the input size (n) and memory cell size (d), create hidden state of size d, pass rnn (self)"""
    self.n, self.d, self.h, self.z, z = n, d, zeros((d, 1)), n + d, n + d
    self.rnn = rnn


def feedforward(self, x, c_, h_):
    """Pass an input of size n, the previous hidden state(ht), and the previous cell state(c)"""
    n, d = self.n, self.d
    Wi, Wf, Wo, Wc, Wy = self.rnn.Wi, self.rnn.Wf, self.rnn.Wo, self.rnn.Wc, self.rnn.Wy
    bi, bf, bo, bc, by = self.rnn.bi, self.rnn.bf, self.rnn.bo, self.rnn.bc, self.rnn.by

    index = x       # one hot encoding
    x = zeros((n, 1))
    x[index] = 1
    g = concat((x, h_))         # input g is input x + previous hidden state

    it = sigmoid(dot(Wi.T, g) + bi)     # gate activations
    ft = sigmoid(dot(Wf.T, g) + bf)
    ot = sigmoid(dot(Wo.T, g) + bo)
    ct = tanh(dot(Wc.T, g) + bc)        # non linearity activation
    c = ft * c_ + it * ct       # cell state

    ht = ot * tanh(c)       # squashed hidden state
    yt = dot(Wy.T, ht) + by     # output state
    p = softmax(yt)     # call softmax, get probability

    self.c_, self.h_ = c_, h_
    self.it, self.ft, self.ot, self.ct = it, ft, ot, ct
    self.c, self.ht, self.yt, self.p, self.g = c, ht, yt, p, g

    return ht, c


def backpropagate(self, y, ht1, ct1):

    n, d = self.n, self.d
    Wi, Wf, Wo, Wc, Wy = self.rnn.Wi, self.rnn.Wf, self.rnn.Wo, self.rnn.Wc, self.rnn.Wy
    dWi, dWf, dWo, dWc, dWy = self.rnn.dWi, self.rnn.dWf, self.rnn.dWo, self.rnn.dWc, self.rnn.dWy
    dbi, dbf, dbo, dbc, dby = self.rnn.dbi, self.rnn.dbf, self.rnn.dbo, self.rnn.dbc, self.rnn.dby
    c_, h_ = self.c_, self.h_
    it, ft, ot, ct = self.it, self.ft, self.ot, self.ct
    c, ht, yt, p = self.c, self.ht, self.yt, self.p
    g = self.g

    dy = copy(p)
    dy[y] -= 1

    loss = cross_ent(p, y)

    dh = dot(Wy, dy) + ht1
    dh = clip(dh, -6, 6)

    do = tanh(c) * dh
    do = dsigmoid(ot) * do

    dc = ot * dh * dtanh(c)
    dc = dc + ct1
    dc = clip(dc, -6, 6)

    df = c_ * dc
    df = dsigmoid(ft) * df

    di = ct * dc
    di = dsigmoid(it) * di

    dct = it * dc
    dct = dtanh(ct) * dct

    dWf += dot(g, df.T)
    dWi += dot(g, di.T)
    dWo += dot(g, do.T)
    dWc += dot(g, dc.T)
    dWy += dot(ht, dy.T)

    dbf += df
    dbi += di
    dbo += do
    dbc += dc
    dby += dy

    dxi = dot(Wi, di)
    dxf = dot(Wf, df)
    dxo = dot(Wo, do)
    dxc = dot(Wc, dct)

    dx = dxf + dxi + dxo + dxc

    dht1 = dx[n:]
    dct1 = ft * dc

    return loss, dht1, dct1

file = open("trumptweets.txt", 'r', encoding='utf8').read()

text = list(file)

alphabet = list(set(text))

n = (len(alphabet))
d = 100

encode = {ch:i for i,ch in enumerate(alphabet)}
decode = {i:ch for i,ch in enumerate(alphabet)}

inputs = [encode[ch] for ch in text]
outputs = [inputs[i + 1] for i in range(len(inputs)-1)]


RNN = LSTM.RNN(n, d, 100, 0.1)

RNN.train(inputs, outputs)

随着网络训练,损失有所增加。我不确定错误是什么,因为我找不到关于 LSTM 如何工作的任何权威来源。目前我已经实施了 5 个不完整、损坏或不正确的指南。

对于不想通读我的代码的任何人,我遵循以下算法:

  1. 将字符编码为整数
  2. 初始化起始隐藏状态和细胞状态。
  3. 将循环长度的单元格数添加到列表中。
  4. 对于循环长度输入,将每个输入编码为一个热向量。
  5. 将旧隐藏状态与一个热状态连接起来。
  6. 前馈,计算循环长度门和激活。
  7. 保存细胞状态、输出、隐藏状态和概率。
  8. 初始化下一个隐藏状态和单元状态。
  9. 将标签和状态传递给 BPTT
  10. 从末尾开始反向传播每个单元格,累积梯度。
  11. 更新权重。
  12. 将 inputs/outputs 向前移动循环长度步长。

缺少的是正确的更新规则。我主要遵循的指南使用的是 ADAM,因此我从研究论文中实现了它并且它有效。

def train(self, inputs, outputs):

    n, d, z, rl = self.n, self.d, self.n + self.d, self.RL
    index, t, converged = 0, 0, False
    a, b1, b2, e = 0.001, 0.9, 0.999, 1e-8

    mWi, mWf, mWo, mWc, mWy = zeros_like((self.Wi)), zeros_like((self.Wf)), zeros_like((self.Wo)), zeros_like((self.Wc)), zeros_like((self.Wy))
    mbi, mbf, mbo, mbc, mby = zeros_like((self.bi)), zeros_like((self.bf)), zeros_like((self.bo)), zeros_like((self.bc)), zeros_like((self.by))

    vWi, vWf, vWo, vWc, vWy = zeros_like((self.Wi)), zeros_like((self.Wf)), zeros_like((self.Wo)), zeros_like((self.Wc)), zeros_like((self.Wy))
    vbi, vbf, vbo, vbc, vby = zeros_like((self.bi)), zeros_like((self.bf)), zeros_like((self.bo)), zeros_like((self.bc)), zeros_like((self.by))

    ht_, ct_ = zeros((d, 1)), zeros((d, 1))
    ht1, ct1 = zeros((d, 1)), zeros((d, 1))

    while not converged:

        t += 1
        xlist = inputs[index:index + rl]
        ylist = outputs[index:index + rl]
        ht_, ct_ = self.FeedForward(xlist, ht_, ct_)
        loss, ht1, ct1 = self.BPTT(ylist, ht1, ct1)
        print(loss)

        dWi, dWf, dWo, dWc, dWy = self.dWi, self.dWf, self.dWo, self.dWc, self.dWy
        dbi, dbf, dbo, dbc, dby = self.dbi, self.dbf, self.dbo, self.dbc, self.dby

        mWi = b1 * mWi + (1 - b1) * dWi
        mWf = b1 * mWf + (1 - b1) * dWf
        mWo = b1 * mWo + (1 - b1) * dWo
        mWc = b1 * mWc + (1 - b1) * dWc
        mWy = b1 * mWy + (1 - b1) * dWy
        mbi = b1 * mbi + (1 - b1) * dbi
        mbf = b1 * mbf + (1 - b1) * dbf
        mbo = b1 * mbo + (1 - b1) * dbo
        mbc = b1 * mbc + (1 - b1) * dbc
        mby = b1 * mby + (1 - b1) * dby

        vWi = b2 * vWi + (1 - b2) * dWi**2
        vWf = b2 * vWf + (1 - b2) * dWf**2
        vWo = b2 * vWo + (1 - b2) * dWo**2
        vWc = b2 * vWc + (1 - b2) * dWc**2
        vWy = b2 * vWy + (1 - b2) * dWy**2
        vbi = b2 * vbi + (1 - b2) * dbi**2
        vbf = b2 * vbf + (1 - b2) * dbf**2
        vbo = b2 * vbo + (1 - b2) * dbo**2
        vbc = b2 * vbc + (1 - b2) * dbc**2
        vby = b2 * vby + (1 - b2) * dby**2

        mWi_ = mWi / (1 - b1**t)
        mWf_ = mWf / (1 - b1**t)
        mWo_ = mWo / (1 - b1**t)
        mWc_ = mWc / (1 - b1**t)
        mWy_ = mWy / (1 - b1**t)
        mbi_ = mbi / (1 - b1**t)
        mbf_ = mbf / (1 - b1**t)
        mbo_ = mbo / (1 - b1**t)
        mbc_ = mbc / (1 - b1**t)
        mby_ = mby / (1 - b1**t)

        vWi_ = vWi / (1 - b2**t)
        vWf_ = vWf / (1 - b2**t)
        vWo_ = vWo / (1 - b2**t)
        vWc_ = vWc / (1 - b2**t)
        vWy_ = vWy / (1 - b2**t)
        vbi_ = vbi / (1 - b2**t)
        vbf_ = vbf / (1 - b2**t)
        vbo_ = vbo / (1 - b2**t)
        vbc_ = vbc / (1 - b2**t)
        vby_ = vby / (1 - b2**t)

        self.Wi = self.Wi - a * mWi_ / (sqrt(vWi_) + e)
        self.Wf = self.Wf - a * mWf_ / (sqrt(vWf_) + e)
        self.Wo = self.Wo - a * mWo_ / (sqrt(vWo_) + e)
        self.Wc = self.Wc - a * mWc_ / (sqrt(vWc_) + e)
        self.Wy = self.Wy - a * mWy_ / (sqrt(vWy_) + e)
        self.bi = self.bi - a * mbi_ / (sqrt(vbi_) + e)
        self.bf = self.bf - a * mbf_ / (sqrt(vbf_) + e)
        self.bo = self.bo - a * mbo_ / (sqrt(vbo_) + e)
        self.bc = self.bc - a * mbc_ / (sqrt(vbc_) + e)
        self.by = self.by - a * mby_ / (sqrt(vby_) + e)

        self.dWi, self.dWf, self.dWo, self.dWc, self.dWy = zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((z, d)), zeros((d, n))
        self.dbi, self.dbf, self.dbo, self.dbc, self.dby = zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((d, 1)), zeros((n, 1))

        index += rl
        if index >= len(outputs): index = 0