tf2.0: Gradient Tape returns None gradient in RNN model

In a model with an Embedding layer and a SimpleRNN layer, I want to compute the partial derivative dh_t/dh_0 for every step t.

My model structure, including imports and data preprocessing, is below.
The toxic comment sequence data is available at: https://www.kaggle.com/c/jigsaw-multilingual-toxic-comment-classification/data?select=jigsaw-toxic-comment-train.csv
The GloVe 6B 100d embeddings are available at: https://nlp.stanford.edu/projects/glove/

### 1. Imports 
from __future__ import print_function
import numpy as np
from numpy import array, asarray, zeros
import pandas as pd 
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras import Input, Model
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization, PReLU
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.preprocessing import sequence, text
from keras import backend as k

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### 2. Text data tokenisation and GloVe-100d embeddings:
def data_pp():
    train= pd.read_csv('/Users/Toxic comment data/jigsaw-toxic-comment-train.csv')
    train.drop(['severe_toxic','obscene','threat','insult','identity_hate'], axis=1, inplace=True)
    train= train.iloc[:12000,:]
    xtr, xte, ytr, yte= train_test_split(train['comment_text'].values, 
                                        train['toxic'].values,
                                        stratify= train['toxic'].values,
                                        random_state= 42, test_size= 0.2, shuffle= True)
    
    # Tokenise data
    tok= text.Tokenizer(num_words= None)
    tok.fit_on_texts(list(xtr)+ list(xte))
    input_dim= len(tok.word_index)+1
    input_length= train['comment_text'].apply(lambda x: len(str(x).split())).max()
    xtr_seq= tok.texts_to_sequences(xtr); xte_seq= tok.texts_to_sequences(xte)
    xtr_pad= sequence.pad_sequences(xtr_seq, maxlen= input_length)
    xte_pad= sequence.pad_sequences(xte_seq, maxlen= input_length)
    print('Shape of tokenised training input:', xtr_pad.shape)
    return xtr_pad, ytr, xte_pad, yte, input_dim, input_length, tok
    
xtr_pad, ytr, xte_pad, yte, input_dim, input_length, tok= data_pp()

# Word embeddings
def embed_mat(input_dim, output_dim, tok):
    '''By default output_dim = 100 for GloVe 100d embeddings'''
    embedding_dict=dict()
    f= open('/Users/GloVe/glove.6B.100d.txt')
    for line in f:
        values= line.split()
        word= values[0]; coefs= asarray(values[1:], dtype= 'float32')
        embedding_dict[word]= coefs
    f.close()
    Emat= zeros((input_dim, output_dim))
    for word, i in tok.word_index.items():
        embedding_vector= embedding_dict.get(word)
        if embedding_vector is not None:
            Emat[i]= embedding_vector
    print('Embedding weight matrix has shape:', Emat.shape)
    return Emat

output_dim = 100
Emat= embed_mat(input_dim, output_dim, tok)

### 3. Define model and compute gradients:
# You can let it run for a few steps and stop the process. Then inspect the first step h_t, h_0 and the computed dh_t/dh_0.
# For the case in my comment, you can remove the for-loop over the steps t, comment out ht, and compute tape.gradient(states, h0) instead.

batch_size = 100
inp= Input(batch_shape= (batch_size, input_length), name= 'input') 
emb_out= Embedding(input_dim, output_dim, input_length= input_length, 
                         weights= [Emat], trainable= False, name= 'embedding')(inp)
rnn= SimpleRNN(200, return_sequences= True, return_state= False, stateful= True, name= 'simpleRNN')

h0 = tf.convert_to_tensor(np.random.uniform(size= (batch_size, 200)).astype(np.float32))
rnn_allstates= rnn(emb_out, initial_state=h0) 
model_rnn = Model(inputs=inp, outputs= rnn_allstates, name= 'model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

ds = tf.data.Dataset.from_tensor_slices((xtr_pad[:100], ytr[:100])).batch(100)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]

grads_allsteps= []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):
        with tf.GradientTape() as tape:
            tape.watch(h0)
            et = embedding_layer(x_batch_train)
            states = rnn_layer(et, initial_state= h0)   # (100, 1403, 200)
            ht = states[:,t,:] 

        grad_t= tape.gradient(ht, h0)  # (100, 200)
        print('Computed gradient dht/dh0 at step ', t+1, 'in batch', b+1)
        grads_allsteps.append(grad_t)
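
The variant mentioned in the comment at the top of this block (no loop over the steps t, one gradient of all the states at once) would look roughly like this; it is only a sketch reusing the names defined above:

with tf.GradientTape() as tape:
    tape.watch(h0)
    et = embedding_layer(x_batch_train)
    states = rnn_layer(et, initial_state= h0)   # (100, 1403, 200)
# Gradient of the (summed) states with respect to h0, without the per-step loop
grad_all = tape.gradient(states, h0)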

At each step t, h_t has shape (100, 200) and h_0 has shape (100, 200). However, tape.gradient(ht, h0) returns None for every t. Below is the result for the first step:

for t in range(1):
    with tf.GradientTape() as tape:
        tape.watch(h0)
        et = embedding_layer(x_batch_train)
        #tape.watch(et)
        states = rnn_layer(et, initial_state= h0)   # (100, 1403, 200)
        ht = states[:,t,:] 
        print(ht)
        print(h0)
    grad_t = tape.gradient(ht, h0)
    tf.print(grad_t)

>>
# h_t:
tf.Tensor(
[[ 0.25634336  0.5259362   0.60045886 ... -0.4978792   0.62755316
   0.09803997]
 [ 0.58387524  0.26037565  0.5646103  ...  0.31233114  0.4853201
   0.10877549]
 [ 0.17190906  0.68681747 -0.32054633 ... -0.6139967   0.48944488
   0.06301598]
 ...
 [ 0.1985917  -0.11821499 -0.47709295 ... -0.05718012  0.16089934
   0.20585683]
 [ 0.73872745  0.503326    0.25224414 ... -0.5771631   0.03748894
   0.09212588]
 [-0.6597108  -0.43926442 -0.23546427 ...  0.26760277  0.28221437
  -0.4039318 ]], shape=(100, 200), dtype=float32)

# h_0:
tf.Tensor(
[[0.51580787 0.51664346 0.70773274 ... 0.45973232 0.7760376  0.48297063]
 [0.61048764 0.26038417 0.60392565 ... 0.7426153  0.15507504 0.57494944]
 [0.11859739 0.33591187 0.68375146 ... 0.59409297 0.5302879  0.28876984]
 ...
 [0.12401487 0.39376178 0.9850304  ... 0.21582918 0.9592233  0.5257605 ]
 [0.9401199  0.2157638  0.6445949  ... 0.36316434 0.5799403  0.3749675 ]
 [0.37230062 0.18162128 0.0739954  ... 0.21624395 0.66291    0.7807376 ]], shape=(100, 200), dtype=float32)

# dh_t/dh_0:
None

GradientTape seems to have trouble watching this h_0 and computing gradients with respect to it. I have successfully used GradientTape to watch the RNN layer's input e_t and computed the gradients dh_t/de_t (roughly as sketched below), but that does not really tell me much about how well the model fits.
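
This is approximately the pattern that does work when watching e_t instead of h_0 (a sketch, using the same layers and batch as above):

for t in range(1):
    with tf.GradientTape() as tape:
        et = embedding_layer(x_batch_train)
        tape.watch(et)                            # watch the RNN input instead of h0
        states = rnn_layer(et, initial_state= h0)
        ht = states[:,t,:]
    grad_t = tape.gradient(ht, et)                # (100, 1403, 100), not None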

How can I use it to watch the initial state h_0 and thereby compute the gradients dh_t/dh_0? Thanks in advance for any help.


Reproducible test case:

### 1. Imports 
from __future__ import print_function
import numpy as np
from numpy import array, asarray, zeros
import pandas as pd 
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras import Input, Model
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization, PReLU
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.preprocessing import sequence, text
from keras import backend as k

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### 2. Simulated data and gradient computation:
batch_size = 100; input_length = 5
xtr_pad = tf.random.uniform((batch_size, input_length), maxval = 500, dtype=tf.int32)
ytr = tf.random.normal((batch_size, input_length, 200))


inp= Input(batch_shape= (batch_size, input_length), name= 'input') 
emb_out= Embedding(500, 100, input_length= input_length, trainable= False, name= 'embedding')(inp)
rnn= SimpleRNN(200, return_sequences= True, return_state= False, stateful= True, name= 'simpleRNN')

h0 = tf.convert_to_tensor(np.random.uniform(size= (batch_size, 200)).astype(np.float32))

rnn_allstates= rnn(emb_out, initial_state=h0) 
model_rnn = Model(inputs=inp, outputs= rnn_allstates, name= 'model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

ds = tf.data.Dataset.from_tensor_slices((xtr_pad, ytr)).batch(100)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]

grads_allsteps= []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):
        with tf.GradientTape() as tape:
            tape.watch(h0)
            states= model_rnn(x_batch_train)
            ht = states[:,t,:] 

        grad_t= tape.gradient(ht, h0)  
        print('Computed gradient dht/dh0 at step ', t+1, 'in batch', b+1)
        grads_allsteps.append(grad_t)
 

Interesting finding: the gradient at the first step is computed and looks fine. The rest are all None (a quick check confirming this follows the output below).

grads_allsteps

>>
[<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
 array([[ 1.2307187 , -1.0343404 ,  0.52859926, ..., -0.09879799,
         -1.1407609 , -0.7241671 ],
        [ 1.142821  , -1.312029  ,  0.37148148, ...,  0.2300478 ,
         -1.1440411 , -0.36673146],
        [ 1.2778691 , -1.2225235 ,  0.69951147, ...,  0.17701946,
         -1.2816343 , -0.52648413],
        ...,
        [ 1.1717036 , -1.2444504 ,  0.5874837 , ..., -0.13161334,
         -1.3752006 , -0.376719  ],
        [ 1.1333262 , -1.0013355 ,  0.3363382 , ..., -0.22350994,
         -1.299541  , -0.5073889 ],
        [ 1.18489   , -0.90809333,  0.55045474, ..., -0.10550319,
         -1.0866506 , -0.58325446]], dtype=float32)>, None, None, None, None]
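
A quick check of which per-step gradients were actually computed (a small verification snippet, not part of the original code):

print([g is not None for g in grads_allsteps])
# >> [True, False, False, False, False]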

You can try using tf.gradients instead. It is also preferable to use a tf.Variable for h0:

# Your imports
#-------
### 2. Simulated data and gradient computation:
batch_size = 100; input_length = 5
xtr_pad = tf.random.uniform((batch_size, input_length), maxval = 500, dtype=tf.int32)
ytr = tf.random.normal((batch_size, input_length, 200))


inp= Input(batch_shape= (batch_size, input_length), name= 'input') 
emb_out= Embedding(500, 100, input_length= input_length, trainable= False, name= 'embedding')(inp)
rnn= SimpleRNN(200, return_sequences= True, return_state= False, stateful= True, name= 'simpleRNN')

h0 = tf.Variable(tf.random.uniform((batch_size, 200)))

rnn_allstates= rnn(emb_out, initial_state=h0) 
model_rnn = Model(inputs=inp, outputs= rnn_allstates, name= 'model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

ds = tf.data.Dataset.from_tensor_slices((xtr_pad, ytr)).batch(100)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]


@tf.function
def calculate_t_gradients(t, x, h0):
  return tf.gradients(model_rnn(x)[:,t,:], h0)

grads_allsteps= []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):  
      grads_allsteps.append(calculate_t_gradients(t, x_batch_train, h0))
 
print(grads_allsteps) 
[[<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 1.2034059 , -0.46448404,  0.6272926 , ..., -0.40906236,
         0.07618493,  0.6338958 ],
       [ 1.2781916 , -0.20411322,  0.6174417 , ..., -0.31636393,
        -0.23417974,  0.67499626],
       [ 1.113218  , -0.65086263,  0.63425934, ..., -0.66614366,
        -0.07726163,  0.53647137],
       ...,
       [ 1.3399608 , -0.54088974,  0.6213518 , ...,  0.00831087,
        -0.14397278,  0.2614633 ],
       [ 1.213171  , -0.42787278,  0.60535026, ..., -0.56198204,
        -0.09142771,  0.6212783 ],
       [ 1.1901733 , -0.5743524 ,  0.36872283, ..., -0.42522985,
        -0.0861398 ,  0.495057  ]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 0.3487598 ,  1.2738569 , -0.48500937, ...,  0.6011117 ,
        -0.20381093,  0.45596513],
       [ 0.37931004,  1.2778724 , -0.8682532 , ...,  0.8170228 ,
         0.1456329 ,  0.23715591],
       [ 0.5984771 ,  0.92434835, -0.8879645 , ...,  0.38756457,
        -0.17436962,  0.47174054],
       ...,
       [ 0.61081064,  0.99631476, -0.5104377 , ...,  0.5042721 ,
         0.02844866,  0.34626445],
       [ 0.7126102 ,  1.0205276 , -0.60710275, ...,  0.49418694,
        -0.16092762,  0.41363668],
       [ 0.8581749 ,  1.1259711 , -0.5824491 , ...,  0.45388597,
        -0.16205123,  0.72434616]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 3.8507193e-01,  1.2925258e+00,  1.2027258e+00, ...,
         3.2430276e-01,  2.2319333e-01, -2.5218868e-01],
       [ 5.9262186e-01,  1.4497797e+00,  1.2479483e+00, ...,
         4.6175608e-01,  2.5466472e-01, -2.4279505e-01],
       [ 2.5734475e-01,  1.4562432e+00,  1.1020679e+00, ...,
         6.6081107e-01,  1.9841105e-01, -2.5595558e-01],
       ...,
       [ 5.1541841e-01,  1.6206543e+00,  9.6205616e-01, ...,
         7.2725344e-01,  2.5501373e-01, -7.7709556e-04],
       [ 4.4518453e-01,  1.6381552e+00,  1.0112666e+00, ...,
         5.5238277e-01,  2.4137528e-01, -2.6242572e-01],
       [ 6.6721851e-01,  1.5826726e+00,  1.1282607e+00, ...,
         3.2301426e-01,  2.2295776e-01,  1.1724380e-01]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 0.14262576,  0.578709  ,  0.1149607 , ...,  0.1229499 ,
        -0.42344815,  0.8837458 ],
       [-0.09711604,  0.04376438, -0.11737494, ...,  0.00389774,
         0.01737173,  0.17246482],
       [ 0.24414796,  0.30101255, -0.12234146, ..., -0.04850931,
        -0.31790918,  0.21326394],
       ...,
       [-0.20562285,  0.21999156,  0.02703794, ..., -0.03547464,
        -0.59052145,  0.04695258],
       [ 0.2087476 ,  0.46558812, -0.18172565, ..., -0.01167884,
        -0.20868361,  0.09055485],
       [-0.22442941,  0.16119067,  0.10854454, ...,  0.14752978,
        -0.32307786,  0.343314  ]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[-1.1414615 ,  0.37376842, -1.0230722 , ...,  0.60619426,
         0.22550163, -0.6948315 ],
       [-1.0124328 ,  0.27892357, -0.96915233, ...,  0.7048603 ,
        -0.15284726, -0.6734605 ],
       [-0.8542529 ,  0.25970122, -0.90076745, ...,  0.8825682 ,
        -0.02474228, -0.55014515],
       ...,
       [-0.89430666,  0.68327624, -1.0109956 , ...,  0.31722566,
        -0.23703958, -0.6766514 ],
       [-0.8633691 ,  0.28742114, -0.9896866 , ...,  0.98315084,
         0.0115847 , -0.55474746],
       [-0.7229766 ,  0.62417865, -1.2342371 , ...,  0.85149145,
        -0.04468453, -0.60606724]], dtype=float32)>]]

You need to make sure that the stateful argument of SimpleRNN is False, because according to the docs:

If True, the last state for each sample at index i in a batch will be used as initial state for the sample of index i in the following batch.

So if you set stateful to False, your code will also compute the gradient at each time step.
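
Concretely, that is a one-flag change to the layer definition from the snippet above (a sketch; everything else stays the same):

# stateful=False (the Keras default): each call uses the initial_state that is passed in,
# so h0 stays connected to the computation for every batch and time step.
rnn= SimpleRNN(200, return_sequences= True, return_state= False, stateful= False, name= 'simpleRNN')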

Found a way to run the gradient tape repeatedly: calling del tape after saving the per-step statistics to a list relieves the load on the GPU.

emb_layer= model_rnn.layers[1]; rnn_layer= model_rnn.layers[2]
n_steps = 40  

dhtdh0_rnn= []
for t in range(n_steps):
    with tf.GradientTape() as tape:
        tape.watch(h0)
        et= emb_layer(xtr_pad[:100])
        ht_all= rnn_layer(et, initial_state= [h0]) 
        ht= ht_all[:,t,:]
        dhtdh0_t= tape.gradient(ht, h0)
        grad_agg= tf.reduce_mean(abs(dhtdh0_t), [0,1])
        print('step', t+1, 'done')
        dhtdh0_rnn.append(np.log(grad_agg))
        del tape
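
To actually look at how the gradient decays or grows over the steps, the collected values can then be plotted (a small follow-up sketch; matplotlib is imported above as plt):

# Plot the per-step aggregated log-gradient magnitude, e.g. to spot vanishing gradients
plt.plot(range(1, n_steps+1), dhtdh0_rnn)
plt.xlabel('step t')
plt.ylabel('log mean |dh_t/dh_0|')
plt.show()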