tf.nn.softmax behaving strangely

I am learning LSTMs and TensorFlow with enable_eager_execution. While implementing the LSTM, however, I noticed a behaviour of tf.nn.softmax that got me stuck. Here is part of my code:

import numpy as np
import tensorflow as tf

tf.enable_eager_execution()

class RNN_LSTM(object):
    def __init__(self,hidden_size):
        data=open('Shakespear.txt', 'r').read() 
        self.data = data.split()
        vocab_size=len(list(set(self.data)))
        self.words =list(set(self.data))
        self.hidden_size=hidden_size
        self.input_size=vocab_size+hidden_size
        self.vocab_size=vocab_size
        self.W1=tf.Variable(tf.random.uniform((self.hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W1")*0.1)
        self.b1=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b1"))
        self.W2=tf.Variable(tf.random.uniform((self.hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W2")*0.1)
        self.b2=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b2")*0.1)  
        self.W3=tf.Variable(tf.random.uniform((self.hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W3")*0.1)
        self.b3=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b3")*0.1)
        self.W4=tf.Variable(tf.random.uniform((hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W4")*0.1)
        self.b4=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b4")*0.1)
        self.W5=tf.Variable(tf.random.uniform((self.vocab_size,self.hidden_size),dtype=tf.dtypes.float32,name="W5")*0.1)
        self.b5=tf.Variable(tf.random.uniform((self.vocab_size,1),dtype=tf.dtypes.float32,name="b5")*0.1)

        self.learning_rate=1e-1
        self.sequence_length=50


        #self.M_c=tf.Variable(tf.zeros((self.input_size,1)),name="M_c")
    def one_hot_encoding(self,x,hprev):
        M_c=tf.Variable(tf.zeros((self.input_size,1)),name="M_c")
        vocab=tf.Variable(tf.zeros((self.vocab_size,1)))
        #hprev=tf.Variable(tf.zeros((self.hidden_size,1)))
        vocab=vocab.numpy()
        vocab[x]=1
        M_c=tf.concat((hprev,vocab),axis=0)
        return M_c
    def feedforward(self,M_c,p_s):
        # one LSTM step: forget/input/candidate/output gates, cell state, hidden state, softmax over vocab
        ft=tf.sigmoid(tf.matmul(self.W1,M_c)+self.b1)
        it=tf.sigmoid(tf.matmul(self.W2,M_c)+self.b2)
        gt=tf.math.tanh(tf.matmul(self.W3,M_c)+self.b3)
        cs=tf.multiply(ft,p_s)+tf.multiply(it,gt)
        ot=tf.nn.sigmoid(tf.matmul(self.W4,M_c)+self.b4)
        ht=tf.multiply(ot,tf.math.tanh(cs))
        output=self.softmax(tf.matmul(self.W5,ht)+self.b5)
        return ht,output,cs
    def sample_text(self,hprev,begin,p_s,n):
        vocab=tf.Variable(tf.zeros((self.vocab_size,1)),tf.float32)
        vocab=vocab.numpy()
        vocab[begin]=1
        letters=[]
        for i in range(n):
            M=tf.Variable(tf.zeros((self.input_size,1)),name="M")
            M=tf.assign(M,tf.concat((hprev,vocab),axis=0))
            ft=tf.nn.sigmoid(tf.matmul(self.W1,M)+self.b1)
            it=tf.nn.sigmoid(tf.matmul(self.W2,M)+self.b2)
            gt=tf.math.tanh(tf.matmul(self.W3,M)+self.b3)
            cs=tf.multiply(ft,p_s)+tf.multiply(it,gt)
            p_s=cs
            ot=tf.sigmoid(tf.matmul(self.W4,M)+self.b4)
            ht=tf.multiply(ot,tf.math.tanh(cs))
            ht=tf.reshape(ht,(self.hidden_size,1))
            output=tf.matmul(self.W5,ht)+self.b5
            p=self.softmax(output)
            #print(p.numpy())
            p=tf.reshape(p,(1,self.vocab_size))
            samples = tf.random.categorical(p,1)
            sample_selected=tf.cast(samples[0][0].numpy(),tf.int32)
            selection_sample_np=[i for i in range(self.vocab_size)]
            selection_sample_tf=tf.convert_to_tensor(selection_sample_np)
            selected_next_letter=selection_sample_tf[sample_selected]
            trial=tf.cast(selected_next_letter,tf.int32)
            k=tf.Variable(tf.zeros((self.vocab_size,1)),tf.int32)
            k[selected_next_letter,0].assign(1)
            letters.append(selected_next_letter)
            hprev=ht
        return letters 
    def process_input(self):
        char_to_ix={ch:ix for ix,ch in enumerate(self.words)}
        ix_to_char={ix:ch for ix,ch in enumerate(self.words)}
        return char_to_ix,ix_to_char
    def softmax(self,z):
        # numerically stable softmax over the whole (vocab_size,1) column
        return tf.math.exp(z-tf.math.reduce_max(z))/tf.math.reduce_sum(tf.math.exp(z-tf.math.reduce_max(z)))

    def AggregatorNew(self):
        losses,iterations=[],[]
        char_to_ix,ix_to_char=self.process_input()
        mem1=tf.Variable(tf.zeros_like(self.W1))
        mem2=tf.Variable(tf.zeros_like(self.W2))
        mem3=tf.Variable(tf.zeros_like(self.W3))
        mem4=tf.Variable(tf.zeros_like(self.W4))
        mem5=tf.Variable(tf.zeros_like(self.W5))
        mem6=tf.Variable(tf.zeros_like(self.b1))
        mem7=tf.Variable(tf.zeros_like(self.b2))
        mem8=tf.Variable(tf.zeros_like(self.b3))
        mem9=tf.Variable(tf.zeros_like(self.b4))
        mem10=tf.Variable(tf.zeros_like(self.b5))       
        dW1=tf.Variable(tf.zeros_like(self.W1))
        dW2=tf.Variable(tf.zeros_like(self.W2))
        dW3=tf.Variable(tf.zeros_like(self.W3))
        dW4=tf.Variable(tf.zeros_like(self.W4))
        dW5=tf.Variable(tf.zeros_like(self.W5))
        db1=tf.Variable(tf.zeros_like(self.b1))
        db2=tf.Variable(tf.zeros_like(self.b2))
        db3=tf.Variable(tf.zeros_like(self.b3))
        db4=tf.Variable(tf.zeros_like(self.b4))
        db5=tf.Variable(tf.zeros_like(self.b5))
        n=0 
        p=0
        self.loss=tf.Variable(0,dtype=tf.dtypes.float32,name="loss")
        smooth_loss =-tf.math.log(1.0/self.vocab_size)*self.sequence_length
        while(1):
            try:
                with DelayedKeyboardInterrupt():  # helper context manager defined elsewhere in my code (not shown here)
                    if p+self.sequence_length+1>= len(self.data) or n == 0: 
                        hprev=tf.Variable(np.zeros((self.hidden_size,1)),dtype=tf.float32,name="hprev")
                        p_s=tf.Variable(tf.zeros((self.hidden_size,1)),name="p_s")
                        p=0
                    inputs=[char_to_ix[ch] for ch in self.data[p:p+self.sequence_length]]
                    targets=[char_to_ix[ch] for ch in self.data[p+1:p+self.sequence_length+1]]
                    sample_ix = self.sample_text(hprev,inputs[0],p_s,200)
                    list_of_strings=[ix_to_char[ix.numpy()] for ix in sample_ix]
                    list_of_strings_tf=tf.convert_to_tensor(list_of_strings)
                    txt = tf.strings.join(list_of_strings_tf,separator=" ")
                    print ('----\n %s \n----' % (txt.numpy(),  ))

                    #loss=tf.reduce_mean(xentropy,name="loss")
                    with tf.GradientTape() as g:
                        for x, y in zip(inputs,targets):
                            M_c=self.one_hot_encoding(x,hprev) 
                            hprev,output,p_s=self.feedforward(M_c,p_s)
                            activation=output[y]
                            loss=-(tf.math.log(activation))
                    dW1,dW2,dW3,dW4,dW5,db1,db2,db3,db4,db5=g.gradient(loss,[self.W1,self.W2,self.W3,self.W4,self.W5,self.b1,self.b2,self.b3,self.b4,self.b5])
                    smooth_loss = smooth_loss * 0.999 + loss * 0.001

            except KeyboardInterrupt:
                    sample_ix = self.sample_text(hprev,inputs[0],p_s,200)
                    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
                    print ('----\n %s \n----' % (txt,  ))                    
                    break

When I use self.softmax(), it gives probability values in the output of feedforward, but when I use tf.nn.softmax(), all of the output values are strangely 1.
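
A minimal sketch of the behaviour I am describing, using made-up logits of shape (3, 1) to stand in for the (vocab_size, 1) output of feedforward:

import tensorflow as tf
tf.enable_eager_execution()

logits = tf.constant([[2.0], [1.0], [0.1]])   # column vector, like (vocab_size, 1)
print(tf.nn.softmax(logits).numpy())          # [[1.], [1.], [1.]] -- every value is 1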

Second question: is TensorFlow generally slower on the CPU than a pure Python implementation, or am I implementing TensorFlow incorrectly?

If you use tf.nn.softmax() without specifying an axis, it defaults to the last axis (axis=-1, which for a 2-D tensor is the same as axis=1). Because the logits here have shape (vocab_size, 1), the softmax is computed over that last axis of length 1, so every entry normalizes to 1. In my case I was getting the wrong values simply because I had not passed the axis; the fix is tf.nn.softmax(logits, axis=0).
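
A short sketch of the fix, again with made-up column-shaped logits standing in for the (vocab_size, 1) output:

import tensorflow as tf
tf.enable_eager_execution()

logits = tf.constant([[2.0], [1.0], [0.1]])
print(tf.nn.softmax(logits, axis=0).numpy())              # normalizes down the column: ~[0.66, 0.24, 0.10]
print(tf.nn.softmax(tf.reshape(logits, (1, -1))).numpy()) # same probabilities after reshaping to a row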