tf.nn.softmax behaving strangely
I am learning LSTMs and TensorFlow with enable_eager_execution. While implementing the LSTM, however, I noticed a behaviour of tf.nn.softmax that has me stuck. Here is the relevant part of my code:
class RNN_LSTM(object):
    def __init__(self,hidden_size):
        data=open('Shakespear.txt', 'r').read()
        self.data = data.split()
        vocab_size=len(list(set(self.data)))
        self.words =list(set(self.data))
        self.hidden_size=hidden_size
        self.input_size=vocab_size+hidden_size
        self.vocab_size=vocab_size
        self.W1=tf.Variable(tf.random.uniform((self.hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W1")*0.1)
        self.b1=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b1"))
        self.W2=tf.Variable(tf.random.uniform((self.hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W2")*0.1)
        self.b2=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b2")*0.1)
        self.W3=tf.Variable(tf.random.uniform((self.hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W3")*0.1)
        self.b3=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b3")*0.1)
        self.W4=tf.Variable(tf.random.uniform((hidden_size,self.input_size),dtype=tf.dtypes.float32,name="W4")*0.1)
        self.b4=tf.Variable(tf.random.uniform((self.hidden_size,1),dtype=tf.dtypes.float32,name="b4")*0.1)
        self.W5=tf.Variable(tf.random.uniform((self.vocab_size,self.hidden_size),dtype=tf.dtypes.float32,name="W5")*0.1)
        self.b5=tf.Variable(tf.random.uniform((self.vocab_size,1),dtype=tf.dtypes.float32,name="b5")*0.1)
        self.learning_rate=1e-1
        self.sequence_length=50
        #self.M_c=tf.Variable(tf.zeros((self.input_size,1)),name="M_c")

    def one_hot_encoding(self,x,hprev):
        M_c=tf.Variable(tf.zeros((self.input_size,1)),name="M_c")
        vocab=tf.Variable(tf.zeros((self.vocab_size,1)))
        #hprev=tf.Variable(tf.zeros((self.hidden_size,1)))
        vocab=vocab.numpy()
        vocab[x]=1
        M_c=tf.concat((hprev,vocab),axis=0)
        return M_c

    def feedforward(self,M_c,p_s):
        ft=tf.sigmoid(tf.matmul(self.W1,M_c)+self.b1)
        it=tf.sigmoid(tf.matmul(self.W2,M_c)+self.b2)
        gt=tf.math.tanh(tf.matmul(self.W3,M_c)+self.b3)
        cs=tf.multiply(ft,p_s)+tf.multiply(it,gt)
        ot=tf.nn.sigmoid(tf.matmul(self.W4,M_c)+self.b4)
        ht=tf.multiply(ot,tf.math.tanh(cs))
        output=self.softmax(tf.matmul(self.W5,ht)+self.b5)
        return ht,output,cs

    def sample_text(self,hprev,begin,p_s,n):
        vocab=tf.Variable(tf.zeros((self.vocab_size,1)),tf.float32)
        vocab=vocab.numpy()
        vocab[begin]=1
        letters=[]
        for i in range(n):
            M=tf.Variable(tf.zeros((self.input_size,1)),name="M")
            M=tf.assign(M,tf.concat((hprev,vocab),axis=0))
            ft=tf.nn.sigmoid(tf.matmul(self.W1,M)+self.b1)
            it=tf.nn.sigmoid(tf.matmul(self.W2,M)+self.b2)
            gt=tf.math.tanh(tf.matmul(self.W3,M)+self.b3)
            cs=tf.multiply(ft,p_s)+tf.multiply(it,gt)
            p_s=cs
            ot=tf.sigmoid(tf.matmul(self.W4,M)+self.b4)
            ht=tf.multiply(ot,tf.math.tanh(cs))
            ht=tf.reshape(ht,(self.hidden_size,1))
            output=tf.matmul(self.W5,ht)+self.b5
            p=self.softmax(output)
            #print(p.numpy())
            p=tf.reshape(p,(1,self.vocab_size))
            samples = tf.random.categorical(p,1)
            sample_selected=tf.cast(samples[0][0].numpy(),tf.int32)
            selection_sample_np=[i for i in range(self.vocab_size)]
            selection_sample_tf=tf.convert_to_tensor(selection_sample_np)
            selected_next_letter=selection_sample_tf[sample_selected]
            trial=tf.cast(selected_next_letter,tf.int32)
            k=tf.Variable(tf.zeros((self.vocab_size,1)),tf.int32)
            k[selected_next_letter,0].assign(1)
            letters.append(selected_next_letter)
            hprev=ht
        return letters

    def process_input(self):
        char_to_ix={ch:ix for ix,ch in enumerate(self.words)}
        ix_to_char={ix:ch for ix,ch in enumerate(self.words)}
        return char_to_ix,ix_to_char

    def softmax(self,z):
        return tf.math.exp(z-max(z))/tf.math.reduce_sum(tf.math.exp(z-max(z)))

    def AggregatorNew(self):
        losses,iterations=[],[]
        char_to_ix,ix_to_char=self.process_input()
        mem1=tf.Variable(tf.zeros_like(self.W1))
        mem2=tf.Variable(tf.zeros_like(self.W2))
        mem3=tf.Variable(tf.zeros_like(self.W3))
        mem4=tf.Variable(tf.zeros_like(self.W4))
        mem5=tf.Variable(tf.zeros_like(self.W5))
        mem6=tf.Variable(tf.zeros_like(self.b1))
        mem7=tf.Variable(tf.zeros_like(self.b2))
        mem8=tf.Variable(tf.zeros_like(self.b3))
        mem9=tf.Variable(tf.zeros_like(self.b4))
        mem10=tf.Variable(tf.zeros_like(self.b5))
        dW1=tf.Variable(tf.zeros_like(self.W1))
        dW2=tf.Variable(tf.zeros_like(self.W2))
        dW3=tf.Variable(tf.zeros_like(self.W3))
        dW4=tf.Variable(tf.zeros_like(self.W4))
        dW5=tf.Variable(tf.zeros_like(self.W4))
        db1=tf.Variable(tf.zeros_like(self.b1))
        db2=tf.Variable(tf.zeros_like(self.b2))
        db3=tf.Variable(tf.zeros_like(self.b3))
        db4=tf.Variable(tf.zeros_like(self.b4))
        db5=tf.Variable(tf.zeros_like(self.b5))
        n=0
        p=0
        self.loss=tf.Variable(0,dtype=tf.dtypes.float32,name="loss")
        smooth_loss =-tf.math.log(1.0/self.vocab_size)*self.sequence_length
        while(1):
            try:
                with DelayedKeyboardInterrupt():
                    if p+self.sequence_length+1>= len(self.data) or n == 0:
                        hprev=tf.Variable(np.zeros((self.hidden_size,1)),dtype=tf.float32,name="hprev")
                        p_s=tf.Variable(tf.zeros((self.hidden_size,1)),name="p_s")
                        p=0
                    inputs=[char_to_ix[ch] for ch in self.data[p:p+self.sequence_length]]
                    targets=[char_to_ix[ch] for ch in self.data[p+1:p+self.sequence_length+1]]
                    sample_ix = self.sample_text(hprev,inputs[0],p_s,200)
                    list_of_strings=[ix_to_char[ix.numpy()] for ix in sample_ix]
                    list_of_strings_tf=tf.convert_to_tensor(list_of_strings)
                    txt = tf.strings.join(list_of_strings_tf,separator=" ")
                    print ('----\n %s \n----' % (txt.numpy(), ))
                    #loss=tf.reduce_mean(xentropy,name="loss")
                    with tf.GradientTape() as g:
                        for x, y in zip(inputs,targets):
                            M_c=self.one_hot_encoding(x,hprev)
                            hprev,output,p_s=self.feedforward(M_c,p_s)
                            activation=output[y]
                            loss=-(tf.math.log(activation))
                    dW1,dW2,dW3,dW4,dW5,db1,db2,db3,db4,db5=g.gradient(loss,[self.W1,self.W2,self.W3,self.W4,self.W5,self.b1,self.b2,self.b3,self.b4,self.b5])
                    smooth_loss = smooth_loss * 0.999 + loss * 0.001
            except KeyboardInterrupt:
                sample_ix = self.sample_text(hprev,inputs[0],p_s,200)
                txt = ''.join(ix_to_char[ix] for ix in sample_ix)
                print ('----\n %s \n----' % (txt, ))
                break
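For reference, a minimal sketch of how a class like this might be instantiated and run (assumptions: TensorFlow 1.x with eager execution, a Shakespear.txt file in the working directory, and the user-defined DelayedKeyboardInterrupt context manager used inside AggregatorNew, which is not shown above):

import numpy as np
import tensorflow as tf

tf.enable_eager_execution()          # the code above relies on eager mode (TF 1.x API)

model = RNN_LSTM(hidden_size=100)    # hidden_size=100 is an arbitrary illustrative value
model.AggregatorNew()                # training loop; interrupt with Ctrl+C to sample and stop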
When I use self.softmax(), the feedforward output contains probability values, but when I use tf.nn.softmax(), strangely all of the output values are 1.
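A minimal sketch that reproduces the observation, with made-up logits standing in for tf.matmul(self.W5, ht) + self.b5:

import tensorflow as tf
tf.enable_eager_execution()

logits = tf.constant([[2.0], [1.0], [0.1]])    # shape (vocab_size, 1), a column vector like W5.ht + b5

# manual softmax over the whole column, same idea as self.softmax above
z = logits - tf.reduce_max(logits)
manual = tf.math.exp(z) / tf.reduce_sum(tf.math.exp(z))
print(manual.numpy())                          # a proper probability distribution, sums to 1

# tf.nn.softmax with its default axis
print(tf.nn.softmax(logits).numpy())           # every entry is 1.0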
Second question: is TensorFlow generally slower on the CPU than a pure Python implementation, or have I implemented TensorFlow incorrectly?
If you use tf.nn.softmax() without specifying the axis, it normalizes over the last axis by default (for a 2-D tensor, axis=1). Because the logits here are a column vector of shape (vocab_size, 1), each softmax is taken over a single element, so the output tensor is all 1s. In my case I was getting the wrong values simply because I had not supplied the axis; the fix is to normalize over the vocabulary dimension, i.e. tf.nn.softmax(logits, axis=0).
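In the question's feedforward, the drop-in fix would look roughly like this (a sketch that keeps the (vocab_size, 1) column-vector shape of the original logits):

# normalise over axis 0, i.e. over the vocabulary dimension of the (vocab_size, 1) logits
output = tf.nn.softmax(tf.matmul(self.W5, ht) + self.b5, axis=0)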