Why does the Viterbi algorithm (POS tagging) always predict one tag?
Here is my HMM model class:
import numpy as np
import pandas as pd
import nltk
from collections import OrderedDict, deque

class HiddenMarkovModel:

    def __init__(self):
        pass

    def fit(self, train_tokens_tags_list):
        """
        train_tokens_tags_list: array of sentences of (word, tag) pairs (for training)
        """
        tags = [tag for sent in train_tokens_tags_list
                for (word, tag) in sent]
        words = [word for sent in train_tokens_tags_list
                 for (word, tag) in sent]

        tag_num = pd.Series(data=nltk.FreqDist(tags)).sort_index()
        word_num = pd.Series(data=nltk.FreqDist(words)).sort_values(ascending=False)

        self.tags = tag_num.index
        self.words = word_num.index

        A = pd.DataFrame({'{}'.format(tag): [0] * len(tag_num) for tag in tag_num.index}, index=tag_num.index)
        B = pd.DataFrame({'{}'.format(tag): [0] * len(word_num) for tag in tag_num.index}, index=word_num.index)

        # Compute matrices A and B from the word and tag counts
        # sent - sentence
        # sent[i][0] - i-th word in this sentence, sent[i][1] - i-th tag in this sentence
        for sent in train_tokens_tags_list:
            for i in range(len(sent)):
                B.loc[sent[i][0], sent[i][1]] += 1  # current i-th (word, tag) pair
                if len(sent) - 1 != i:  # for the last tag there is no next tag
                    A.loc[sent[i][1], sent[i + 1][1]] += 1  # (tag, next tag) pair

        # now convert the counts to probabilities
        # normalize along the row, i.e. over all possible next tags
        A = A.divide(A.sum(axis=1), axis=0)
        # normalize along the column, i.e. over all possible current words
        B = B / np.sum(B, axis=0)

        self.A = A
        self.B = B

        return self

    def predict(self, test_tokens_list):
        """
        test_tokens_list: array of sentences of words (for test)
        """
        predict_tags = OrderedDict({i: np.array([]) for i in range(len(test_tokens_list))})

        for i_sent in range(len(test_tokens_list)):
            current_sent = test_tokens_list[i_sent]  # current sentence
            len_sent = len(current_sent)             # length of the sentence

            q = np.zeros(shape=(len_sent + 1, len(self.tags)))
            q[0] = 1  # zero state (uniform distribution over all s)
            back_point = np.zeros(shape=(len_sent + 1, len(self.tags)))  # argmax

            for t in range(len_sent):
                # if we haven't seen this word during training,
                # we use the most popular word of the most popular tag instead
                if current_sent[t] not in self.words:
                    # print('this word is not in the dictionary', current_sent[t])
                    # findtags and brown_tagged_words are defined elsewhere (NLTK Brown corpus helpers)
                    tagdict = findtags('NOUN', brown_tagged_words)
                    current_sent[t] = tagdict['NOUN'][0][0]

                # using max we choose the next tag
                for i_s in range(len(self.tags)):
                    # i_s - index of the tag
                    s = self.tags[i_s]  # tag
                    # formula (1)
                    probability = q[t][i_s] * self.A.loc[:, s] * self.B.loc[current_sent[t], s]
                    q[t + 1][i_s] = np.max(probability)
                    # argmax of formula (1),
                    # needed to reconstruct the sequence of tags
                    back_point[t + 1][i_s] = (probability).reset_index()[s].idxmax()  # index

            back_point = back_point.astype('int')

            # save the tags and restore their original order
            back_tag = deque()
            current_tag = np.argmax(q[len_sent])
            for t in range(len_sent, 0, -1):
                back_tag.appendleft(self.tags[current_tag])
                current_tag = back_point[t, current_tag]

            predict_tags[i_sent] = np.array(back_tag)

        return predict_tags
Actually, most of this was already written; I only had to fill in a few gaps (most importantly in the Viterbi algorithm, where the q and back_point matrices are computed). But I think I've done something wrong, because the model always predicts something like this:
OrderedDict([(0, array(['.', '.', '.'], dtype='<U1'))])
In the example above I gave it the sentence: [['he','can','stay']]
I've checked: all of these words appear in the training data.
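For reference, this is roughly how I run it (a minimal sketch; the variable name hmm is just for illustration, and train_sents comes from the data-preparation code below):

hmm = HiddenMarkovModel().fit(train_sents)
print(hmm.predict([['he', 'can', 'stay']]))
# OrderedDict([(0, array(['.', '.', '.'], dtype='<U1'))])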
I've printed out the matrices A and B. They look like this:
A: (tag-to-tag transition probabilities; printout omitted)
B: (word-given-tag emission probabilities; printout omitted)
which suggests that the first (training) part of the model works correctly.
These are the formulas I was following (the image is not reproduced here; formula (1), which the code comments refer to, is the usual Viterbi recursion q_{t+1}(s) = max_{s'} q_t(s') * A(s', s) * B(w_t, s), with back_point storing the argmax over s').
Here is the data:
brown_tagged_sents = brown.tagged_sents(tagset="universal", categories='humor')
my_brown_tagged_sents = []
for sent in brown_tagged_sents:
    my_brown_tagged_sents.append(list(map(lambda x: (x[0].lower(), x[1]), sent)))
my_brown_tagged_sents = np.array(my_brown_tagged_sents)
from sklearn.model_selection import train_test_split
train_sents, test_sents = train_test_split(my_brown_tagged_sents, random_state=0, test_size=0.1)
What could be wrong with my model?
The probability should be computed like this:
probability = q[t, :] * self.A.loc[:, s] * self.B.loc[current_sent[t], s]
and not like this:
probability = q[t][i_s] * self.A.loc[:, s] * self.B.loc[current_sent[t], s]
q[t][i_s] is a single number, the score that the current tag s itself had at the previous step, so the product only varies with the previous tag through A, and the max/argmax that follow ignore how likely each previous tag actually was. With q[t, :] you multiply the whole vector of previous-tag scores element-wise with the transition column A[:, s], which is exactly what the Viterbi recursion (formula (1)) requires.
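A minimal sketch of the corrected inner loop, using the same names as in the question (unknown-word handling omitted, and using .values.argmax() instead of the reset_index() trick; the rest of predict stays the same):

for t in range(len_sent):
    word = current_sent[t]
    for i_s in range(len(self.tags)):
        s = self.tags[i_s]  # candidate current tag
        # vector over all previous tags s': q_t(s') * A(s', s) * B(word, s)
        probability = q[t, :] * self.A.loc[:, s] * self.B.loc[word, s]
        q[t + 1][i_s] = np.max(probability)                    # best score ending in tag s
        back_point[t + 1][i_s] = probability.values.argmax()   # index of the best previous tag

With this change q[t + 1] actually depends on which previous tag maximizes the product, so the backtracking step recovers a meaningful tag sequence instead of collapsing onto a single tag.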