奇异值分解(SVD)的结果每次运行都在变化
singular value decomposition changing results
我正在尝试使用 svds 做文本摘要,但每次运行该函数时,摘要结果都会变化。有人能告诉我原因以及解决方法吗?
我还分别检查了数组 u、s 和 v,它们在每次运行之后也都在变化。如何让它们保持不变?
句子矩阵的计算代码如下,其后是调用 svds 的代码。数据集是澳大利亚最高法院的一些法律文件。
def _compute_matrix(sentences, weighting, norm):
if weighting.lower() == 'binary':
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1),
binary=True, stop_words=None)
elif weighting.lower() == 'frequency':
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1),
binary=False, stop_words=None)
elif weighting.lower() == 'tfidf':
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1),
stop_words=None)
else:
raise ValueError('Parameter "method" must take one of the values
"binary", "frequency" or "tfidf".')
# Extract word features from sentences using sparse vectorizer
frequency_matrix = vectorizer.fit_transform(sentences).astype(float)
terms = vectorizer.get_feature_names()
if norm in ('l1', 'l2'):
frequency_matrix = normalize(frequency_matrix, norm=norm, axis=1)
elif norm is not None:
raise ValueError('Parameter "norm" can only take values "l1", "l2"
or None')
return frequency_matrix, terms
# Split the raw document into sentences and build an L2-normalized TF-IDF
# matrix from them.
processed_sentences = _createsentences(raw_content)
sentence_matrix, feature_names = _compute_matrix(
    processed_sentences, weighting='tfidf', norm='l2')

# Transpose so that rows are terms and columns are sentences, then keep only
# the strictly positive entries.
sentence_matrix = sentence_matrix.transpose()
sentence_matrix = sentence_matrix.multiply(sentence_matrix > 0)
print(sentence_matrix.shape)

# Truncated SVD keeping 20 singular triplets.
# NOTE: without an explicit ``v0`` the solver starts from a random vector,
# so u, s and v can differ from run to run.
u, s, v = svds(sentence_matrix, k=20)

# Fraction of the largest singular value below which a topic is discarded.
topic_sigma_threshold = 0.5
if not 0 <= topic_sigma_threshold <= 1:
    raise ValueError('Parameter topic_sigma_threshold must take a value '
                     'between 0 and 1')

# Within each topic (row of v), zero every sentence weight that does not
# exceed that topic's mean weight.
topic_averages = v.mean(axis=1, keepdims=True)
v[v <= topic_averages] = 0

# Drop topics whose singular value is small relative to the largest one.
sigma_threshold = s.max() * topic_sigma_threshold
s[s < sigma_threshold] = 0

# Per-sentence saliency: squared sentence weights summed over topics, each
# topic weighted by its squared singular value.
saliency_vec = np.dot(np.square(s), np.square(v))

# Indices of the 25 most salient sentences, restored to document order.
top_sentences = np.sort(saliency_vec.argsort()[-25:])

# Bare expression: the selected sentences are only displayed when this is
# run in an interactive session / notebook.
[processed_sentences[i] for i in top_sentences]
我通过尝试 svds 的各个参数并阅读其源代码找到了解决方案。svds 默认使用一个随机初始向量,其长度 N 取自稀疏矩阵的维度(即 min(A.shape))。因此,要让初始向量固定不变,必须通过 v0 参数显式传入,代码如下。
# svds starts its iteration from a random vector unless ``v0`` is supplied.
# Draw a reproducible starting vector from a dedicated fixed-seed generator
# so NumPy's global random state is left untouched.
rng = np.random.RandomState(0)
v0 = rng.rand(min(sentence_matrix.shape))
u, s, v = svds(sentence_matrix, k=20, v0=v0)
我正在尝试使用 svds 做文本摘要,但每次运行该函数时,摘要结果都会变化。有人能告诉我原因以及解决方法吗? 我还分别检查了数组 u、s 和 v,它们在每次运行之后也都在变化。如何让它们保持不变? 句子矩阵的计算代码如下,其后是调用 svds 的代码。数据集是澳大利亚最高法院的一些法律文件。
def _compute_matrix(sentences, weighting, norm):
if weighting.lower() == 'binary':
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1),
binary=True, stop_words=None)
elif weighting.lower() == 'frequency':
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1),
binary=False, stop_words=None)
elif weighting.lower() == 'tfidf':
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1),
stop_words=None)
else:
raise ValueError('Parameter "method" must take one of the values
"binary", "frequency" or "tfidf".')
# Extract word features from sentences using sparse vectorizer
frequency_matrix = vectorizer.fit_transform(sentences).astype(float)
terms = vectorizer.get_feature_names()
if norm in ('l1', 'l2'):
frequency_matrix = normalize(frequency_matrix, norm=norm, axis=1)
elif norm is not None:
raise ValueError('Parameter "norm" can only take values "l1", "l2"
or None')
return frequency_matrix, terms
# Split the raw document into sentences and build an L2-normalized TF-IDF
# matrix from them.
processed_sentences = _createsentences(raw_content)
sentence_matrix, feature_names = _compute_matrix(
    processed_sentences, weighting='tfidf', norm='l2')

# Transpose so that rows are terms and columns are sentences, then keep only
# the strictly positive entries.
sentence_matrix = sentence_matrix.transpose()
sentence_matrix = sentence_matrix.multiply(sentence_matrix > 0)
print(sentence_matrix.shape)

# Truncated SVD keeping 20 singular triplets.
# NOTE: without an explicit ``v0`` the solver starts from a random vector,
# so u, s and v can differ from run to run.
u, s, v = svds(sentence_matrix, k=20)

# Fraction of the largest singular value below which a topic is discarded.
topic_sigma_threshold = 0.5
if not 0 <= topic_sigma_threshold <= 1:
    raise ValueError('Parameter topic_sigma_threshold must take a value '
                     'between 0 and 1')

# Within each topic (row of v), zero every sentence weight that does not
# exceed that topic's mean weight.
topic_averages = v.mean(axis=1, keepdims=True)
v[v <= topic_averages] = 0

# Drop topics whose singular value is small relative to the largest one.
sigma_threshold = s.max() * topic_sigma_threshold
s[s < sigma_threshold] = 0

# Per-sentence saliency: squared sentence weights summed over topics, each
# topic weighted by its squared singular value.
saliency_vec = np.dot(np.square(s), np.square(v))

# Indices of the 25 most salient sentences, restored to document order.
top_sentences = np.sort(saliency_vec.argsort()[-25:])

# Bare expression: the selected sentences are only displayed when this is
# run in an interactive session / notebook.
[processed_sentences[i] for i in top_sentences]
我通过尝试 svds 的各个参数并阅读其源代码找到了解决方案。svds 默认使用一个随机初始向量,其长度 N 取自稀疏矩阵的维度(即 min(A.shape))。因此,要让初始向量固定不变,必须通过 v0 参数显式传入,代码如下。
# svds starts its iteration from a random vector unless ``v0`` is supplied.
# Draw a reproducible starting vector from a dedicated fixed-seed generator
# so NumPy's global random state is left untouched.
rng = np.random.RandomState(0)
v0 = rng.rand(min(sentence_matrix.shape))
u, s, v = svds(sentence_matrix, k=20, v0=v0)