Levenshtein 距离的单词同义词
Synonyms for words by Levenshtein Distance
这是我的代码:
public void SearchWordSynonymsByLevenstein()
{
foreach (var eachWord in wordCounter)
{
foreach (var eachSecondWord in wordCounter)
{
if (eachWord.Key.Length > 3)
{
var score = LevenshteinDistance.Compute(eachWord.Key, eachSecondWord.Key);
if (score < 2)
{
if(!wordSynonymsByLevenstein.Any(x => x.Value.ContainsKey(eachSecondWord.Key)))
{
if (!wordSynonymsByLevenstein.ContainsKey(eachWord.Key))
{
wordSynonymsByLevenstein.Add(eachWord.Key, new Dictionary<string, int> { { eachSecondWord.Key, eachSecondWord.Value } });
}
else
{
wordSynonymsByLevenstein[eachWord.Key].Add(eachSecondWord.Key, eachSecondWord.Value);
}
}
}
}
}
}
}
我的 wordCounter
是 Dictionary<string, int>
其中键是我的每个单词,值是计算文档中这个单词的数量。像词袋之类的东西。我必须从其他 eachSecondWord
中搜索 eachWord
的同义词。这种方法太费时间了。时间呈指数增长。还有其他方法可以减少时间吗?
首先,我假设您不想在 wordSynonymsByLevenstein
集合中将某个词与其自身相关联。其次,您可以通过比较单词的长度来跳过那些您知道不会满足 <2 分要求的单词。
public void SearchWordSynonymsByLevenstein()
{
foreach (var eachWord in wordCounter)
{
foreach (var eachSecondWord in wordCounter)
{
if (eachWord.Key == eachSecondWord.Key
|| eachWord.Key.Length <= 3
|| Math.Abs(eachWord.Key.Length - eachSecondWord.Key.Length) >= 2)
{
continue;
}
var score = LevenshteinDistance.Compute(eachWord.Key, eachSecondWord.Key);
if (score >= 2)
{
continue;
}
if(!wordSynonymsByLevenstein.Any(x => x.Value.ContainsKey(eachSecondWord.Key)))
{
if (!wordSynonymsByLevenstein.ContainsKey(eachWord.Key))
{
wordSynonymsByLevenstein.Add(eachWord.Key, new Dictionary<string, int> { { eachSecondWord.Key, eachSecondWord.Value } });
}
else
{
wordSynonymsByLevenstein[eachWord.Key].Add(eachSecondWord.Key, eachSecondWord.Value);
}
}
}
}
}
你用if(!wordSynonymsByLevenstein.Any(x => x.Value.ContainsKey(eachSecondWord.Key)))
表达的要求不是特别明显或直接,但如果你不希望一个词与多个词相关联,那么你可以额外添加一个HashSet<string>
并在关联单词时将它们添加到那个 HashSet
并在继续之前检查下一个单词是否在那里,而不是迭代嵌套词典。
public void SearchWordSynonymsByLevenstein()
{
var used = new HashSet<string>();
foreach (var eachWord in wordCounter)
{
foreach (var eachSecondWord in wordCounter)
{
if (eachWord.Key == eachSecondWord.Key
|| eachWord.Key.Length <= 3
|| Math.Abs(eachWord.Key.Length - eachSecondWord.Key.Length) >= 2)
{
continue;
}
var score = LevenshteinDistance.Compute(eachWord.Key, eachSecondWord.Key);
if (score >= 2)
{
continue;
}
if(used.Add(eachSecondWord.Key)))
{
if (!wordSynonymsByLevenstein.ContainsKey(eachWord.Key))
{
wordSynonymsByLevenstein.Add(eachWord.Key, new Dictionary<string, int> { { eachSecondWord.Key, eachSecondWord.Value } });
}
else
{
wordSynonymsByLevenstein[eachWord.Key].Add(eachSecondWord.Key, eachSecondWord.Value);
}
}
}
}
}
这里我使用了 if(used.Add(eachSecondWord.Key)))
因为 Add
将 return true
如果添加了单词, false
如果它已经在 HashSet
.
这是我的代码:
public void SearchWordSynonymsByLevenstein()
{
foreach (var eachWord in wordCounter)
{
foreach (var eachSecondWord in wordCounter)
{
if (eachWord.Key.Length > 3)
{
var score = LevenshteinDistance.Compute(eachWord.Key, eachSecondWord.Key);
if (score < 2)
{
if(!wordSynonymsByLevenstein.Any(x => x.Value.ContainsKey(eachSecondWord.Key)))
{
if (!wordSynonymsByLevenstein.ContainsKey(eachWord.Key))
{
wordSynonymsByLevenstein.Add(eachWord.Key, new Dictionary<string, int> { { eachSecondWord.Key, eachSecondWord.Value } });
}
else
{
wordSynonymsByLevenstein[eachWord.Key].Add(eachSecondWord.Key, eachSecondWord.Value);
}
}
}
}
}
}
}
我的 wordCounter
是 Dictionary<string, int>
其中键是我的每个单词,值是计算文档中这个单词的数量。像词袋之类的东西。我必须从其他 eachSecondWord
中搜索 eachWord
的同义词。这种方法太费时间了。时间呈指数增长。还有其他方法可以减少时间吗?
首先,我假设您不想在 wordSynonymsByLevenstein
集合中将某个词与其自身相关联。其次,您可以通过比较单词的长度来跳过那些您知道不会满足 <2 分要求的单词。
public void SearchWordSynonymsByLevenstein()
{
foreach (var eachWord in wordCounter)
{
foreach (var eachSecondWord in wordCounter)
{
if (eachWord.Key == eachSecondWord.Key
|| eachWord.Key.Length <= 3
|| Math.Abs(eachWord.Key.Length - eachSecondWord.Key.Length) >= 2)
{
continue;
}
var score = LevenshteinDistance.Compute(eachWord.Key, eachSecondWord.Key);
if (score >= 2)
{
continue;
}
if(!wordSynonymsByLevenstein.Any(x => x.Value.ContainsKey(eachSecondWord.Key)))
{
if (!wordSynonymsByLevenstein.ContainsKey(eachWord.Key))
{
wordSynonymsByLevenstein.Add(eachWord.Key, new Dictionary<string, int> { { eachSecondWord.Key, eachSecondWord.Value } });
}
else
{
wordSynonymsByLevenstein[eachWord.Key].Add(eachSecondWord.Key, eachSecondWord.Value);
}
}
}
}
}
你用if(!wordSynonymsByLevenstein.Any(x => x.Value.ContainsKey(eachSecondWord.Key)))
表达的要求不是特别明显或直接,但如果你不希望一个词与多个词相关联,那么你可以额外添加一个HashSet<string>
并在关联单词时将它们添加到那个 HashSet
并在继续之前检查下一个单词是否在那里,而不是迭代嵌套词典。
public void SearchWordSynonymsByLevenstein()
{
var used = new HashSet<string>();
foreach (var eachWord in wordCounter)
{
foreach (var eachSecondWord in wordCounter)
{
if (eachWord.Key == eachSecondWord.Key
|| eachWord.Key.Length <= 3
|| Math.Abs(eachWord.Key.Length - eachSecondWord.Key.Length) >= 2)
{
continue;
}
var score = LevenshteinDistance.Compute(eachWord.Key, eachSecondWord.Key);
if (score >= 2)
{
continue;
}
if(used.Add(eachSecondWord.Key)))
{
if (!wordSynonymsByLevenstein.ContainsKey(eachWord.Key))
{
wordSynonymsByLevenstein.Add(eachWord.Key, new Dictionary<string, int> { { eachSecondWord.Key, eachSecondWord.Value } });
}
else
{
wordSynonymsByLevenstein[eachWord.Key].Add(eachSecondWord.Key, eachSecondWord.Value);
}
}
}
}
}
这里我使用了 if(used.Add(eachSecondWord.Key)))
因为 Add
将 return true
如果添加了单词, false
如果它已经在 HashSet
.