计算长度不等的字符串的最小编辑距离(Python)
Calculating Minimum Edit Distance for unequal strings python
我正在尝试实现替换成本为 2 的最小编辑距离。以下是我目前的代码:它对长度相等的字符串有效,但对长度不等的字符串会报错。请指出哪里有问题。
def med(source, target):
    """Return the minimum edit distance between ``source`` and ``target``.

    Insertions and deletions cost 1; a substitution costs 2. The table has
    one row per prefix of ``target`` and one column per prefix of ``source``.

    Side effects:
        Prints the two input lengths, and rebinds the module-level
        ``backtrace`` table. Each cell whose characters differ records which
        operation produced its cost; cells whose characters match stay 0.

    Args:
        source: The string being edited.
        target: The string to be produced.

    Returns:
        The cost stored in the bottom-right cell of the table.
    """
    global backtrace

    rows, cols = len(target), len(source)
    print(len(source), len(target))

    # SUB/INS/DEL are expected to be module-level tags, but this snippet never
    # defines them; fall back to string labels so the function does not crash.
    module_names = globals()
    sub_tag = module_names.get("SUB", "SUB")
    ins_tag = module_names.get("INS", "INS")
    del_tag = module_names.get("DEL", "DEL")

    cost = [[0] * (cols + 1) for _ in range(rows + 1)]
    backtrace = [[0] * (cols + 1) for _ in range(rows + 1)]

    # Boundary cells: turning an empty prefix into a prefix of length k costs k.
    cost[0] = list(range(cols + 1))
    for i in range(rows + 1):
        cost[i][0] = i

    # Row index i walks target and column index j walks source. The original
    # indexed the strings the other way round, which raised IndexError
    # whenever the two lengths differed.
    for i, target_char in enumerate(target, start=1):
        for j, source_char in enumerate(source, start=1):
            if source_char == target_char:
                cost[i][j] = cost[i - 1][j - 1]
                continue

            deletion = cost[i - 1][j] + 1
            insertion = cost[i][j - 1] + 1
            substitution = cost[i - 1][j - 1] + 2
            best = min(insertion, deletion, substitution)
            cost[i][j] = best

            # Tie-break order is kept from the original: SUB, then INS, then DEL.
            if best == substitution:
                backtrace[i][j] = sub_tag
            elif best == insertion:
                backtrace[i][j] = ins_tag
            else:
                backtrace[i][j] = del_tag

    # Index the final cell explicitly instead of relying on leaked loop variables.
    return cost[rows][cols]
# Example call with strings of unequal length (11 and 6 characters).
med("levenshtein","levels")
我得到的错误是:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-26-86bf20ea27c7> in <module>()
49 return cost[i][j]
50
---> 51 med("levenshtein","levels")
<ipython-input-26-86bf20ea27c7> in med(source, target)
31 for i in range(1,len(target)+1):
32 for j in range(1,len(source)+1):
---> 33 if source[i-1]==target[j-1]:
34 cost[i][j] = cost[i-1][j-1]
35 else:
IndexError: string index out of range
对于长度不同的字符串,下标用反了:cost 和 backtrace 的行(i)对应 target、列(j)对应 source,但比较时写成了 source[i-1]==target[j-1],因此会越界;应改为 source[j-1]==target[i-1]。
只需填充一个 (m+1) × (n+1) 的 numpy 数组,即可实现替换成本为 2 的最小编辑距离。
按照该算法,下面的代码可以完成这项工作。
def minimumEditDistance(first, second):
    """Return the minimum edit distance between ``first`` and ``second``.

    Insertions and deletions cost 1; a substitution costs 2 (0 when the two
    characters already match). The distance is computed by filling a
    ``(len(first) + 1) x (len(second) + 1)`` NumPy integer matrix.

    Args:
        first: The string indexed by matrix rows.
        second: The string indexed by matrix columns.

    Returns:
        The bottom-right matrix entry, as a NumPy integer scalar.
    """
    # Local import keeps the snippet runnable on its own.
    import numpy as np

    rows, cols = len(first) + 1, len(second) + 1
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the same dtype.
    matrix = np.zeros((rows, cols), dtype=int)

    # Boundary cases: if one string is empty the cost is the other's length.
    matrix[0, :] = np.arange(cols)
    matrix[:, 0] = np.arange(rows)

    for i in range(1, rows):
        for j in range(1, cols):
            substitution_cost = 0 if first[i - 1] == second[j - 1] else 2
            matrix[i, j] = min(
                matrix[i, j - 1] + 1,
                matrix[i - 1, j] + 1,
                matrix[i - 1, j - 1] + substitution_cost,
            )

    return matrix[-1, -1]
输出:
>>>print(minimumEditDistance('levenshtein','levels'))
7
>>>print(minimumEditDistance('levenshtein','levenshtein'))
0
我正在尝试实现替换成本为 2 的最小编辑距离。以下是我目前的代码:它对长度相等的字符串有效,但对长度不等的字符串会报错。请指出哪里有问题。
def med(source, target):
    """Return the minimum edit distance between ``source`` and ``target``.

    Insertions and deletions cost 1; a substitution costs 2. The table has
    one row per prefix of ``target`` and one column per prefix of ``source``.

    Side effects:
        Prints the two input lengths, and rebinds the module-level
        ``backtrace`` table. Each cell whose characters differ records which
        operation produced its cost; cells whose characters match stay 0.

    Args:
        source: The string being edited.
        target: The string to be produced.

    Returns:
        The cost stored in the bottom-right cell of the table.
    """
    global backtrace

    rows, cols = len(target), len(source)
    print(len(source), len(target))

    # SUB/INS/DEL are expected to be module-level tags, but this snippet never
    # defines them; fall back to string labels so the function does not crash.
    module_names = globals()
    sub_tag = module_names.get("SUB", "SUB")
    ins_tag = module_names.get("INS", "INS")
    del_tag = module_names.get("DEL", "DEL")

    cost = [[0] * (cols + 1) for _ in range(rows + 1)]
    backtrace = [[0] * (cols + 1) for _ in range(rows + 1)]

    # Boundary cells: turning an empty prefix into a prefix of length k costs k.
    cost[0] = list(range(cols + 1))
    for i in range(rows + 1):
        cost[i][0] = i

    # Row index i walks target and column index j walks source. The original
    # indexed the strings the other way round, which raised IndexError
    # whenever the two lengths differed.
    for i, target_char in enumerate(target, start=1):
        for j, source_char in enumerate(source, start=1):
            if source_char == target_char:
                cost[i][j] = cost[i - 1][j - 1]
                continue

            deletion = cost[i - 1][j] + 1
            insertion = cost[i][j - 1] + 1
            substitution = cost[i - 1][j - 1] + 2
            best = min(insertion, deletion, substitution)
            cost[i][j] = best

            # Tie-break order is kept from the original: SUB, then INS, then DEL.
            if best == substitution:
                backtrace[i][j] = sub_tag
            elif best == insertion:
                backtrace[i][j] = ins_tag
            else:
                backtrace[i][j] = del_tag

    # Index the final cell explicitly instead of relying on leaked loop variables.
    return cost[rows][cols]
# Example call with strings of unequal length (11 and 6 characters).
med("levenshtein","levels")
我得到的错误是:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-26-86bf20ea27c7> in <module>()
49 return cost[i][j]
50
---> 51 med("levenshtein","levels")
<ipython-input-26-86bf20ea27c7> in med(source, target)
31 for i in range(1,len(target)+1):
32 for j in range(1,len(source)+1):
---> 33 if source[i-1]==target[j-1]:
34 cost[i][j] = cost[i-1][j-1]
35 else:
IndexError: string index out of range
对于长度不同的字符串,下标用反了:cost 和 backtrace 的行(i)对应 target、列(j)对应 source,但比较时写成了 source[i-1]==target[j-1],因此会越界;应改为 source[j-1]==target[i-1]。
只需填充一个 (m+1) × (n+1) 的 numpy 数组,即可实现替换成本为 2 的最小编辑距离。
按照该算法,下面的代码可以完成这项工作。
def minimumEditDistance(first, second):
    """Return the minimum edit distance between ``first`` and ``second``.

    Insertions and deletions cost 1; a substitution costs 2 (0 when the two
    characters already match). The distance is computed by filling a
    ``(len(first) + 1) x (len(second) + 1)`` NumPy integer matrix.

    Args:
        first: The string indexed by matrix rows.
        second: The string indexed by matrix columns.

    Returns:
        The bottom-right matrix entry, as a NumPy integer scalar.
    """
    # Local import keeps the snippet runnable on its own.
    import numpy as np

    rows, cols = len(first) + 1, len(second) + 1
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the same dtype.
    matrix = np.zeros((rows, cols), dtype=int)

    # Boundary cases: if one string is empty the cost is the other's length.
    matrix[0, :] = np.arange(cols)
    matrix[:, 0] = np.arange(rows)

    for i in range(1, rows):
        for j in range(1, cols):
            substitution_cost = 0 if first[i - 1] == second[j - 1] else 2
            matrix[i, j] = min(
                matrix[i, j - 1] + 1,
                matrix[i - 1, j] + 1,
                matrix[i - 1, j - 1] + substitution_cost,
            )

    return matrix[-1, -1]
输出:
>>>print(minimumEditDistance('levenshtein','levels'))
7
>>>print(minimumEditDistance('levenshtein','levenshtein'))
0