python3, difflib SequenceMatcher
python3, difflib SequenceMatcher
以下接受两个字符串,比较差异和 return 它们既相同又不同,用空格分隔(保持最长字符串的长度。
代码中的注释区,就是应该return编辑的4个字符串。
from difflib import SequenceMatcher
t1 = 'betty: backstreetvboysareback"give.jpg"LAlarrygarryhannyhref="ang"_self'
t2 = 'bettyv: backstreetvboysareback"lifeislike"LAlarrygarryhannyhref="in.php"_self'
#t1 = 'betty : backstreetvboysareback" i e "LAlarrygarryhannyhref=" n "_self'
#t2 = 'betty : backstreetvboysareback" i e "LAlarrygarryhannyhref=" n "_self'
#o1 = ' g v .jpg g '
#o2 = ' v l f islike i .php '
matcher = SequenceMatcher(None, t1, t2)
blocks = matcher.get_matching_blocks()
bla1 = []
bla2 = []
for i in range(len(blocks)):
if i != len(blocks)-1:
bla1.append([t1[blocks[i].a + blocks[i].size:blocks[i+1].a], blocks[i].a + blocks[i].size, blocks[i+1].a])
bla2.append([t2[blocks[i].b + blocks[i].size:blocks[i+1].b], blocks[i].b + blocks[i].size, blocks[i+1].b])
cnt = 0
for i in range(len(bla1)):
if bla1[i][1] < bla2[i][1]:
num = bla2[i][1] - bla1[i][1]
t2 = t2[0:bla2[i][1]] + ' '*num + t2[bla2[i][1]:len(t2)]
bla2[i][0] = ' '*num + bla2[i][0]
bla2[i][1] = bla1[i][1]
if bla2[i][1] < bla1[i][1]:
num = bla1[i][1] - bla2[i][1]
t1 = t1[0:bla1[i][1]] + ' '*num + t1[bla1[i][1]:len(t1)]
bla1[i][0] = ' '*num + bla1[i][0]
bla1[i][1] = bla2[i][1]
if bla1[i][2] > bla2[i][2]:
num = bla1[i][2] - bla2[i][2]
t2 = t2[0:bla2[i][2]] + ' '*num + t2[bla2[i][2]:len(t2)]
bla2[i][0] = bla2[i][0] + ' '*num
bla2[i][2] = bla1[i][2]
if bla2[i][2] > bla1[i][2]:
num = bla2[i][2] - bla1[i][2]
t1 = t1[0:bla1[i][2]] + ' '*num + t1[bla1[i][2]:len(t1)]
bla1[i][0] = bla1[i][0] + ' '*num
bla1[i][2] = bla2[i][2]
t11 = []
t11 = t1[0:bla1[0][1]]
t11 += t1[bla1[0][2]:bla1[1][1]]
t11 += t1[bla1[1][2]:bla1[2][1]]
t11 += t1[bla1[2][2]:bla1[3][1]]
t11 += t1[bla1[3][2]:bla1[4][1]]
t11 += t1[bla1[5][2]:bla1[6][1]]
t11 += t1[bla1[6][2]:len(t1)]
t12 = []
t12 = t2[0:bla1[0][1]]
t12 += t2[bla1[0][2]:bla1[1][1]]
t12 += t2[bla1[1][2]:bla1[2][1]]
t12 += t2[bla1[2][2]:bla1[3][1]]
t12 += t2[bla1[3][2]:bla1[4][1]]
t12 += t2[bla1[5][2]:bla1[6][1]]
t12 += t2[bla1[6][2]:len(t2)]
将块排列成有组织的格式后 bla1
,bla2
其中每个差异都存储为一个字符串及其开始和结束位置,例如 ['v', 33, 34]
用于每个单独的字符串。在此之后,我尝试插入空格以匹配所需的长度和分隔系数,这就是代码开始中断的地方。
如果有人可以看一下!
我已经解决了这个问题,由于没有人 post 做出回应,我将 post 进展 和 解决方案。以下代码是 progress ... 它在处理偏移量较小但在差异较大时开始中断的变化时效果很好,特别是在保持间距(偏移量)以匹配二.
from difflib import SequenceMatcher
import pdb
t1 = 'betty: backstreetvboysareback"give.jpg"LAlarrygarryhannyhref="ang"_self'
t2 = 'betty: backstreetvboysareback"lol.jpg"LAlarrygarryhannyhref="ang"_self'
#t2 = 'bettyv: backstreetvboysareback"lifeislike"LAlarrygarryhannyhref="in.php"_selff'
#t2 = 'LA'
#t2 = 'c give.'
#t2 = 'give.'
#t1 = 'betty : backstreetvboysareback" i e "LAlarrygarryhannyhref=" n "_self'
#t2 = 'betty : backstreetvboysareback" i e "LAlarrygarryhannyhref=" n "_self'
#o1 = ' g v .jpg g '
#o2 = ' v l f islike i .php '
matcher = SequenceMatcher(None, t1, t2)
blocks = matcher.get_matching_blocks()
#print(len(blocks))
bla1 = []
bla2 = []
#bla = (string), (first pos), (second pos), (pos1 + pos2), (pos + pos2 total positions added togeather)
dnt = False
for i in range(len(blocks)):
if i == 0:
if blocks[i].a != 0 and dnt == False:
bla1.append([t1[blocks[i].a:blocks[i].b], 0, blocks[i].a, 0, 0])
bla2.append([t2[blocks[i].a:blocks[i].b], 0, blocks[i].b, 0, 0])
dnt = True
if blocks[i].b != 0 and dnt == False:
bla2.append([t2[blocks[i].a:blocks[i].b], 0, blocks[i].b, 0, 0])
bla1.append([t1[blocks[i].a:blocks[i].b], 0, blocks[i].a, 0, 0])
dnt = True
if i != len(blocks)-1:
print(blocks[i])
bla1.append([t1[blocks[i].a + blocks[i].size:blocks[i+1].a], blocks[i].a + blocks[i].size, blocks[i+1].a, 0, 0])
bla2.append([t2[blocks[i].b + blocks[i].size:blocks[i+1].b], blocks[i].b + blocks[i].size, blocks[i+1].b, 0, 0])
#pdb.set_trace()
ttl = 0
for i in range(len(bla1)):
cnt = bla1[i][2] - bla1[i][1]
if cnt != 0:
bla1[i][3] = cnt
ttl = ttl + cnt
bla1[i][4] = ttl
ttl = 0
for i in range(len(bla2)):
cnt = bla2[i][2] - bla2[i][1]
if cnt != 0:
bla2[i][3] = cnt
ttl = ttl + cnt
bla2[i][4] = ttl
print(bla1)
print(bla2)
tt1 = ''
dif = 0
i = 0
while True:
if i == 0:
if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]
tt1 += t1[:bla1[i][1]] + '_'*dif
if i <= len(bla1) -1:
if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]
if len(bla1) != 1:
if i == 0: tt1 += t1[bla1[i][1] + bla1[i][3]:bla1[i+1][1]]
if i != 0 and i != len(bla1)-1: tt1 += '_'*dif + t1[bla1[i][1] + bla1[i][3]:bla1[i+1][1]]
if i == len(bla1)-1: tt1 += '_'*dif + t1[bla1[i][1] + bla1[i][3]:len(t1)]
i = i+1
print('t1 = ' + tt1)
else:
break
tt2 = ''
i = 0
dif = 0
while True:
if i == 0:
if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]
tt2 += t2[:bla2[i][1]] + '_'*dif
if i <= len(bla2) -1:
if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]
if len(bla2) != 1:
if i == 0: tt2 += t2[bla2[i][1] + bla2[i][3]:bla2[i+1][1]]
if i != 0 and i != len(bla1)-1: tt2 += '_'*dif + t2[bla2[i][1] + bla2[i][3]:bla2[i+1][1]]
if i == len(bla2)-1: tt2 += '_'*dif + t2[bla2[i][1] + bla2[i][3]:len(t2)]
i = i+1
print('t2 = ' + tt2)
else:
break
print()
解法:
不幸的是,我太忙了,无法继续编写代码,因此求助于 sub-processing diffutils ...这是许多艰苦编码的绝佳选择!
以下接受两个字符串,比较差异和 return 它们既相同又不同,用空格分隔(保持最长字符串的长度。
代码中的注释区,就是应该return编辑的4个字符串。
from difflib import SequenceMatcher
t1 = 'betty: backstreetvboysareback"give.jpg"LAlarrygarryhannyhref="ang"_self'
t2 = 'bettyv: backstreetvboysareback"lifeislike"LAlarrygarryhannyhref="in.php"_self'
#t1 = 'betty : backstreetvboysareback" i e "LAlarrygarryhannyhref=" n "_self'
#t2 = 'betty : backstreetvboysareback" i e "LAlarrygarryhannyhref=" n "_self'
#o1 = ' g v .jpg g '
#o2 = ' v l f islike i .php '
matcher = SequenceMatcher(None, t1, t2)
blocks = matcher.get_matching_blocks()
bla1 = []
bla2 = []
for i in range(len(blocks)):
if i != len(blocks)-1:
bla1.append([t1[blocks[i].a + blocks[i].size:blocks[i+1].a], blocks[i].a + blocks[i].size, blocks[i+1].a])
bla2.append([t2[blocks[i].b + blocks[i].size:blocks[i+1].b], blocks[i].b + blocks[i].size, blocks[i+1].b])
cnt = 0
for i in range(len(bla1)):
if bla1[i][1] < bla2[i][1]:
num = bla2[i][1] - bla1[i][1]
t2 = t2[0:bla2[i][1]] + ' '*num + t2[bla2[i][1]:len(t2)]
bla2[i][0] = ' '*num + bla2[i][0]
bla2[i][1] = bla1[i][1]
if bla2[i][1] < bla1[i][1]:
num = bla1[i][1] - bla2[i][1]
t1 = t1[0:bla1[i][1]] + ' '*num + t1[bla1[i][1]:len(t1)]
bla1[i][0] = ' '*num + bla1[i][0]
bla1[i][1] = bla2[i][1]
if bla1[i][2] > bla2[i][2]:
num = bla1[i][2] - bla2[i][2]
t2 = t2[0:bla2[i][2]] + ' '*num + t2[bla2[i][2]:len(t2)]
bla2[i][0] = bla2[i][0] + ' '*num
bla2[i][2] = bla1[i][2]
if bla2[i][2] > bla1[i][2]:
num = bla2[i][2] - bla1[i][2]
t1 = t1[0:bla1[i][2]] + ' '*num + t1[bla1[i][2]:len(t1)]
bla1[i][0] = bla1[i][0] + ' '*num
bla1[i][2] = bla2[i][2]
t11 = []
t11 = t1[0:bla1[0][1]]
t11 += t1[bla1[0][2]:bla1[1][1]]
t11 += t1[bla1[1][2]:bla1[2][1]]
t11 += t1[bla1[2][2]:bla1[3][1]]
t11 += t1[bla1[3][2]:bla1[4][1]]
t11 += t1[bla1[5][2]:bla1[6][1]]
t11 += t1[bla1[6][2]:len(t1)]
t12 = []
t12 = t2[0:bla1[0][1]]
t12 += t2[bla1[0][2]:bla1[1][1]]
t12 += t2[bla1[1][2]:bla1[2][1]]
t12 += t2[bla1[2][2]:bla1[3][1]]
t12 += t2[bla1[3][2]:bla1[4][1]]
t12 += t2[bla1[5][2]:bla1[6][1]]
t12 += t2[bla1[6][2]:len(t2)]
将块排列成有组织的格式后 bla1
,bla2
其中每个差异都存储为一个字符串及其开始和结束位置,例如 ['v', 33, 34]
用于每个单独的字符串。在此之后,我尝试插入空格以匹配所需的长度和分隔系数,这就是代码开始中断的地方。
如果有人可以看一下!
我已经解决了这个问题,由于没有人 post 做出回应,我将 post 进展 和 解决方案。以下代码是 progress ... 它在处理偏移量较小但在差异较大时开始中断的变化时效果很好,特别是在保持间距(偏移量)以匹配二.
from difflib import SequenceMatcher
import pdb
t1 = 'betty: backstreetvboysareback"give.jpg"LAlarrygarryhannyhref="ang"_self'
t2 = 'betty: backstreetvboysareback"lol.jpg"LAlarrygarryhannyhref="ang"_self'
#t2 = 'bettyv: backstreetvboysareback"lifeislike"LAlarrygarryhannyhref="in.php"_selff'
#t2 = 'LA'
#t2 = 'c give.'
#t2 = 'give.'
#t1 = 'betty : backstreetvboysareback" i e "LAlarrygarryhannyhref=" n "_self'
#t2 = 'betty : backstreetvboysareback" i e "LAlarrygarryhannyhref=" n "_self'
#o1 = ' g v .jpg g '
#o2 = ' v l f islike i .php '
matcher = SequenceMatcher(None, t1, t2)
blocks = matcher.get_matching_blocks()
#print(len(blocks))
bla1 = []
bla2 = []
#bla = (string), (first pos), (second pos), (pos1 + pos2), (pos + pos2 total positions added togeather)
dnt = False
for i in range(len(blocks)):
if i == 0:
if blocks[i].a != 0 and dnt == False:
bla1.append([t1[blocks[i].a:blocks[i].b], 0, blocks[i].a, 0, 0])
bla2.append([t2[blocks[i].a:blocks[i].b], 0, blocks[i].b, 0, 0])
dnt = True
if blocks[i].b != 0 and dnt == False:
bla2.append([t2[blocks[i].a:blocks[i].b], 0, blocks[i].b, 0, 0])
bla1.append([t1[blocks[i].a:blocks[i].b], 0, blocks[i].a, 0, 0])
dnt = True
if i != len(blocks)-1:
print(blocks[i])
bla1.append([t1[blocks[i].a + blocks[i].size:blocks[i+1].a], blocks[i].a + blocks[i].size, blocks[i+1].a, 0, 0])
bla2.append([t2[blocks[i].b + blocks[i].size:blocks[i+1].b], blocks[i].b + blocks[i].size, blocks[i+1].b, 0, 0])
#pdb.set_trace()
ttl = 0
for i in range(len(bla1)):
cnt = bla1[i][2] - bla1[i][1]
if cnt != 0:
bla1[i][3] = cnt
ttl = ttl + cnt
bla1[i][4] = ttl
ttl = 0
for i in range(len(bla2)):
cnt = bla2[i][2] - bla2[i][1]
if cnt != 0:
bla2[i][3] = cnt
ttl = ttl + cnt
bla2[i][4] = ttl
print(bla1)
print(bla2)
tt1 = ''
dif = 0
i = 0
while True:
if i == 0:
if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]
tt1 += t1[:bla1[i][1]] + '_'*dif
if i <= len(bla1) -1:
if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]
if len(bla1) != 1:
if i == 0: tt1 += t1[bla1[i][1] + bla1[i][3]:bla1[i+1][1]]
if i != 0 and i != len(bla1)-1: tt1 += '_'*dif + t1[bla1[i][1] + bla1[i][3]:bla1[i+1][1]]
if i == len(bla1)-1: tt1 += '_'*dif + t1[bla1[i][1] + bla1[i][3]:len(t1)]
i = i+1
print('t1 = ' + tt1)
else:
break
tt2 = ''
i = 0
dif = 0
while True:
if i == 0:
if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]
tt2 += t2[:bla2[i][1]] + '_'*dif
if i <= len(bla2) -1:
if bla1[i][3] >= bla2[i][3]: dif = bla1[i][3]
if bla1[i][3] < bla2[i][3]: dif = bla2[i][3]
if len(bla2) != 1:
if i == 0: tt2 += t2[bla2[i][1] + bla2[i][3]:bla2[i+1][1]]
if i != 0 and i != len(bla1)-1: tt2 += '_'*dif + t2[bla2[i][1] + bla2[i][3]:bla2[i+1][1]]
if i == len(bla2)-1: tt2 += '_'*dif + t2[bla2[i][1] + bla2[i][3]:len(t2)]
i = i+1
print('t2 = ' + tt2)
else:
break
print()
解法:
不幸的是,我太忙了,无法继续编写代码,因此求助于 sub-processing diffutils ...这是许多艰苦编码的绝佳选择!