Python 中的正则表达式匹配不一致
Inconsistent regex matching in Python
我有这段代码可以使用正则表达式打印在两个文件中找到的匹配项:
# Compare every ';'-separated row of ipCountry_list.txt with the lines of
# city.txt. The first line accepted for a row is printed, appended to
# unmatch.txt, and ends the search for that row.
with open('ipCountry_list.txt','r') as csvfile1, \
        open('city.txt', 'r', encoding="utf8" ) as file1, \
        open('unmatch.txt', 'a+') as file2:
    rows = csv.reader(csvfile1, delimiter=';')
    city_lines = file1.readlines()
    for row in rows:
        for line in city_lines:
            # The two-column prefix is tried before the three-column one,
            # so a line is accepted as soon as the first two columns are
            # both found in it.
            accepted = any(
                all(re.findall(fr"\b{word}\b[^ ]", line, re.IGNORECASE)
                    for word in row[:width])
                for width in (2, 3)
            )
            if accepted:
                print(str(row) + line)
                file2.write(str(row) + line)
                break
输出:
['TH', 'BANGKOK']'TH~10~Bangkok'
['ES', 'VALENCIA']'ES~VC~Valencia'
['US', 'AZ', 'PHOENIX']'US~AZ~Aguila'
['JP', 'KASHIWA']'JP~12~Kashiwa'
[CZ;PRAGUE 4 - aaa]
可以看到,输出中的 ['US', 'AZ', 'PHOENIX']'US~AZ~Aguila'
并不是正确的匹配(PHOENIX 与 Aguila 不是同一个城市)。
而 [CZ;PRAGUE 4 - aaa]
应该与 'CZ~10~Prague'
匹配,但实际上没有匹配到。
我不确定自己哪里做错了,也许是我的正则表达式有问题?
编辑:
ipCountry_list.txt:
TH;BANGKOK;aaa
ES;VALENCIA;aaa
US;AZ;PHOENIX;aaa
JP;KASHIWA;aaa
CZ;PRAGUE 4 - aaa;
ZA;EAST LONDON;aaa
GB;BRIDGWATER;aaa
RU;MOSCOW;aaa
GH;TEMA;aaa
city.txt:
'TH~10~Bangkok'
'ES~VC~Valencia'
'US~AZ~Aguila'
'US~AZ~Phoenix'
'JP~12~Kashiwa'
'GB~ENG~Bridgwater'
'ZA~EC~East London'
'RU~MOW~Moscow'
'GH~AA~Tema'
'CZ~10~Prague'
最后,我建议使用下面的代码作为答案:
import csv
import re

# Trailing "aaa" placeholder together with any spaces, digits and dashes
# in front of it, e.g. the " 4 - aaa" tail of "PRAGUE 4 - aaa".
_PLACEHOLDER_SUFFIX = re.compile(r'[ \d-]*aaa$')


def normalize_row(row):
    """Return the search keys taken from one ';'-separated CSV row.

    Rows with more than three columns contribute their first three
    columns, all other rows their first two.  The trailing "aaa"
    placeholder is removed from each key, and keys that are empty
    (before or after that clean-up) are dropped.
    """
    keys = row[:3] if len(row) > 3 else row[:2]
    cleaned = (_PLACEHOLDER_SUFFIX.sub('', col) for col in keys)
    # An empty key would turn into a pattern that matches any line.
    return [col for col in cleaned if col]


def find_matches(rows, lines):
    """Yield ``(row, line)`` for the first line containing all keys of a row.

    rows:  iterable of CSV rows (lists of strings).
    lines: sequence of city lines; it is scanned once per row.

    Rows that yield no usable key are skipped, because ``all()`` over an
    empty key list would otherwise accept the very first line.
    """
    for row in rows:
        keys = normalize_row(row)
        if not keys:
            continue
        # Compiled once per row instead of once per (row, line) pair.
        # re.escape keeps characters such as "." or "(" in the data from
        # being interpreted as regex syntax.
        patterns = [
            re.compile(fr"\b{re.escape(key)}\b(?:[^ ]|$)", re.IGNORECASE)
            for key in keys
        ]
        for line in lines:
            if all(pattern.search(line) for pattern in patterns):
                yield row, line
                break


def main(csv_path='ipCountry_list.txt', city_path='city.txt',
         out_path='unmatch.txt'):
    """Print every matched row/line pair and append it to ``out_path``."""
    # newline='' is what the csv module expects for files it reads.
    # The output file uses the same encoding as city.txt so that
    # non-ASCII city names can be written on every platform.
    with open(csv_path, 'r', newline='') as csvfile1, \
            open(city_path, 'r', encoding="utf8") as file1, \
            open(out_path, 'a+', encoding="utf8") as file2:
        # Guarantee exactly one trailing newline per line; the last line
        # of city.txt may lack one, which would glue output records together.
        lines = [line.rstrip('\n') + '\n' for line in file1]
        for row, line in find_matches(csv.reader(csvfile1, delimiter=';'),
                                      lines):
            print(str(row) + line, end='')
            file2.write(str(row) + line)


if __name__ == "__main__":
    main()
我有这段代码可以使用正则表达式打印在两个文件中找到的匹配项:
# Compare every ';'-separated row of ipCountry_list.txt with the lines of
# city.txt. The first line accepted for a row is printed, appended to
# unmatch.txt, and ends the search for that row.
with open('ipCountry_list.txt','r') as csvfile1, \
        open('city.txt', 'r', encoding="utf8" ) as file1, \
        open('unmatch.txt', 'a+') as file2:
    rows = csv.reader(csvfile1, delimiter=';')
    city_lines = file1.readlines()
    for row in rows:
        for line in city_lines:
            # The two-column prefix is tried before the three-column one,
            # so a line is accepted as soon as the first two columns are
            # both found in it.
            accepted = any(
                all(re.findall(fr"\b{word}\b[^ ]", line, re.IGNORECASE)
                    for word in row[:width])
                for width in (2, 3)
            )
            if accepted:
                print(str(row) + line)
                file2.write(str(row) + line)
                break
输出:
['TH', 'BANGKOK']'TH~10~Bangkok'
['ES', 'VALENCIA']'ES~VC~Valencia'
['US', 'AZ', 'PHOENIX']'US~AZ~Aguila'
['JP', 'KASHIWA']'JP~12~Kashiwa'
[CZ;PRAGUE 4 - aaa]
可以看到,输出中的 ['US', 'AZ', 'PHOENIX']'US~AZ~Aguila'
并不是正确的匹配(PHOENIX 与 Aguila 不是同一个城市)。
而 [CZ;PRAGUE 4 - aaa]
应该与 'CZ~10~Prague'
匹配,但实际上没有匹配到。
我不确定自己哪里做错了,也许是我的正则表达式有问题?
编辑:
ipCountry_list.txt:
TH;BANGKOK;aaa
ES;VALENCIA;aaa
US;AZ;PHOENIX;aaa
JP;KASHIWA;aaa
CZ;PRAGUE 4 - aaa;
ZA;EAST LONDON;aaa
GB;BRIDGWATER;aaa
RU;MOSCOW;aaa
GH;TEMA;aaa
city.txt:
'TH~10~Bangkok'
'ES~VC~Valencia'
'US~AZ~Aguila'
'US~AZ~Phoenix'
'JP~12~Kashiwa'
'GB~ENG~Bridgwater'
'ZA~EC~East London'
'RU~MOW~Moscow'
'GH~AA~Tema'
'CZ~10~Prague'
最后,我建议使用下面的代码作为答案:
import csv
import re

# Trailing "aaa" placeholder together with any spaces, digits and dashes
# in front of it, e.g. the " 4 - aaa" tail of "PRAGUE 4 - aaa".
_PLACEHOLDER_SUFFIX = re.compile(r'[ \d-]*aaa$')


def normalize_row(row):
    """Return the search keys taken from one ';'-separated CSV row.

    Rows with more than three columns contribute their first three
    columns, all other rows their first two.  The trailing "aaa"
    placeholder is removed from each key, and keys that are empty
    (before or after that clean-up) are dropped.
    """
    keys = row[:3] if len(row) > 3 else row[:2]
    cleaned = (_PLACEHOLDER_SUFFIX.sub('', col) for col in keys)
    # An empty key would turn into a pattern that matches any line.
    return [col for col in cleaned if col]


def find_matches(rows, lines):
    """Yield ``(row, line)`` for the first line containing all keys of a row.

    rows:  iterable of CSV rows (lists of strings).
    lines: sequence of city lines; it is scanned once per row.

    Rows that yield no usable key are skipped, because ``all()`` over an
    empty key list would otherwise accept the very first line.
    """
    for row in rows:
        keys = normalize_row(row)
        if not keys:
            continue
        # Compiled once per row instead of once per (row, line) pair.
        # re.escape keeps characters such as "." or "(" in the data from
        # being interpreted as regex syntax.
        patterns = [
            re.compile(fr"\b{re.escape(key)}\b(?:[^ ]|$)", re.IGNORECASE)
            for key in keys
        ]
        for line in lines:
            if all(pattern.search(line) for pattern in patterns):
                yield row, line
                break


def main(csv_path='ipCountry_list.txt', city_path='city.txt',
         out_path='unmatch.txt'):
    """Print every matched row/line pair and append it to ``out_path``."""
    # newline='' is what the csv module expects for files it reads.
    # The output file uses the same encoding as city.txt so that
    # non-ASCII city names can be written on every platform.
    with open(csv_path, 'r', newline='') as csvfile1, \
            open(city_path, 'r', encoding="utf8") as file1, \
            open(out_path, 'a+', encoding="utf8") as file2:
        # Guarantee exactly one trailing newline per line; the last line
        # of city.txt may lack one, which would glue output records together.
        lines = [line.rstrip('\n') + '\n' for line in file1]
        for row, line in find_matches(csv.reader(csvfile1, delimiter=';'),
                                      lines):
            print(str(row) + line, end='')
            file2.write(str(row) + line)


if __name__ == "__main__":
    main()