fuzzywuzzy 返回单个字符,而不是字符串
fuzzywuzzy returning single characters, not strings
我不确定哪里出了问题,也不明白为什么返回的结果不对。这段代码使用 fuzzywuzzy,根据一份正确的道路名称列表来清理输入有误的道路名称,把不正确的名称替换为最接近的匹配项。
目前返回的是 data2
的所有行。我希望返回的是 data1
中保持不变或已被替换为正确名称的那一行。
我的最小可复制示例:
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# Asker's minimal reproduction, kept exactly as posted.
# NOTE(review): the original indentation was lost, so the nesting of the
# statements after `try:` / `if` below has to be inferred by the reader.
# data1 is the raw address to clean; data2 is the known-good street name.
data1 =('3176 DETRIT ROAD')
data2 =('DETROIT RD')
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
# NOTE(review): on this error data1 is left as the raw string, so the
# data1[0].get(...) calls below would then be applied to a single character.
pass
# Element 0 of the tag result is read as a mapping of address components.
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
# Rebuild "<direction> <name> <type>"; roaddnum2 is read but not used here.
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
# Turn the single choice string into a list of choices for process.extract.
data2 = data2.split(',')
if street2 not in data2:
street2 = process.extract(street2, data2)
# NOTE(review): if street2 is still a plain string at this point (exact
# match), street2[0] is only its first character.
print(street2[0])
我的完整代码
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def convert_tolist(string):
    """Split *string* on single spaces and return the pieces as a list.

    Consecutive spaces produce empty-string entries, because the separator
    is given explicitly as ``" "``.
    """
    # str.split already returns a new list, so no list() wrapper is needed.
    return string.split(" ")
# Asker's full script, kept exactly as posted.
# NOTE(review): the original indentation was lost, so block nesting below
# has to be inferred by the reader.
# Opens the address file (read), an output file (write) and the file of
# known-good street names (read).
with open(r"Cass_Howard - Copy.csv") as csv_file,\
open("Final_Test_Clean.csv", "w") as f,\
open(r"TEST_no_dups12.csv") as ul:
csv_reader = csv.reader(csv_file, delimiter=',')
# Consume the first row of the address file.
next(csv_reader)
# NOTE(review): csv_reader is rebound to a reader over the write-mode file f
# and is not used again in this script.
csv_reader = csv.reader(f, delimiter=',')
file_1 = csv_file
file_2 = ul
# zip() pairs line i of the address file with line i of the names file, so
# each address is compared against exactly one name, not the whole list.
for data1, data2 in zip(file_1, file_2):
# Take field 18 of the comma-split address line and upper-case it.
data1 = data1.split(',')
data1 = data1[18]
data1 = data1.upper()
# split(',') followed by ''.join leaves data2 as a plain string again.
data2 = data2.strip()
data2 = data2.split(',')
data2 = ''.join(data2)
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
# NOTE(review): on this error data1 stays a string and the .get calls below
# are applied to its first character.
pass
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
# Rebuild "<direction> <name> <type>"; roaddnum2 is read but not used here.
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data1 = list(data1)
# The list returned by convert_tolist is discarded; data2 is unchanged.
convert_tolist(data2)
# Here `in` is a substring test, because data2 is a string.
if street2 not in data2:
# NOTE(review): data2 is a str, so process.extract presumably iterates over
# its individual characters - consistent with the single-character results
# described in the question title.
street2 = process.extract(street2, data2)
print(street2)
street2
查询数据(约950行)
DETROIT ROAD
DETROIT ROAD
MANNIX ST
MANNIX ST
data2
选择数据(约200行)
ACRES
ADERSON RD
AIRPORT RD
ALGONQUIN
好的,我不确定我是否已经完全理解您的问题,但是通过修改您的 reprex,我得出了以下解决方案。
import usaddress
from fuzzywuzzy import process
# Minimal example: replace the street part of one address with the closest
# entry from a list of known-good street names.
data1 = "3176 DETRIT ROAD"
choices = ["DETROIT RD"]

try:
    # usaddress.tag returns (components, address_type); only the components
    # mapping is needed here.
    tagged, _ = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
    # The address could not be tagged, so there is nothing to correct;
    # fall back to the untouched input instead of indexing into a string.
    street = data1
else:
    parts = [
        tagged.get("StreetNamePreDirectional"),
        tagged.get("StreetName"),
        tagged.get("StreetNamePostType"),
    ]
    # Missing components come back as None and are dropped before joining.
    street = " ".join([x for x in parts if x])
    if street not in choices:
        # process.extract returns (choice, score) pairs, best match first.
        street = process.extract(street, choices)[0][0]
    # Put the house number back in front on both the exact-match and the
    # fuzzy-match path; skip it when usaddress found none.
    street = " ".join(
        [x for x in (tagged.get("AddressNumber"), street) if x]
    )
print(street)
这产生:
3176 DETROIT RD
所以基本上,它使用 process.extract()
函数,把地址中的街道名称部分替换为候选列表中最接近的匹配项。
这里是一个可调用函数:
import usaddress
from fuzzywuzzy import process
def best_street(addr: str, choices: list[str]) -> str:
    """Return *addr* with its street replaced by the closest entry in *choices*.

    The address is tagged with ``usaddress``; the pre-directional, street
    name and post-type components are joined into a query, and the best
    fuzzy match from *choices* is substituted for it. The house number, when
    present, is kept in front of the matched street.

    Args:
        addr: Free-form street address, e.g. ``"123 Detrt"``.
        choices: Known-good street names to match against.

    Returns:
        ``"<number> <matched street>"``, or *addr* unchanged when the address
        cannot be tagged, contains no street component, or *choices* yields
        no match (for example because it is empty).
    """
    try:
        tagged, _ = usaddress.tag(addr)
    except usaddress.RepeatedLabelError:
        # Nothing usable was parsed; returning the input avoids using an
        # unbound result below.
        return addr

    street_parts = [
        tagged.get("StreetNamePreDirectional"),
        tagged.get("StreetName"),
        tagged.get("StreetNamePostType"),
    ]
    street = " ".join(part for part in street_parts if part)
    if not street:
        # An empty query would otherwise just return the first choice.
        return addr

    # extractOne returns (choice, score), or None when there is no candidate.
    match = process.extractOne(street, choices)
    if match is None:
        return addr

    # AddressNumber may be missing; drop it rather than joining None.
    number = tagged.get("AddressNumber")
    return " ".join(part for part in (number, match[0]) if part)


if __name__ == "__main__":
    choices = ["AIRPORT RD", "DETROIT RD"]
    print(best_street("123 Detrt", choices))
    print(best_street("9876 AIRPUMP DR", choices))
输出:
123 DETROIT RD
9876 AIRPORT RD
我遇到的第一个问题是:fuzzywuzzy 需要传入一个由字符串组成的可迭代对象(例如列表)作为候选项,所以我必须加上 data2 = data2.split(',')
才能返回完整的字符串,而不是单个字符。
我的另一个问题是误以为需要用 zip()
来比较两个文件;而 zip()
只是按位置把两个序列逐项并行配对,并不会做交叉(笛卡尔积)比较。
这是我最终写出来的可用方案。代码可以正常运行,但 fuzzywuzzy 的匹配不够准确,不足以用来清理我地址数据中的拼写错误。
如果你有办法帮我简化这些 if/else 语句,我很乐意听取建议。
import os
import csv
import shutil
import usaddress
import pandas as pd
from fuzzywuzzy import process
# Value stored for rows whose address is blank or has no street component.
NULL_ADDRESS = "Null"


def clean_address(raw_address, choices):
    """Return *raw_address* with its street replaced by the closest known name.

    Args:
        raw_address: Address text taken from the input CSV.
        choices: Known-good street names (no trailing newlines).

    Returns:
        ``"<number> <street>"`` where the street is either an exact entry of
        *choices* or its best fuzzy match; ``NULL_ADDRESS`` when the address
        is blank or has no street component; the upper-cased input when
        usaddress cannot tag it.
    """
    address = raw_address.strip().upper()
    if not address:
        return NULL_ADDRESS

    try:
        tagged, _ = usaddress.tag(address)
    except usaddress.RepeatedLabelError:
        # The address could not be tagged; keep the text so the row can be
        # corrected by hand instead of crashing on it.
        return address

    street_parts = (
        tagged.get("StreetNamePreDirectional", ""),
        tagged.get("StreetName", ""),
        tagged.get("StreetNamePostType", ""),
    )
    # Drop empty components so a missing one does not leave a double space.
    street = " ".join(part for part in street_parts if part)
    if not street:
        return NULL_ADDRESS

    if street not in choices:
        # extractOne returns (choice, score), or None without candidates.
        best = process.extractOne(street, choices)
        if best is not None:
            street = best[0]

    number = tagged.get("AddressNumber", "")
    return f"{number} {street}".strip()


with open(r"TEST_Unique_List.csv") as ul:
    # Strip line endings so the exact-match test in clean_address can succeed.
    correct = [name.strip() for name in ul if name.strip()]

with open(r"TEST_Cass_Howard.csv", newline="") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)  # skip the header row
    # Column 18 holds the address to clean.
    cleaned = [clean_address(row[18], correct) for row in csv_reader]

# Insert the cleaned addresses as column 19 of the original table and save
# the result; no scratch files are needed because everything is in memory.
df = pd.read_csv(r"TEST_Cass_Howard.csv")
df.insert(loc=19, column='WELL_ADDR_CLN', value=cleaned)
df.to_csv("Cass_Howard_Clean.csv", index=False)
我不确定哪里出了问题,也不明白为什么返回的结果不对。这段代码使用 fuzzywuzzy,根据一份正确的道路名称列表来清理输入有误的道路名称,把不正确的名称替换为最接近的匹配项。
目前返回的是 data2
的所有行。我希望返回的是 data1
中保持不变或已被替换为正确名称的那一行。
我的最小可复制示例:
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# Asker's minimal reproduction, kept exactly as posted.
# NOTE(review): the original indentation was lost, so the nesting of the
# statements after `try:` / `if` below has to be inferred by the reader.
# data1 is the raw address to clean; data2 is the known-good street name.
data1 =('3176 DETRIT ROAD')
data2 =('DETROIT RD')
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
# NOTE(review): on this error data1 is left as the raw string, so the
# data1[0].get(...) calls below would then be applied to a single character.
pass
# Element 0 of the tag result is read as a mapping of address components.
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
# Rebuild "<direction> <name> <type>"; roaddnum2 is read but not used here.
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
# Turn the single choice string into a list of choices for process.extract.
data2 = data2.split(',')
if street2 not in data2:
street2 = process.extract(street2, data2)
# NOTE(review): if street2 is still a plain string at this point (exact
# match), street2[0] is only its first character.
print(street2[0])
我的完整代码
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def convert_tolist(string):
    """Split *string* on single spaces and return the pieces as a list.

    Consecutive spaces produce empty-string entries, because the separator
    is given explicitly as ``" "``.
    """
    # str.split already returns a new list, so no list() wrapper is needed.
    return string.split(" ")
# Asker's full script, kept exactly as posted.
# NOTE(review): the original indentation was lost, so block nesting below
# has to be inferred by the reader.
# Opens the address file (read), an output file (write) and the file of
# known-good street names (read).
with open(r"Cass_Howard - Copy.csv") as csv_file,\
open("Final_Test_Clean.csv", "w") as f,\
open(r"TEST_no_dups12.csv") as ul:
csv_reader = csv.reader(csv_file, delimiter=',')
# Consume the first row of the address file.
next(csv_reader)
# NOTE(review): csv_reader is rebound to a reader over the write-mode file f
# and is not used again in this script.
csv_reader = csv.reader(f, delimiter=',')
file_1 = csv_file
file_2 = ul
# zip() pairs line i of the address file with line i of the names file, so
# each address is compared against exactly one name, not the whole list.
for data1, data2 in zip(file_1, file_2):
# Take field 18 of the comma-split address line and upper-case it.
data1 = data1.split(',')
data1 = data1[18]
data1 = data1.upper()
# split(',') followed by ''.join leaves data2 as a plain string again.
data2 = data2.strip()
data2 = data2.split(',')
data2 = ''.join(data2)
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
# NOTE(review): on this error data1 stays a string and the .get calls below
# are applied to its first character.
pass
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
# Rebuild "<direction> <name> <type>"; roaddnum2 is read but not used here.
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data1 = list(data1)
# The list returned by convert_tolist is discarded; data2 is unchanged.
convert_tolist(data2)
# Here `in` is a substring test, because data2 is a string.
if street2 not in data2:
# NOTE(review): data2 is a str, so process.extract presumably iterates over
# its individual characters - consistent with the single-character results
# described in the question title.
street2 = process.extract(street2, data2)
print(street2)
street2
查询数据(约950行)
DETROIT ROAD
DETROIT ROAD
MANNIX ST
MANNIX ST
data2
选择数据(约200行)
ACRES
ADERSON RD
AIRPORT RD
ALGONQUIN
好的,我不确定我是否已经完全理解您的问题,但是通过修改您的 reprex,我得出了以下解决方案。
import usaddress
from fuzzywuzzy import process
# Minimal example: replace the street part of one address with the closest
# entry from a list of known-good street names.
data1 = "3176 DETRIT ROAD"
choices = ["DETROIT RD"]

try:
    # usaddress.tag returns (components, address_type); only the components
    # mapping is needed here.
    tagged, _ = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
    # The address could not be tagged, so there is nothing to correct;
    # fall back to the untouched input instead of indexing into a string.
    street = data1
else:
    parts = [
        tagged.get("StreetNamePreDirectional"),
        tagged.get("StreetName"),
        tagged.get("StreetNamePostType"),
    ]
    # Missing components come back as None and are dropped before joining.
    street = " ".join([x for x in parts if x])
    if street not in choices:
        # process.extract returns (choice, score) pairs, best match first.
        street = process.extract(street, choices)[0][0]
    # Put the house number back in front on both the exact-match and the
    # fuzzy-match path; skip it when usaddress found none.
    street = " ".join(
        [x for x in (tagged.get("AddressNumber"), street) if x]
    )
print(street)
这产生:
3176 DETROIT RD
所以基本上,它使用 process.extract()
函数,把地址中的街道名称部分替换为候选列表中最接近的匹配项。
这里是一个可调用函数:
import usaddress
from fuzzywuzzy import process
def best_street(addr: str, choices: list[str]) -> str:
    """Return *addr* with its street replaced by the closest entry in *choices*.

    The address is tagged with ``usaddress``; the pre-directional, street
    name and post-type components are joined into a query, and the best
    fuzzy match from *choices* is substituted for it. The house number, when
    present, is kept in front of the matched street.

    Args:
        addr: Free-form street address, e.g. ``"123 Detrt"``.
        choices: Known-good street names to match against.

    Returns:
        ``"<number> <matched street>"``, or *addr* unchanged when the address
        cannot be tagged, contains no street component, or *choices* yields
        no match (for example because it is empty).
    """
    try:
        tagged, _ = usaddress.tag(addr)
    except usaddress.RepeatedLabelError:
        # Nothing usable was parsed; returning the input avoids using an
        # unbound result below.
        return addr

    street_parts = [
        tagged.get("StreetNamePreDirectional"),
        tagged.get("StreetName"),
        tagged.get("StreetNamePostType"),
    ]
    street = " ".join(part for part in street_parts if part)
    if not street:
        # An empty query would otherwise just return the first choice.
        return addr

    # extractOne returns (choice, score), or None when there is no candidate.
    match = process.extractOne(street, choices)
    if match is None:
        return addr

    # AddressNumber may be missing; drop it rather than joining None.
    number = tagged.get("AddressNumber")
    return " ".join(part for part in (number, match[0]) if part)


if __name__ == "__main__":
    choices = ["AIRPORT RD", "DETROIT RD"]
    print(best_street("123 Detrt", choices))
    print(best_street("9876 AIRPUMP DR", choices))
输出:
123 DETROIT RD
9876 AIRPORT RD
我遇到的第一个问题是:fuzzywuzzy 需要传入一个由字符串组成的可迭代对象(例如列表)作为候选项,所以我必须加上 data2 = data2.split(',')
才能返回完整的字符串,而不是单个字符。
我的另一个问题是误以为需要用 zip()
来比较两个文件;而 zip()
只是按位置把两个序列逐项并行配对,并不会做交叉(笛卡尔积)比较。
这是我最终写出来的可用方案。代码可以正常运行,但 fuzzywuzzy 的匹配不够准确,不足以用来清理我地址数据中的拼写错误。
如果你有办法帮我简化这些 if/else 语句,我很乐意听取建议。
import os
import csv
import shutil
import usaddress
import pandas as pd
from fuzzywuzzy import process
# Value stored for rows whose address is blank or has no street component.
NULL_ADDRESS = "Null"


def clean_address(raw_address, choices):
    """Return *raw_address* with its street replaced by the closest known name.

    Args:
        raw_address: Address text taken from the input CSV.
        choices: Known-good street names (no trailing newlines).

    Returns:
        ``"<number> <street>"`` where the street is either an exact entry of
        *choices* or its best fuzzy match; ``NULL_ADDRESS`` when the address
        is blank or has no street component; the upper-cased input when
        usaddress cannot tag it.
    """
    address = raw_address.strip().upper()
    if not address:
        return NULL_ADDRESS

    try:
        tagged, _ = usaddress.tag(address)
    except usaddress.RepeatedLabelError:
        # The address could not be tagged; keep the text so the row can be
        # corrected by hand instead of crashing on it.
        return address

    street_parts = (
        tagged.get("StreetNamePreDirectional", ""),
        tagged.get("StreetName", ""),
        tagged.get("StreetNamePostType", ""),
    )
    # Drop empty components so a missing one does not leave a double space.
    street = " ".join(part for part in street_parts if part)
    if not street:
        return NULL_ADDRESS

    if street not in choices:
        # extractOne returns (choice, score), or None without candidates.
        best = process.extractOne(street, choices)
        if best is not None:
            street = best[0]

    number = tagged.get("AddressNumber", "")
    return f"{number} {street}".strip()


with open(r"TEST_Unique_List.csv") as ul:
    # Strip line endings so the exact-match test in clean_address can succeed.
    correct = [name.strip() for name in ul if name.strip()]

with open(r"TEST_Cass_Howard.csv", newline="") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)  # skip the header row
    # Column 18 holds the address to clean.
    cleaned = [clean_address(row[18], correct) for row in csv_reader]

# Insert the cleaned addresses as column 19 of the original table and save
# the result; no scratch files are needed because everything is in memory.
df = pd.read_csv(r"TEST_Cass_Howard.csv")
df.insert(loc=19, column='WELL_ADDR_CLN', value=cleaned)
df.to_csv("Cass_Howard_Clean.csv", index=False)