fuzzywuzzy 返回单个字符,而不是字符串

fuzzywuzzy returning single characters, not strings

我不确定哪里出了问题,也不明白为什么返回的数据不对。这段代码的目的是使用 fuzzywuzzy,根据一份正确名称列表来清理输入有误的道路名称,把不正确的名称替换为最接近的匹配项。

目前返回的是 data2 的所有行。我希望得到的是:data1 的每一行要么原样返回,要么返回替换后的结果。

我的最小可复现示例:

import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Raw address to clean (data1) and the reference street name(s) to match
# against (data2).
data1 =('3176 DETRIT ROAD')
data2 =('DETROIT RD')

# Tag the address components; the result is indexed with [0] below to reach
# the tagged fields.
try:
    data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
    # NOTE(review): on failure data1 is left as the raw string, so the
    # data1[0].get(...) calls below would then fail (a str element has no .get).
    pass

# Pull out the individual components, defaulting to '' when one is absent.
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')

# Rebuild the street (direction + name + type) without the house number.
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
# Turn the reference string into a list so the choices passed to
# process.extract are whole names rather than one plain string.
data2 = data2.split(',')

# Only fuzzy-match when the street is not already an exact member of the list;
# the first element of the process.extract result is printed.
if street2 not in data2:
    street2 = process.extract(street2, data2)
    print(street2[0])

我的完整代码

import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def convert_tolist(string):
    """Split *string* on single space characters and return the pieces as a list.

    Consecutive spaces produce empty strings in the result, because the
    separator is given explicitly.
    """
    return string.split(" ")

# Open the source CSV (read), the output file (write) and the list of correct
# street names (read).
with open(r"Cass_Howard - Copy.csv") as csv_file,\
        open("Final_Test_Clean.csv", "w") as f,\
        open(r"TEST_no_dups12.csv") as ul:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Consume the first row of the source CSV (presumably the header).
    next(csv_reader)
    # NOTE(review): this rebinds csv_reader to a reader over f, which is opened
    # for writing; the reader is never used afterwards.
    csv_reader = csv.reader(f, delimiter=',')
    file_1 = csv_file
    file_2 = ul

    # zip() pairs line i of the source file with line i of the street list and
    # stops at the shorter one, so each address is only compared with a single
    # street name rather than with the whole list.
    for data1, data2 in zip(file_1, file_2):
        # Take the 19th comma-separated field of the source line, upper-cased.
        data1 = data1.split(',')
        data1 = data1[18]
        data1 = data1.upper()
        # Strip the line, split it on commas and glue the pieces back together:
        # data2 ends up as one plain string again.
        data2 = data2.strip()
        data2 = data2.split(',')
        data2 = ''.join(data2)
        try:
            data1 = usaddress.tag(data1)
        except usaddress.RepeatedLabelError:
            # NOTE(review): on failure data1 stays a str and the
            # data1[0].get(...) calls below would fail.
            pass

        # Individual address components, '' when absent.
        roaddnum2 = data1[0].get('AddressNumber', '')
        roadir2 = data1[0].get('StreetNamePreDirectional', '')
        roadname2 = data1[0].get('StreetName', '')
        roaddsg2 = data1[0].get('StreetNamePostType', '')

        # Street without the house number: direction + name + type.
        street2 = (roadir2, roadname2, roaddsg2)
        street2 = " ".join(street2)
        street2 = street2.strip()
        # data1 is converted to a list but not read again in this iteration.
        data1 = list(data1)
        # NOTE(review): the return value is discarded, so data2 is still a str.
        convert_tolist(data2)


        # Because data2 is a str, `not in` is a substring test and
        # process.extract walks data2 character by character -- which is why
        # the matches come back as single characters instead of street names.
        if street2 not in data2:
            street2 = process.extract(street2, data2)
            print(street2)

street2查询数据(约950行)

DETROIT ROAD
DETROIT ROAD
MANNIX ST
MANNIX ST

data2 选择数据(约200行)

ACRES
ADERSON RD
AIRPORT RD
ALGONQUIN

好的,我不确定我是否已经完全理解您的问题,但是通过修改您的 reprex,我得出了以下解决方案。

import usaddress
from fuzzywuzzy import process

# Address to clean and the list of known-good street names.
data1 = "3176 DETRIT ROAD"
choices = ["DETROIT RD"]

try:
    # usaddress.tag() returns (components, address_type); only the tagged
    # components are needed here.
    tagged, _address_type = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
    # The address could not be tagged. Continue with no components so the raw
    # address is printed unchanged instead of failing on a str lookup later.
    tagged = {}

parts = [
    tagged.get("StreetNamePreDirectional"),
    tagged.get("StreetName"),
    tagged.get("StreetNamePostType"),
]

# Missing components are None; leave them out of the rebuilt street.
street = " ".join(part for part in parts if part)

if not street:
    # Nothing to match against the list: keep the input as it was given.
    street = data1
elif street not in choices:
    # extractOne returns (choice, score) for the best match, or None when
    # there are no choices at all.
    best = process.extractOne(street, choices)
    if best is not None:
        # AddressNumber may be absent; skip it rather than joining None.
        street = " ".join(
            part for part in (tagged.get("AddressNumber"), best[0]) if part
        )

print(street)

输出为:

3176 DETROIT RD

所以基本上,它使用 process.extract() 函数,用候选列表中最接近的匹配项替换了地址中的街道名称部分。


这里是一个可调用函数:

import usaddress
from fuzzywuzzy import process


def best_street(addr: str, choices: list[str]) -> str:
    """Return *addr* with its street replaced by the closest entry in *choices*.

    The address is tagged with ``usaddress``; the pre-directional, street name
    and post-type components are joined into a street string, which is then
    fuzzy-matched against *choices* with ``fuzzywuzzy``. The result is the
    address number (when present) followed by the best-matching choice.

    Args:
        addr: Free-form street address, e.g. ``"123 Detrt"``.
        choices: Known-good street names to match against.

    Returns:
        The corrected address. *addr* is returned unchanged when it cannot be
        tagged or contains no street components; the parsed street is kept
        as-is when *choices* is empty.
    """
    try:
        # tag() returns (components, address_type); only components are used.
        tagged, _address_type = usaddress.tag(addr)
    except usaddress.RepeatedLabelError:
        # No usable components: hand the input back instead of failing on an
        # unbound variable further down.
        return addr

    street_parts = [
        tagged.get("StreetNamePreDirectional"),
        tagged.get("StreetName"),
        tagged.get("StreetNamePostType"),
    ]
    street = " ".join(part for part in street_parts if part)
    if not street:
        # Matching an empty query would just pick an arbitrary choice.
        return addr

    # extractOne yields (choice, score), or None when choices is empty.
    best = process.extractOne(street, choices)
    if best is not None:
        street = best[0]

    # AddressNumber may be missing (None); skip it rather than joining None.
    return " ".join(
        part for part in (tagged.get("AddressNumber"), street) if part
    )


if __name__ == "__main__":
    # Demo: correct two misspelled addresses against a short street list.
    choices = ["AIRPORT RD", "DETROIT RD"]
    for sample in ("123 Detrt", "9876 AIRPUMP DR"):
        print(best_street(sample, choices))

输出:

123 DETROIT RD
9876 AIRPORT RD

问题在于 fuzzywuzzy 需要传入一个由字符串组成的可迭代对象(例如列表);如果直接传入单个字符串,它会把字符串拆成单个字符逐一匹配。所以我必须加上 data2 = data2.split(','),才能返回完整的字符串。

我的另一个问题是误以为需要用 zip() 来比较两个文件;而 zip() 是按位置一一配对(并行遍历),并不是做交叉组合(笛卡尔积)。

这是我最终写出的可以运行的版本。代码本身能正常工作,但 fuzzywuzzy 的匹配不够准确,无法用这个工具清理我地址数据中的拼写错误。

如果你有办法帮我简化这些 if/else 语句,我很乐意听取建议。

import os
import csv
import shutil
import usaddress
import pandas as pd
from fuzzywuzzy import process

# Zero-based index of the raw well-address field in each source row.
ADDRESS_COLUMN = 18
# Placeholder stored for rows whose address is blank or cannot be cleaned.
NO_ADDRESS = "Null"


def _clean_address(raw, known_streets):
    """Return the cleaned form of one raw address, or ``NO_ADDRESS``.

    The address is upper-cased and tagged with ``usaddress``. Its street
    (pre-directional + name + post-type) is kept when it is already in
    *known_streets*, otherwise it is replaced by the closest fuzzy match.
    The house number, when present, is put back in front of the street.

    Args:
        raw: Raw address text taken from the source row.
        known_streets: List of correct street names (no trailing newlines).

    Returns:
        ``"<number> <street>"``, or ``NO_ADDRESS`` when the address is blank,
        cannot be tagged, or contains no street components.
    """
    raw = raw.strip().upper()
    if not raw:
        return NO_ADDRESS
    try:
        # tag() returns (components, address_type); only components are used.
        tagged, _address_type = usaddress.tag(raw)
    except usaddress.RepeatedLabelError:
        # Untaggable address: emit the placeholder so the output keeps exactly
        # one value per source row.
        return NO_ADDRESS

    street = " ".join(
        tagged[key]
        for key in ("StreetNamePreDirectional", "StreetName", "StreetNamePostType")
        if tagged.get(key)
    )
    if not street:
        return NO_ADDRESS

    if street not in known_streets:
        # extractOne yields (choice, score), or None when there are no choices.
        best = process.extractOne(street, known_streets)
        if best is not None:
            street = best[0]

    number = tagged.get("AddressNumber", "")
    return f"{number} {street}".strip()


# Strip each reference name so exact-match tests are not defeated by the
# trailing newline that every line read from the file carries.
with open(r"TEST_Unique_List.csv") as ul:
    correct = [name.strip() for name in ul if name.strip()]

# newline="" is what the csv module expects for files handed to csv.reader.
with open(r"TEST_Cass_Howard.csv", newline="") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the header row
    cleaned = [_clean_address(row[ADDRESS_COLUMN], correct) for row in csv_reader]

# Insert the cleaned addresses right after the raw address column. The values
# are held in memory, so no scratch files have to be written, re-read and
# deleted.
df = pd.read_csv(r"TEST_Cass_Howard.csv")
df.insert(loc=ADDRESS_COLUMN + 1, column='WELL_ADDR_CLN', value=cleaned)
df.to_csv("Cass_Howard_Clean.csv", index=False)