我如何清理地址以便 Nominatim 可以正确定位它们?

How do I clean addresses so Nominatim can geolocate them properly?

我正在尝试使用 Nominatim 对我从网上抓取的一批地址进行地理编码。Nominatim 能很好地处理“标准”地址,例如 123 StreetName St., ExampleSuburb;但我抓取的一些地址带有“非标准”元素,例如 仓库 3, 123 StreetName., ExampleSuburb。

有什么方法可以去掉“非标准”元素,让 Nominatim 更容易找到它们吗?或者有什么方法可以让 Nominatim 尝试在非标准元素的情况下对地址进行地理定位?

例如,下面的代码在执行时会抛出 TypeError。我不知道该如何重新格式化地址来避免这个错误,因为这些地址是直接从网站上抓取的,我没有做过任何处理。

from bs4 import BeautifulSoup
import requests
from requests import get
import sqlite3
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

def scrapecafes(city, area):
    """Scrape cafe names and addresses from a Broadsheet guide page and geocode them.

    Builds the guide URL from ``city`` and ``area``, extracts the text of the
    ``h2.venue-title`` and ``.address-content`` elements, geocodes each address
    with Nominatim, and prints the result of zipping names, addresses,
    latitudes and longitudes together. Nothing is returned.

    NOTE(review): this is the failing version of the script; the inline
    comments below point out the defects that are visible in the code.
    """

    #url = 'https://www.broadsheet.com.au/melbourne/guides/best-cafes-thornbury' #go to the website
    url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
    response = requests.get(url, timeout=5)

    soup_cafe_names = BeautifulSoup(response.content, "html.parser")
    # The result of type() is discarded, so this line has no effect.
    type(soup_cafe_names)

    cafeNames = soup_cafe_names.findAll('h2', attrs={"class":"venue-title", }) #scrape the elements
    cafeNamesClean = [cafe.text.strip() for cafe in cafeNames] #clean the elements
    #cafeNameTuple = [(cafe,) for cafe in cafeNamesClean]

    #print(cafeNamesClean)

    #addresses
    # The same response body is parsed a second time here.
    soup_cafe_addresses = BeautifulSoup(response.content, "html.parser")
    # The result of type() is discarded, so this line has no effect.
    type(soup_cafe_addresses)

    cafeAddresses = soup_cafe_addresses.findAll( attrs={"class":"address-content" })
    # Unlike the names above, the address text is not stripped of whitespace.
    cafeAddressesClean = [address.text for address in cafeAddresses]
    #cafeAddressesTuple = [(address,) for address in cafeAddressesClean]

    #print(cafeAddressesClean)


    ##geocode addresses
    locator = Nominatim(user_agent="myGeocoder")
    # The rate-limited wrapper is created here but never called below;
    # the loop uses locator.geocode directly.
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

    try:
        for item in cafeAddressesClean:
            location = (locator.geocode(item))
            # lat and long are rebound on every iteration, so only the values
            # from the last processed address survive the loop. Each
            # comprehension iterates over `location` itself and repeats the
            # same coordinate once per element.
            # NOTE(review): presumably geocode returns None when an address
            # cannot be matched; iterating over None raises TypeError - confirm
            # against the geopy documentation.
            lat = [location.latitude for item in location]
            long = [location.longitude for item in location]
            print(location)

    # A bare except swallows every exception, so the first failure silently
    # ends the loop and the remaining addresses are never geocoded.
    except:
        pass

    #zip up for table
    # lat and long are only assigned inside the try block above; if the first
    # iteration fails they are unbound when this line runs.
    fortable = zip(cafeNamesClean, cafeAddressesClean, lat, long)
    # zip() returns a lazy iterator, so this prints the zip object itself
    # rather than the rows.
    print(fortable)

# `melbourne` and `fitzroy` are bare names that are not defined anywhere in
# this snippet (the arguments are not quoted as strings).
scrapecafes(melbourne, fitzroy)

你的脚本有 2 个问题。

  1. 您正在循环 cafeAddressesClean 但您没有将输出存储在任何地方。
  2. 对这些列表执行 zip 之后,您没有把结果转换为列表。

下面的代码会把结果插入到 SQLite 数据库中,总共插入了 10 条记录。

from bs4 import BeautifulSoup
import requests
from requests import get
import sqlite3
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

#cafeNamesthornbury
def _geocode_addresses(addresses, geocode):
    """Return one ``(latitude, longitude)`` tuple, or ``None``, per address.

    ``geocode`` is the (rate-limited) callable used to look an address up.
    Addresses that cannot be resolved are reported and mapped to ``None`` so
    the caller can decide what to do with them instead of crashing.
    """
    coordinates = []
    for address in addresses:
        # The guide is published on broadsheet.com.au, so restrict the search
        # to Australia; otherwise a query such as "High Street, Thornbury" can
        # resolve to a place of the same name in another country.
        location = geocode(address.strip(), country_codes="au")
        if location is None:
            # No match (e.g. the address carries extra, non-standard elements).
            print("Could not geocode address:", address)
            coordinates.append(None)
        else:
            coordinates.append((location.latitude, location.longitude))
    return coordinates


def _store_rows(rows, db_path='25july_database.db'):
    """Create the ``scraper`` table if needed and insert ``rows`` into it.

    ``rows`` is a list of ``(name, address, latitude, longitude)`` tuples.
    SQLite errors are reported on stdout rather than raised, and the
    connection is always closed once it has been opened.
    """
    try:
        connection = sqlite3.connect(db_path)
    except sqlite3.Error as error:
        print("Error while connecting to sqlite", error)
        # Nothing was opened, so there is nothing to create, insert or close.
        return

    try:
        cursor = connection.cursor()
        print("Database created and Successfully Connected to 25july_database")

        try:
            cursor.execute("select sqlite_version();")
            print("SQLite Database Version is: ", cursor.fetchall())

            print("Successfully Connected to SQLite")
            cursor.execute(''' CREATE TABLE IF NOT EXISTS scraper (
                                        name TEXT NOT NULL,
                                        address TEXT NOT NULL,
                                        latitude FLOAT NOT NULL,
                                        longitude FLOAT NOT NULL
                                        );''')
            connection.commit()
            print("SQLite table created")
        except sqlite3.Error as error:
            print("Error while creating a sqlite table", error)
            return

        try:
            # Insert every row in one statement and commit once, so the
            # reported total reflects the whole batch rather than one row.
            cursor.executemany("""INSERT INTO scraper VALUES (?,?,?,?);""", rows)
            connection.commit()
            print("Total", len(rows), "Records inserted successfully into table")
            cursor.close()
        except sqlite3.Error as error:
            print("Failed to insert data into sqlite table", error)
    finally:
        connection.close()
        print("The SQLite connection is closed")


def scrapecafes(city, area):
    """Scrape a Broadsheet "best cafes" guide, geocode the cafes and store them.

    Args:
        city: City segment of the guide URL, e.g. ``'melbourne'``.
        area: Area segment of the guide URL, e.g. ``'thornbury'``.

    The cafe names (``h2.venue-title``) and addresses (``.address-content``)
    are read from the guide page, each address is geocoded through a
    rate-limited Nominatim client, and every cafe that could be located is
    inserted into the ``scraper`` table of ``25july_database.db``. Cafes whose
    address cannot be geocoded are reported and skipped. Returns ``None``.
    """
    url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
    response = requests.get(url, timeout=5)

    # Names and addresses live in the same document, so parse it only once.
    soup = BeautifulSoup(response.content, "html.parser")

    cafe_names = [tag.text.strip()
                  for tag in soup.find_all('h2', attrs={"class": "venue-title"})]
    print(cafe_names)

    # The address text is kept exactly as scraped; it is only stripped for the
    # geocoding query itself.
    cafe_addresses = [tag.text
                      for tag in soup.find_all(attrs={"class": "address-content"})]
    print(cafe_addresses)

    locator = Nominatim(user_agent="myGeocoder")
    # Go through the rate limiter for every lookup so requests are spaced at
    # least one second apart.
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
    coordinates = _geocode_addresses(cafe_addresses, geocode)

    # The table declares latitude/longitude NOT NULL, so only cafes that were
    # actually located can be stored.
    rows = [
        (name, address, coords[0], coords[1])
        for name, address, coords in zip(cafe_names, cafe_addresses, coordinates)
        if coords is not None
    ]

    _store_rows(rows)

scrapecafes('melbourne', 'thornbury')

运行脚本之后:

Prior| 637 High Street, Thornbury|-37.76159772|144.99994556
Rat the Cafe| 72 Wales Street, Thornbury|-37.7618172|145.0091904
Ampersand Coffee and Food| 863 High Street, Thornbury|-37.754689125|145.0010879
Umberto Espresso Bar| 822 High Street, Thornbury|-37.7532839|145.0016297
Brother Alec| 719 High Street, Thornbury|-37.7590570333333|145.0003715
Short Round| 731 High Street, Thornbury|-37.758653675|145.000430475
Jerry Joy| 128  Mansfield Street, Thornbury|-37.7573008|145.0096578
The Old Milk Bar| 144 Dundas Street, Thornbury|-37.7544244|145.020563
Little Henri| 848  High Street, Thornbury|51.6087678|-2.5260139
Northern Soul| 843 High Street, Thornbury|-37.7552406555556|145.000992355556