How do I clean addresses so Nominatim can geolocate them properly?
I am trying to use Nominatim to geolocate a set of addresses I scraped from the web. Nominatim works fine on "standard" addresses, e.g. 123 StreetName St., ExampleSuburb, but some of the addresses I scraped contain "non-standard" elements, e.g. Warehouse 3, 123 StreetName St., ExampleSuburb.
Is there any way to strip out the "non-standard" elements so Nominatim has an easier time finding the addresses? Or is there a way to make Nominatim attempt to geolocate an address despite the non-standard elements?
For example, the code below throws a TypeError when it runs, and I don't know how to reformat the addresses to stop that from happening, because they come straight off the website without any intervention from me.
from bs4 import BeautifulSoup
import requests
from requests import get
import sqlite3
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

def scrapecafes(city, area):
    #url = 'https://www.broadsheet.com.au/melbourne/guides/best-cafes-thornbury' #go to the website
    url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
    response = requests.get(url, timeout=5)
    soup_cafe_names = BeautifulSoup(response.content, "html.parser")
    type(soup_cafe_names)
    cafeNames = soup_cafe_names.findAll('h2', attrs={"class":"venue-title", }) #scrape the elements
    cafeNamesClean = [cafe.text.strip() for cafe in cafeNames] #clean the elements
    #cafeNameTuple = [(cafe,) for cafe in cafeNamesClean]
    #print(cafeNamesClean)

    #addresses
    soup_cafe_addresses = BeautifulSoup(response.content, "html.parser")
    type(soup_cafe_addresses)
    cafeAddresses = soup_cafe_addresses.findAll( attrs={"class":"address-content" })
    cafeAddressesClean = [address.text for address in cafeAddresses]
    #cafeAddressesTuple = [(address,) for address in cafeAddressesClean]
    #print(cafeAddressesClean)

    ##geocode addresses
    locator = Nominatim(user_agent="myGeocoder")
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
    try:
        for item in cafeAddressesClean:
            location = (locator.geocode(item))
            lat = [location.latitude for item in location]
            long = [location.longitude for item in location]
            print(location)
    except:
        pass

    #zip up for table
    fortable = zip(cafeNamesClean, cafeAddressesClean, lat, long)
    print(fortable)

scrapecafes(melbourne, fitzroy)
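To narrow the error down: I suspect it happens because Nominatim returns None for addresses it cannot match, and iterating over None raises the TypeError. A minimal reproduction (the warehouse address is made up):

from geopy.geocoders import Nominatim

locator = Nominatim(user_agent="myGeocoder")

# Nominatim finds no match for the "non-standard" address, so geocode returns None
location = locator.geocode("Warehouse 3, 123 StreetName St., ExampleSuburb")
print(location)  # None

# This mirrors the inner loop above
for item in location:  # TypeError: 'NoneType' object is not iterable
    pass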
Your script has two problems:
- You are looping over cafeAddressesClean, but you are not storing the geocoding output anywhere.
- After zipping the lists, you are not converting the resulting zip object into a list.
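The second point is why print(fortable) shows an opaque object instead of the rows: a zip object is a one-shot iterator, so it has to be materialised with list() before it can be printed or reused. A quick illustration:

fortable = zip(["Prior"], ["637 High Street, Thornbury"])
print(fortable)        # <zip object at 0x...>, not the rows
print(list(fortable))  # [('Prior', '637 High Street, Thornbury')]
print(list(fortable))  # [], the iterator is already exhausted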
The code below inserts the values into a SQLite database. In total, 10 records are inserted.
from bs4 import BeautifulSoup
import requests
from requests import get
import sqlite3
import geopandas
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

#cafeNamesthornbury
def scrapecafes(city, area):
    #url = 'https://www.broadsheet.com.au/melbourne/guides/best-cafes-thornbury' #go to the website
    url = f"https://www.broadsheet.com.au/{city}/guides/best-cafes-{area}"
    response = requests.get(url, timeout=5)
    soup_cafe_names = BeautifulSoup(response.content, "html.parser")
    cafeNames = soup_cafe_names.findAll('h2', attrs={"class":"venue-title"}) #scrape the venue names
    cafeNamesClean = [cafe.text.strip() for cafe in cafeNames] #clean the elements
    cafeNameTuple = [(cafe,) for cafe in cafeNamesClean]
    print(cafeNamesClean)

    #addresses
    soup_cafe_addresses = BeautifulSoup(response.content, "html.parser")
    cafeAddresses = soup_cafe_addresses.findAll(attrs={"class":"address-content"})
    cafeAddressesClean = [address.text for address in cafeAddresses]
    cafeAddressesTuple = [(address,) for address in cafeAddressesClean]
    print(cafeAddressesClean)

    ##geocode addresses
    locator = Nominatim(user_agent="myGeocoder")
    #use the rate-limited wrapper so Nominatim is not hammered with requests
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
    location = []
    for item in cafeAddressesClean:
        location.append(geocode(item)) #store every result instead of overwriting it
    lat = [loc.latitude for loc in location]
    long = [loc.longitude for loc in location]

    #zip up for table and materialise the zip object as a list
    fortable = list(zip(cafeNamesClean, cafeAddressesClean, lat, long))

    ##connect to database
    try:
        sqliteConnection = sqlite3.connect('25july_database.db')
        cursor = sqliteConnection.cursor()
        print("Database created and successfully connected to 25july_database")
        sqlite_select_Query = "select sqlite_version();"
        cursor.execute(sqlite_select_Query)
        record = cursor.fetchall()
        print("SQLite Database Version is: ", record)
        cursor.close()
    except sqlite3.Error as error:
        print("Error while connecting to sqlite", error)

    #create table
    try:
        sqlite_create_table_query = '''CREATE TABLE IF NOT EXISTS scraper (
            name TEXT NOT NULL,
            address TEXT NOT NULL,
            latitude FLOAT NOT NULL,
            longitude FLOAT NOT NULL
        );'''
        cursor = sqliteConnection.cursor()
        cursor.execute(sqlite_create_table_query)
        sqliteConnection.commit()
        print("SQLite table created")
    except sqlite3.Error as error:
        print("Error while creating a sqlite table", error)

    ##enter data into table
    try:
        sqlite_insert_name_param = """INSERT INTO scraper VALUES (?,?,?,?);"""
        for row in fortable:
            cursor.execute(sqlite_insert_name_param, row)
        sqliteConnection.commit()
        print("Total", cursor.rowcount, "records inserted successfully into table")
        cursor.close()
    except sqlite3.Error as error:
        print("Failed to insert data into sqlite table", error)
    finally:
        if sqliteConnection:
            sqliteConnection.close()
            print("The SQLite connection is closed")

scrapecafes('melbourne', 'thornbury')
After running the script:
Prior| 637 High Street, Thornbury|-37.76159772|144.99994556
Rat the Cafe| 72 Wales Street, Thornbury|-37.7618172|145.0091904
Ampersand Coffee and Food| 863 High Street, Thornbury|-37.754689125|145.0010879
Umberto Espresso Bar| 822 High Street, Thornbury|-37.7532839|145.0016297
Brother Alec| 719 High Street, Thornbury|-37.7590570333333|145.0003715
Short Round| 731 High Street, Thornbury|-37.758653675|145.000430475
Jerry Joy| 128 Mansfield Street, Thornbury|-37.7573008|145.0096578
The Old Milk Bar| 144 Dundas Street, Thornbury|-37.7544244|145.020563
Little Henri| 848 High Street, Thornbury|51.6087678|-2.5260139
Northern Soul| 843 High Street, Thornbury|-37.7552406555556|145.000992355556
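One caveat: all of the Thornbury addresses happened to match, but if Nominatim returns None for one of them (as it typically will for a "non-standard" address like Warehouse 3, 123 StreetName St., ExampleSuburb from the question), the loc.latitude lookup will raise an AttributeError. To handle that, and to strip the non-standard elements the question asks about, you can retry with the leading comma-separated component dropped until Nominatim finds a match, and skip anything that still fails. A sketch of that idea (geocode_with_fallback is a made-up helper, not part of geopy):

def geocode_with_fallback(geocode, address):
    #try the full address; if Nominatim finds nothing, drop the leading
    #component (e.g. "Warehouse 3,") and retry with the remainder
    parts = [p.strip() for p in address.split(",")]
    while parts:
        location = geocode(", ".join(parts))
        if location is not None:
            return location
        parts = parts[1:]
    return None

#replace the plain geocoding loop with the fallback version,
#keeping only the entries Nominatim could resolve
rows = []
for name, address in zip(cafeNamesClean, cafeAddressesClean):
    loc = geocode_with_fallback(geocode, address)
    if loc is None:
        print("Skipping unmatched address:", address)
        continue
    rows.append((name, address, loc.latitude, loc.longitude))
fortable = rows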