Python calculate lots of distances quickly
I have an input of 36,742 points, which means that if I wanted to calculate the lower triangle of a distance matrix (using the vincenty approximation) I would need to generate 36,742 * 36,741 * 0.5 = 674,968,911 distances.
I want to keep the pair combinations that are within 50 km of each other. My current set-up is as follows:
shops = [[id, lat, lon], ...]

def lower_triangle_mat(points):
    # Yield every unique pair of points (the lower triangle of the distance matrix)
    for i in range(len(points) - 1):
        for j in range(i + 1, len(points)):
            yield [points[i], points[j]]

def return_stores_cutoff(points, cutoff_km=0):
    below_cut = []
    counter = 0
    for x in lower_triangle_mat(points):
        dist_km = vincenty(x[0][1:3], x[1][1:3]).km
        counter += 1
        if counter % 1000000 == 0:
            print("%d out of %d" % (counter, len(points) * (len(points) - 1) * 0.5))
        if dist_km <= cutoff_km:
            below_cut.append([x[0][0], x[1][0], dist_km])
    return below_cut

start = time.clock()
stores = return_stores_cutoff(points=shops, cutoff_km=50)
print(time.clock() - start)
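(As an aside, the same unique-pair generation is available from the standard library without the index bookkeeping; a minimal sketch:

from itertools import combinations

# combinations(shops, 2) yields each unordered pair (shop_i, shop_j), i < j, exactly once
pairs = combinations(shops, 2)

The timing cost is dominated by the vincenty calls either way.)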
This obviously takes hours. Some possibilities I have thought about:
- Use numpy to vectorise these calculations rather than looping (see the sketch after this list)
- Use some kind of hashing to get a quick rough cut-off (all stores within 100 km) and then only calculate accurate distances between those stores
- Instead of storing the points in a list, use something like a quad-tree; however, I think that only helps with ranking close points rather than actual distance -> so I guess some kind of geodatabase
- I could obviously try haversine, or project the points and use euclidean distances; however, I am interested in using the most accurate measure possible
- Make use of parallel processing (though I was having some difficulty working out how to slice the list so as to still get all the relevant pairs).
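A minimal sketch of the first option, using a broadcast haversine in numpy (using haversine rather than vincenty is an assumption here, and the helper name is illustrative):

import numpy as np

def haversine_matrix(lat, lon, radius_km=6371.0):
    # lat/lon are 1-D arrays in degrees; returns the full (n, n) distance matrix in km
    phi = np.radians(lat)[:, None]   # shape (n, 1) so the arithmetic broadcasts
    lam = np.radians(lon)[:, None]
    a = (np.sin((phi - phi.T) / 2) ** 2
         + np.cos(phi) * np.cos(phi.T) * np.sin((lam - lam.T) / 2) ** 2)
    return 2 * radius_km * np.arcsin(np.sqrt(a))

# dist = haversine_matrix(lat, lon)
# i, j = np.triu_indices(len(lat), k=1)   # unique pairs only
# keep = dist[i, j] <= 50
# NB: for all 36,742 points the (n, n) matrix needs roughly 10 GB,
# so the rows would have to be processed in chunks.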
Edit: I think a geo-hash is definitely needed here - an example:
import random
from geoindex import GeoGridIndex, GeoPoint

geo_index = GeoGridIndex()
for _ in range(10000):
    lat = random.random() * 180 - 90
    lng = random.random() * 360 - 180
    geo_index.add_point(GeoPoint(lat, lng))
center_point = GeoPoint(37.7772448, -122.3955118)
for distance, point in geo_index.get_nearest_points(center_point, 10, 'km'):
    print("We found {0} in {1} km".format(point, distance))
However, I would also like to vectorise (rather than loop over) the distance calculations for the shops returned by the geo-hash.

Edit 2: Pouria Hadjibagheri - I tried using lambda and map:
# [B]: Mapping approach
lwr_tr_mat = ((shops[i], shops[j]) for i in range(len(shops) - 1) for j in range(i + 1, len(shops)))

# slice off the id so vincenty sees (lat, lon)
func = lambda x: (x[0][0], x[1][0], vincenty(x[0][1:3], x[1][1:3]).km)
# Trying to see if conditional statements slow this down
func_cond = lambda x: (x[0][0], x[1][0], vincenty(x[0][1:3], x[1][1:3]).km) if vincenty(x[0][1:3], x[1][1:3]).km <= 50 else None

start = time.clock()
out_dist = list(map(func, lwr_tr_mat))
print(time.clock() - start)

# the generator is exhausted by the first pass, so re-create it
lwr_tr_mat = ((shops[i], shops[j]) for i in range(len(shops) - 1) for j in range(i + 1, len(shops)))
start = time.clock()
out_dist = list(map(func_cond, lwr_tr_mat))
print(time.clock() - start)
Both ran in about 61 seconds (I restricted the number of shops from 32,000 down to 2,000). Perhaps I am using map incorrectly?
Have you tried mapping entire arrays and functions instead of iterating over them? An example would be as follows:
from numpy.random import rand
my_array = rand(int(5e7), 1) # An array of 50,000,000 random numbers in double.
Now what is normally done is:
squared_list_iter = [value**2 for value in my_array]
This of course works, but it is far from optimal.
The alternative would be to map the array with a function. That is done as follows:
func = lambda x: x**2  # Here is what I want to do to my array.
squared_list_map = map(func, my_array)  # Here I am doing it!
Now, one might ask, how is this any different, or even better? Since now we have added a call to a function, too! Here is your answer:
For the former solution (via iteration):

1 loop: 1.11 minutes.

Compared to the latter solution (mapping):

500 loops, on average 560 ns.

Converting the map() to a list via list(map(...)) would increase the time by a factor of 10, to approximately 500 ms.
The choice is yours!
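One caveat when reproducing those numbers: map() in Python 3 is lazy, so timing the bare map() call measures only the creation of the iterator, not the squaring work itself. A minimal timeit sketch (the array size and names here are illustrative):

import timeit

setup = "import numpy as np; my_array = np.random.rand(int(1e5)); func = lambda x: x**2"

# list comprehension: does all the work up front
print(timeit.timeit("[v**2 for v in my_array]", setup=setup, number=10))
# bare map(): only builds a lazy iterator, hence nanosecond-scale timings
print(timeit.timeit("map(func, my_array)", setup=setup, number=10))
# list(map(...)): forces evaluation, comparable to the comprehension again
print(timeit.timeit("list(map(func, my_array))", setup=setup, number=10))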
"Use some kind of hashing to get a quick rough-cut off (all stores within 100km) and then only calculate accurate distances between those stores"
I think it might be better to call it gridding. So first make a dict, with a set of coords as the key, and put each shop in a bucket within 50 km of that point. Then, when you are calculating distances, you only look in the nearby buckets rather than iterating through every shop in the whole universe.
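A minimal sketch of that bucketing idea (the cell size and helper names are illustrative assumptions; longitude cells shrink towards the poles, so the cell width is only roughly 50 km):

from collections import defaultdict

CELL_DEG = 0.5  # ~55 km of latitude per cell

def cell(lat, lon):
    # snap a coordinate to its grid-cell key
    return (int(lat // CELL_DEG), int(lon // CELL_DEG))

buckets = defaultdict(list)
for shop_id, lat, lon in shops:
    buckets[cell(lat, lon)].append((shop_id, lat, lon))

def candidates(lat, lon):
    # shops in the cell containing (lat, lon) plus its 8 neighbours
    ci, cj = cell(lat, lon)
    for di in (-1, 0, 1):
        for dj in (-1, 0, 1):
            yield from buckets.get((ci + di, cj + dj), [])

Only the pairs produced by candidates() then need the exact vincenty call.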
This sounds like a classic use case for k-D trees.
If you first transform your points into Euclidean space then you can use the query_pairs method of scipy.spatial.cKDTree:
from scipy.spatial import cKDTree
tree = cKDTree(data)
# where data is (nshops, ndim) containing the Euclidean coordinates of each shop
# in units of km
pairs = tree.query_pairs(50, p=2) # 50km radius, L2 (Euclidean) norm
pairs will be a set of (i, j) tuples corresponding to the row indices of the pairs of shops that are ≤ 50 km from each other.
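The answer leaves the Euclidean transform open; one way to get such coordinates (an assumption, not part of the answer) is to place the points on a sphere in 3-D, where the chord distance understates the great-circle distance by well under 0.001% at 50 km, so the cut-off radius is effectively unchanged:

import numpy as np

R_EARTH_KM = 6371.0  # mean Earth radius; a spherical approximation

def to_cartesian(lat_deg, lon_deg):
    # convert arrays of lat/lon in degrees to (n, 3) Cartesian coordinates in km
    lat = np.radians(lat_deg)
    lon = np.radians(lon_deg)
    return R_EARTH_KM * np.column_stack((np.cos(lat) * np.cos(lon),
                                         np.cos(lat) * np.sin(lon),
                                         np.sin(lat)))

# data = to_cartesian(points[:, 1], points[:, 2])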
The output of tree.sparse_distance_matrix is a scipy.sparse.dok_matrix. Since the matrix will be symmetric and you're only interested in unique row/column pairs, you could use scipy.sparse.tril to zero out the upper triangle, giving you a scipy.sparse.coo_matrix. From there you can access the nonzero row and column indices and their corresponding distance values via the .row, .col and .data attributes:
from scipy import sparse
tree_dist = tree.sparse_distance_matrix(tree, max_distance=10000, p=2)
udist = sparse.tril(tree_dist, k=-1)  # keep the strictly lower triangle (zeros the diagonal and above)
ridx = udist.row # row indices
cidx = udist.col # column indices
dist = udist.data # distance values
Thanks everyone for their help. I think I have solved this by incorporating all of the suggestions.
I use numpy to import the geographic coordinates and then project them using "France Lambert - 93". This lets me fill scipy.spatial.cKDTree with the points and then calculate the sparse_distance_matrix by specifying a cut-off of 50 km (my projected points are in metres). I then extract the lower triangle to a CSV.
import numpy as np
import csv
import time
from pyproj import Proj, transform

#http://epsg.io/2154 (accuracy: 1.0m)
fr = '+proj=lcc +lat_1=49 +lat_2=44 +lat_0=46.5 +lon_0=3 \
+x_0=700000 +y_0=6600000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 \
+units=m +no_defs'
#http://epsg.io/27700-5339 (accuracy: 1.0m)
uk = '+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 \
+x_0=400000 +y_0=-100000 +ellps=airy \
+towgs84=446.448,-125.157,542.06,0.15,0.247,0.842,-20.489 +units=m +no_defs'

path_to_csv = '.../raw_in.csv'
out_csv = '.../out.csv'

def proj_arr(points):
    # Project lon/lat (EPSG:4326) into a planar co-ordinate system in metres
    inproj = Proj(init='epsg:4326')
    outproj = Proj(uk)
    # origin|destination|lon|lat
    func = lambda x: transform(inproj, outproj, x[2], x[1])
    return np.array(list(map(func, points)))

tstart = time.time()

# Import points as geographic coordinates
# ID|lat|lon
#Sample to try and replicate
#points = np.array([
#    [39007,46.585012,5.5857829],
#    [88086,48.192370,6.7296289],
#    [62627,50.309155,3.0218611],
#    [14020,49.133972,-0.15851507],
#    [1091, 42.981765,2.0104902]])
points = np.genfromtxt(path_to_csv,
                       delimiter=',',
                       skip_header=1)
print("Total points: %d" % len(points))
print("Triangular matrix contains: %d" % (len(points) * (len(points) - 1) * 0.5))

# Get projected co-ordinates
proj_pnts = proj_arr(points)

# Fill the k-d tree
from scipy.spatial import cKDTree
tree = cKDTree(proj_pnts)
cut_off_metres = 1600
tree_dist = tree.sparse_distance_matrix(tree,
                                        max_distance=cut_off_metres,
                                        p=2)

# Extract the lower triangle
from scipy import sparse
udist = sparse.tril(tree_dist, k=-1)  # keep the strictly lower triangle
print("Distances after k-d tree cut-off: %d " % len(udist.data))

# Export CSV
f = open(out_csv, 'w', newline='')
w = csv.writer(f, delimiter=",")
w.writerow(['id_a','lat_a','lon_a','id_b','lat_b','lon_b','metres'])
w.writerows(np.column_stack((points[udist.row],
                             points[udist.col],
                             udist.data)))
f.close()

"""
Get ID labels
"""
id_to_csv = '...id.csv'
id_labels = np.genfromtxt(id_to_csv,
                          delimiter=',',
                          skip_header=1,
                          dtype='U')

"""
Try vincenty on the un-projected co-ordinates
"""
from geopy.distance import vincenty
vout_csv = '.../out_vin.csv'
# lat/lon columns of each point in every surviving pair
test_vin = np.column_stack((points[udist.row].T[1:3].T,
                            points[udist.col].T[1:3].T))
func = lambda x: vincenty(x[0:2], x[2:4]).m
output = list(map(func, test_vin))

# Export CSV
f = open(vout_csv, 'w', newline='')
w = csv.writer(f, delimiter=",")
w.writerow(['id_a','id_a2', 'lat_a','lon_a',
            'id_b','id_b2', 'lat_b','lon_b',
            'proj_metres','vincenty_metres'])
w.writerows(np.column_stack((list(id_labels[udist.row]),
                             points[udist.row],
                             list(id_labels[udist.col]),
                             points[udist.col],
                             udist.data,
                             output)))
f.close()

print("Finished in %.0f seconds" % (time.time() - tstart))
This approach took 164 seconds to generate the 5,306,434 distances (compared with 9 seconds), and around another 90 seconds to save them to disk.
I then compared the difference between the vincenty distance and the hypotenuse distance (on the projected co-ordinates).
The mean difference in metres was 2.7, and the mean difference/metres was 0.0073% - which looks great.