矢量化:对两个包含非标准数据的超大数据框(DataFrame)进行迭代
Vectorization: iterating over two very large dataframes with non-standard data
我在 Python 中有两个数据框(DataFrame):一个包含约 15 万条呼叫(calls)记录,每条记录都有一个地理位置;另一个包含约 5 万条街道(streets)记录,每条记录都有一条地理路径。给定每个呼叫的位置,我想把离它最近的街道的头节点和尾节点 ID 添加到呼叫数据框中。
我已经遍历了呼叫数据,把数值型的 lat/long 列转换并构建成了一个 Shapely Point 列。同样,我也把字符串形式的路径数据列转换成了 Shapely LineString。这就是下面的算法一和算法二——它们不太可能是最高效的实现,欢迎提出意见。
# Algorithm One: given two columns of latitude & longitude, create a new Point
def call_iter():
    """Build one Shapely ``Point`` per row of the module-level ``calls`` frame.

    Reads the ``'Incident Latitude'`` and ``'Incident Longitude'`` columns and
    passes them to ``Point`` in that order (latitude first, longitude second).

    Returns:
        list: the points, in row order. The caller appends them to the
        ``calls`` dataframe as a new column.
    """
    # Walk the two columns in lockstep instead of calling iterrows(), which
    # materialises a full Series object for every row.
    return [
        Point(lat, lon)
        for lat, lon in zip(calls['Incident Latitude'],
                            calls['Incident Longitude'])
    ]
# Algorithm Two: given a string column containing coordinate data, construct a LineString
def street_iter():
    """Parse each entry of ``streets.geometry`` into a Shapely ``LineString``.

    Every entry is split on whitespace. Each token has leading and trailing
    ``(``, ``,`` and ``)`` characters stripped and is converted to ``float``;
    tokens that are not numeric are skipped. The remaining numbers are paired
    up in order, two per vertex.

    Returns:
        list: one ``LineString`` per street, in row order. The caller appends
        them to the street dataframe.

    Raises:
        ValueError: if an entry yields an odd number of coordinates, so the
            numbers cannot be paired into vertices.
    """
    paths = []
    for geo in streets.geometry:
        numbers = []
        for token in geo.split():
            try:
                numbers.append(float(token.strip('(,)')))
            except ValueError:
                # Deliberate best-effort: non-numeric tokens are ignored
                # (presumably a leading geometry-type keyword - TODO confirm).
                continue
        if len(numbers) % 2:
            raise ValueError(
                f"odd number of coordinates ({len(numbers)}) in geometry: {geo!r}"
            )
        # LineString accepts coordinate pairs directly, so there is no need
        # to wrap each vertex in an intermediate Point object.
        paths.append(LineString(list(zip(numbers[::2], numbers[1::2]))))
    return paths
然而,我主要关心的是第一段中概述的问题:给定 Shapely 方法 line.distance(point) 和新创建的 Shapely 对象,我怎样才能高效地找到离每个呼叫最近的街道?
下面是我花了一天时间做出的尝试。它确实可以工作,但处理每个呼叫需要 1-2 秒,而这只是我想使用的几个数据集中的第一个。
# Algorithm Three: find the closest street (head 'u' and tail 'v' nodes) to each call
def build_matrix():
    """Find, for every call, the street whose geometry is closest to it.

    Reads the module-level ``calls`` frame (call geometry at column position
    4) and the module-level ``streets`` frame (street geometry at column
    position 5, node IDs in columns ``'u'`` and ``'v'``). Distances come from
    the street geometry's ``distance()`` method. When several streets are at
    exactly the same minimum distance, one is picked with ``secrets.choice``.

    Each call's index label is printed as it is processed.

    Returns:
        tuple: ``(heads, tails)`` - two lists parallel to the rows of
        ``calls`` holding the ``'u'`` and ``'v'`` values of the chosen street.
        Both hold ``-1`` for a call where no street compared as closest
        (for example when ``streets`` is empty).
    """
    # Extract the street columns once; re-running streets.iterrows() for
    # every call rebuilds a Series per street per call.
    street_lines = list(streets.iloc[:, 5])
    street_heads = list(streets['u'])
    street_tails = list(streets['v'])

    heads = []
    tails = []
    # NOTE(review): this is still one distance() evaluation per (call, street)
    # pair; a spatial index would be the next step if this remains too slow.
    for i_c, point in zip(calls.index, calls.iloc[:, 4]):
        print(i_c)
        dist_min = float('inf')
        nearest = []  # positions of all streets tied at dist_min
        for pos, line in enumerate(street_lines):
            dist = line.distance(point)
            if dist < dist_min:
                dist_min = dist
                nearest = [pos]
            elif dist == dist_min:
                # elif (not if): a new minimum must not also be counted as a
                # tie with itself, which would skew the random selection.
                nearest.append(pos)

        if not nearest:
            heads.append(-1)
            tails.append(-1)
            continue

        # Randomly select an arc only when there is a genuine tie.
        pick = nearest[0] if len(nearest) == 1 else secrets.choice(nearest)
        heads.append(street_heads[pick])
        tails.append(street_tails[pick])
    return (heads, tails)
我花了几个小时研究矢量化,但找不到任何类似问题的例子。非常感谢您的帮助。
好的,解决了。参考 Tenkanen、Heikinheimo 和 Aagesen 的工作,以及此处发布的解决方案,我找到了办法。
该方法最终采用 BallTree 搜索,为每个事件找到质心(centroid)离它最近的街道(如果有人知道如何修改它以直接处理 LineString,请不吝指点)。实现如下。
def get_nearest(src_points, candidates, k_neighbors=1):
    """Look up the nearest candidate for every source point.

    A ``BallTree`` with the haversine metric is built over ``candidates`` and
    queried with ``src_points`` for ``k_neighbors`` neighbours each. Only the
    first (closest) neighbour of each source point is returned.

    Returns:
        tuple: ``(closest, closest_dist)`` - for each source point, the index
        into ``candidates`` of its nearest neighbour and the distance to it
        as reported by the tree.
    """
    ball_tree = BallTree(candidates, leaf_size=15, metric='haversine')
    neighbor_dist, neighbor_idx = ball_tree.query(src_points, k=k_neighbors)
    # One row per source point, one column per neighbour rank: column 0 is
    # the closest neighbour (column 1 would be the second closest, etc.).
    return (neighbor_idx[:, 0], neighbor_dist[:, 0])
def nearest_neighbor(left_gdf, right_gdf, return_vals=False):
    """Match every row of ``left_gdf`` to its nearest row in ``right_gdf``.

    The ``.x`` and ``.y`` attributes of each geometry in both frames are
    converted from degrees to radians and handed to ``get_nearest``.

    Args:
        left_gdf: GeoDataFrame of query geometries; each must expose ``.x``
            and ``.y``.
        right_gdf: GeoDataFrame of candidate geometries (same requirement).
            Must have ``'u'`` and ``'v'`` columns when ``return_vals`` is true.
        return_vals: when true, add the matched candidate's ``'u'`` and
            ``'v'`` values as columns of the result.

    Returns:
        A copy of ``left_gdf`` with its index reset to 0..n-1, plus ``'u'``
        and ``'v'`` columns when ``return_vals`` is true. ``left_gdf`` itself
        is not modified.
    """
    left_geom_col = left_gdf.geometry.name
    right_geom_col = right_gdf.geometry.name

    # get_nearest returns positions, so the candidates need a sequential
    # 0..n-1 index (reset_index already returns a new frame).
    right = right_gdf.reset_index(drop=True)

    def _as_radians(geoms):
        # NOTE(review): coordinates are passed as (x, y); confirm that this
        # matches the axis order the haversine metric expects for this data.
        return np.array(
            geoms.apply(
                lambda geom: (geom.x * np.pi / 180, geom.y * np.pi / 180)
            ).to_list()
        )

    left_radians = _as_radians(left_gdf[left_geom_col])
    right_radians = _as_radians(right[right_geom_col])

    # closest[i] is the row position in `right` of the candidate nearest to
    # left row i. The distances are not needed here; they are whatever the
    # haversine tree reports for radian inputs and are not scaled to meters.
    closest, _ = get_nearest(src_points=left_radians, candidates=right_radians)

    # Work on a re-indexed copy so the new columns line up positionally.
    closest_points = left_gdf.reset_index(drop=True)

    # Add the head and tail node IDs of the closest street.
    if return_vals:
        closest_points['u'] = right.loc[closest, 'u'].reset_index(drop=True)
        closest_points['v'] = right.loc[closest, 'v'].reset_index(drop=True)
    return closest_points
我在 Python 中有两个数据框(DataFrame):一个包含约 15 万条呼叫(calls)记录,每条记录都有一个地理位置;另一个包含约 5 万条街道(streets)记录,每条记录都有一条地理路径。给定每个呼叫的位置,我想把离它最近的街道的头节点和尾节点 ID 添加到呼叫数据框中。
我已经遍历了呼叫数据,把数值型的 lat/long 列转换并构建成了一个 Shapely Point 列。同样,我也把字符串形式的路径数据列转换成了 Shapely LineString。这就是下面的算法一和算法二——它们不太可能是最高效的实现,欢迎提出意见。
# Algorithm One: given two columns of latitude & longitude, create a new Point
def call_iter():
    """Build one Shapely ``Point`` per row of the module-level ``calls`` frame.

    Reads the ``'Incident Latitude'`` and ``'Incident Longitude'`` columns and
    passes them to ``Point`` in that order (latitude first, longitude second).

    Returns:
        list: the points, in row order. The caller appends them to the
        ``calls`` dataframe as a new column.
    """
    # Walk the two columns in lockstep instead of calling iterrows(), which
    # materialises a full Series object for every row.
    return [
        Point(lat, lon)
        for lat, lon in zip(calls['Incident Latitude'],
                            calls['Incident Longitude'])
    ]
# Algorithm Two: given a string column containing coordinate data, construct a LineString
def street_iter():
    """Parse each entry of ``streets.geometry`` into a Shapely ``LineString``.

    Every entry is split on whitespace. Each token has leading and trailing
    ``(``, ``,`` and ``)`` characters stripped and is converted to ``float``;
    tokens that are not numeric are skipped. The remaining numbers are paired
    up in order, two per vertex.

    Returns:
        list: one ``LineString`` per street, in row order. The caller appends
        them to the street dataframe.

    Raises:
        ValueError: if an entry yields an odd number of coordinates, so the
            numbers cannot be paired into vertices.
    """
    paths = []
    for geo in streets.geometry:
        numbers = []
        for token in geo.split():
            try:
                numbers.append(float(token.strip('(,)')))
            except ValueError:
                # Deliberate best-effort: non-numeric tokens are ignored
                # (presumably a leading geometry-type keyword - TODO confirm).
                continue
        if len(numbers) % 2:
            raise ValueError(
                f"odd number of coordinates ({len(numbers)}) in geometry: {geo!r}"
            )
        # LineString accepts coordinate pairs directly, so there is no need
        # to wrap each vertex in an intermediate Point object.
        paths.append(LineString(list(zip(numbers[::2], numbers[1::2]))))
    return paths
然而,我主要关心的是第一段中概述的问题:给定 Shapely 方法 line.distance(point) 和新创建的 Shapely 对象,我怎样才能高效地找到离每个呼叫最近的街道?
下面是我花了一天时间做出的尝试。它确实可以工作,但处理每个呼叫需要 1-2 秒,而这只是我想使用的几个数据集中的第一个。
# Algorithm Three: find the closest street (head 'u' and tail 'v' nodes) to each call
def build_matrix():
    """Find, for every call, the street whose geometry is closest to it.

    Reads the module-level ``calls`` frame (call geometry at column position
    4) and the module-level ``streets`` frame (street geometry at column
    position 5, node IDs in columns ``'u'`` and ``'v'``). Distances come from
    the street geometry's ``distance()`` method. When several streets are at
    exactly the same minimum distance, one is picked with ``secrets.choice``.

    Each call's index label is printed as it is processed.

    Returns:
        tuple: ``(heads, tails)`` - two lists parallel to the rows of
        ``calls`` holding the ``'u'`` and ``'v'`` values of the chosen street.
        Both hold ``-1`` for a call where no street compared as closest
        (for example when ``streets`` is empty).
    """
    # Extract the street columns once; re-running streets.iterrows() for
    # every call rebuilds a Series per street per call.
    street_lines = list(streets.iloc[:, 5])
    street_heads = list(streets['u'])
    street_tails = list(streets['v'])

    heads = []
    tails = []
    # NOTE(review): this is still one distance() evaluation per (call, street)
    # pair; a spatial index would be the next step if this remains too slow.
    for i_c, point in zip(calls.index, calls.iloc[:, 4]):
        print(i_c)
        dist_min = float('inf')
        nearest = []  # positions of all streets tied at dist_min
        for pos, line in enumerate(street_lines):
            dist = line.distance(point)
            if dist < dist_min:
                dist_min = dist
                nearest = [pos]
            elif dist == dist_min:
                # elif (not if): a new minimum must not also be counted as a
                # tie with itself, which would skew the random selection.
                nearest.append(pos)

        if not nearest:
            heads.append(-1)
            tails.append(-1)
            continue

        # Randomly select an arc only when there is a genuine tie.
        pick = nearest[0] if len(nearest) == 1 else secrets.choice(nearest)
        heads.append(street_heads[pick])
        tails.append(street_tails[pick])
    return (heads, tails)
我花了几个小时研究矢量化,但找不到任何类似问题的例子。非常感谢您的帮助。
好的,解决了。参考 Tenkanen、Heikinheimo 和 Aagesen 的工作,以及此处发布的解决方案,我找到了办法。
该方法最终采用 BallTree 搜索,为每个事件找到质心(centroid)离它最近的街道(如果有人知道如何修改它以直接处理 LineString,请不吝指点)。实现如下。
def get_nearest(src_points, candidates, k_neighbors=1):
    """Look up the nearest candidate for every source point.

    A ``BallTree`` with the haversine metric is built over ``candidates`` and
    queried with ``src_points`` for ``k_neighbors`` neighbours each. Only the
    first (closest) neighbour of each source point is returned.

    Returns:
        tuple: ``(closest, closest_dist)`` - for each source point, the index
        into ``candidates`` of its nearest neighbour and the distance to it
        as reported by the tree.
    """
    ball_tree = BallTree(candidates, leaf_size=15, metric='haversine')
    neighbor_dist, neighbor_idx = ball_tree.query(src_points, k=k_neighbors)
    # One row per source point, one column per neighbour rank: column 0 is
    # the closest neighbour (column 1 would be the second closest, etc.).
    return (neighbor_idx[:, 0], neighbor_dist[:, 0])
def nearest_neighbor(left_gdf, right_gdf, return_vals=False):
    """Match every row of ``left_gdf`` to its nearest row in ``right_gdf``.

    The ``.x`` and ``.y`` attributes of each geometry in both frames are
    converted from degrees to radians and handed to ``get_nearest``.

    Args:
        left_gdf: GeoDataFrame of query geometries; each must expose ``.x``
            and ``.y``.
        right_gdf: GeoDataFrame of candidate geometries (same requirement).
            Must have ``'u'`` and ``'v'`` columns when ``return_vals`` is true.
        return_vals: when true, add the matched candidate's ``'u'`` and
            ``'v'`` values as columns of the result.

    Returns:
        A copy of ``left_gdf`` with its index reset to 0..n-1, plus ``'u'``
        and ``'v'`` columns when ``return_vals`` is true. ``left_gdf`` itself
        is not modified.
    """
    left_geom_col = left_gdf.geometry.name
    right_geom_col = right_gdf.geometry.name

    # get_nearest returns positions, so the candidates need a sequential
    # 0..n-1 index (reset_index already returns a new frame).
    right = right_gdf.reset_index(drop=True)

    def _as_radians(geoms):
        # NOTE(review): coordinates are passed as (x, y); confirm that this
        # matches the axis order the haversine metric expects for this data.
        return np.array(
            geoms.apply(
                lambda geom: (geom.x * np.pi / 180, geom.y * np.pi / 180)
            ).to_list()
        )

    left_radians = _as_radians(left_gdf[left_geom_col])
    right_radians = _as_radians(right[right_geom_col])

    # closest[i] is the row position in `right` of the candidate nearest to
    # left row i. The distances are not needed here; they are whatever the
    # haversine tree reports for radian inputs and are not scaled to meters.
    closest, _ = get_nearest(src_points=left_radians, candidates=right_radians)

    # Work on a re-indexed copy so the new columns line up positionally.
    closest_points = left_gdf.reset_index(drop=True)

    # Add the head and tail node IDs of the closest street.
    if return_vals:
        closest_points['u'] = right.loc[closest, 'u'].reset_index(drop=True)
        closest_points['v'] = right.loc[closest, 'v'].reset_index(drop=True)
    return closest_points