如果到前一点的 haversine 距离小于某个值,则删除 GPS 点
Remove GPS points if haversine distance to previous point is less than a certain value
我有一个带有 GPS 坐标的 pandas 数据框
import numpy as np
import pandas as pd
# Sample GPS traces: 16 fixes for user 'A' followed by 12 fixes for user 'B'.
d1 = {
    'user': ['A'] * 16 + ['B'] * 12,
    'longitude': [
        -122.419576048851, -122.4196457862854, -122.41975843906403,
        -122.41981744766234, -122.41961896419524, -122.41846561431885,
        -122.41841197013854, -122.41860508918761, -122.41830468177795,
        -122.41655588150023, -122.416330575943, -122.41608381271362,
        -122.41587996482849, -122.41443157196045, -122.41400241851807,
        -122.4145495891571, -122.28513300418852, -122.28403329849243,
        -122.28397965431215, -122.28369534015657, -122.28364706039427,
        -122.28360414505003, -122.28335201740265, -122.28326618671417,
        -122.28309988975525, -122.2829818725586, -122.28216111660002,
        -122.28297650814056,
    ],
    'latitude': [
        37.77727010900716, 37.77759235026598, 37.778147789138536,
        37.778291948163755, 37.77833010785869, 37.77846154665706,
        37.77932225301237, 37.780250787054555, 37.78027198632572,
        37.78056029581, 37.78059421449895, 37.78061965350541,
        37.78064509250312, 37.780848604169755, 37.7822816496242,
        37.784647385762014, 37.81233951943745, 37.812068286068886,
        37.81228018722322, 37.81312354779044, 37.813237972853855,
        37.813365111605194, 37.814017753748836, 37.8141830323372,
        37.814161842795265, 37.81414489115734, 37.814009277913826,
        37.81183095605405,
    ],
}
# One row per GPS fix, columns in the order user / longitude / latitude.
df1 = pd.DataFrame(data=d1)
使用以下 haversine 函数,我能够计算 GPS 轨迹的连续点之间的距离(按用户分组)
# Define haversine function
def haversine(lat1, lon1, lat2, lon2, earth_radius=6371):
    """Return haversine (great-circle) distances in metres as a DataFrame.

    Args:
        lat1, lon1: Latitude/longitude of the first point(s), in degrees.
            Scalars, sequences, NumPy arrays or pandas Series are accepted.
        lat2, lon2: Latitude/longitude of the second point(s), in degrees.
            Must be broadcastable against ``lat1``/``lon1``.
        earth_radius: Sphere radius in kilometres (default 6371).

    Returns:
        A ``pandas.DataFrame`` with a fresh default index holding the
        distances in metres. One-dimensional inputs give a single column
        with one row per coordinate pair; scalar inputs give a 1x1 frame.
        NaN coordinates yield NaN distances.
    """
    # Convert each argument on its own so scalars and arrays can be mixed;
    # stacking them into one list only works for equally shaped inputs.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )
    a = (np.sin((lat2 - lat1) / 2.0) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2)
    # Rounding can push ``a`` marginally outside [0, 1] for (near-)antipodal
    # points, which would make arcsin return NaN. NaN inputs stay NaN.
    a = np.clip(a, 0.0, 1.0)
    km = earth_radius * 2 * np.arcsin(np.sqrt(a))
    m = km * 1000
    # A 0-d result (scalar inputs) cannot be passed to the DataFrame
    # constructor, so promote it to a 1-element array first.
    return pd.DataFrame(np.atleast_1d(m))
# Coordinates of the previous fix of the same user, aligned to df1's own
# index (NaN for each user's first fix). Shifting within the groupby keeps
# every result on its own row even when rows of different users are
# interleaved or not sorted by user; taking the positional `.values` of a
# groupby-apply result only lines up when the frame is already grouped.
prev = df1.groupby('user')[['latitude', 'longitude']].shift()
df1['distance'] = haversine(df1['latitude'],
                            df1['longitude'],
                            prev['latitude'],
                            prev['longitude']).values.ravel()
# A user's first fix has no predecessor; report its distance as 0.
df1['distance'] = df1['distance'].fillna(0)
user longitude latitude distance
0 A -122.419576 37.777270 0.000000
1 A -122.419646 37.777592 36.352012
2 A -122.419758 37.778148 62.550525
3 A -122.419817 37.778292 16.847806
4 A -122.419619 37.778330 17.952766
5 A -122.418466 37.778462 102.412611
6 A -122.418412 37.779322 95.822233
7 A -122.418605 37.780251 104.633961
8 A -122.418305 37.780272 26.506241
9 A -122.416556 37.780560 157.000401
10 A -122.416331 37.780594 20.156826
11 A -122.416084 37.780620 21.870313
12 A -122.415880 37.780645 18.136963
13 A -122.414432 37.780849 129.286601
14 A -122.414002 37.782282 163.749922
15 A -122.414550 37.784647 267.416687
16 B -122.285133 37.812340 0.000000
17 B -122.284033 37.812068 101.203952
18 B -122.283980 37.812280 24.028959
19 B -122.283695 37.813124 97.046376
20 B -122.283647 37.813238 13.411732
21 B -122.283604 37.813365 14.631208
22 B -122.283352 37.814018 75.875008
23 B -122.283266 37.814183 19.864639
24 B -122.283100 37.814162 14.797045
25 B -122.282982 37.814145 10.537113
26 B -122.282161 37.814009 73.658945
27 B -122.282977 37.811831 252.587420
现在我想编写一个函数:如果某个 GPS 点与其前一个点的距离小于 50 米,则删除该点(即两点中的后一个)。无论与前一个保留点之间的距离是多少,该函数都应始终保留轨迹的最后一个点(point/feature);第一个点也应始终保留。
有什么实现方法吗?
可以插入函数的解决方案如下:
您需要为每个用户保留第一个和最后一个点。这可以通过以下方式实现:
# Collect the first and the last row of every user; these endpoints are
# re-attached after the distance filter has been applied.
g = df.groupby('user')
first_rows = g.head(1)
last_rows = g.tail(1)
df2 = pd.concat([first_rows, last_rows])
也就是
user longitude latitude distance
0 A -122.419576 37.777270 0.000000
16 B -122.285133 37.812340 0.000000
15 A -122.414550 37.784647 267.416687
27 B -122.282977 37.811831 252.587420
然后,确定距离的差异,如果距离小于 50,则删除行并与每个组的第一行和最后一行连接,并按索引排序:
# Remove every row whose precomputed distance to its predecessor is below 50.
df = df.drop(df[df.distance < 50].index)
# Re-attach each user's first/last row and restore the original row order.
df_new = pd.concat([df, df2]).sort_index()
# Endpoints that survived the filter now appear twice. De-duplicate by index
# label rather than by row content, so that distinct points which happen to
# share identical values (e.g. a back-and-forth track) are not lost.
df_new = df_new[~df_new.index.duplicated(keep='first')]
给出
user longitude latitude distance
0 A -122.419576 37.777270 0.000000
2 A -122.419758 37.778148 62.550525
5 A -122.418466 37.778462 102.412611
6 A -122.418412 37.779322 95.822233
7 A -122.418605 37.780251 104.633961
9 A -122.416556 37.780560 157.000401
13 A -122.414432 37.780849 129.286601
14 A -122.414002 37.782282 163.749922
15 A -122.414550 37.784647 267.416687
16 B -122.285133 37.812340 0.000000
17 B -122.284033 37.812068 101.203952
19 B -122.283695 37.813124 97.046376
22 B -122.283352 37.814018 75.875008
26 B -122.282161 37.814009 73.658945
27 B -122.282977 37.811831 252.587420
这不是最优雅的函数,但确实可行:
def Drop_values(df, threshold=50):
    """Drop GPS points that lie closer than ``threshold`` to their predecessor.

    A row is removed when its precomputed ``distance`` value is below
    ``threshold``, except that the first and the last row of every ``user``
    group are always kept. Distances are not recomputed after rows are
    removed, so the ``distance`` column of the result still refers to each
    row's original predecessor.

    Args:
        df: DataFrame with at least the columns ``user`` and ``distance``.
            It is not modified.
        threshold: Smallest ``distance`` a non-endpoint row needs in order to
            be kept, in the same unit as the ``distance`` column (default 50).

    Returns:
        A new DataFrame with the kept rows, sorted by index.
    """
    g = df.groupby('user')
    # First/last row of every user, identified by index label.
    is_endpoint = (df.index.isin(g.head(1).index)
                   | df.index.isin(g.tail(1).index))
    # NaN distances compare as False here and are therefore kept.
    too_close = (df['distance'] < threshold).to_numpy()
    # Select with one boolean mask instead of concat + drop_duplicates():
    # content-based de-duplication would also delete distinct rows that
    # merely share identical values, including a user's last point.
    return df[~too_close | is_endpoint].sort_index()
我有一个带有 GPS 坐标的 pandas 数据框
import pandas as pd
# Sample GPS traces: 16 fixes for user 'A' followed by 12 fixes for user 'B'.
d1 = {
    'user': ['A'] * 16 + ['B'] * 12,
    'longitude': [
        -122.419576048851, -122.4196457862854, -122.41975843906403,
        -122.41981744766234, -122.41961896419524, -122.41846561431885,
        -122.41841197013854, -122.41860508918761, -122.41830468177795,
        -122.41655588150023, -122.416330575943, -122.41608381271362,
        -122.41587996482849, -122.41443157196045, -122.41400241851807,
        -122.4145495891571, -122.28513300418852, -122.28403329849243,
        -122.28397965431215, -122.28369534015657, -122.28364706039427,
        -122.28360414505003, -122.28335201740265, -122.28326618671417,
        -122.28309988975525, -122.2829818725586, -122.28216111660002,
        -122.28297650814056,
    ],
    'latitude': [
        37.77727010900716, 37.77759235026598, 37.778147789138536,
        37.778291948163755, 37.77833010785869, 37.77846154665706,
        37.77932225301237, 37.780250787054555, 37.78027198632572,
        37.78056029581, 37.78059421449895, 37.78061965350541,
        37.78064509250312, 37.780848604169755, 37.7822816496242,
        37.784647385762014, 37.81233951943745, 37.812068286068886,
        37.81228018722322, 37.81312354779044, 37.813237972853855,
        37.813365111605194, 37.814017753748836, 37.8141830323372,
        37.814161842795265, 37.81414489115734, 37.814009277913826,
        37.81183095605405,
    ],
}
# One row per GPS fix, columns in the order user / longitude / latitude.
df1 = pd.DataFrame(data=d1)
使用以下 haversine 函数,我能够计算 GPS 轨迹的连续点之间的距离(按用户分组)
# Define haversine function
def haversine(lat1, lon1, lat2, lon2, earth_radius=6371):
    """Return haversine (great-circle) distances in metres as a DataFrame.

    Args:
        lat1, lon1: Latitude/longitude of the first point(s), in degrees.
            Scalars, sequences, NumPy arrays or pandas Series are accepted.
        lat2, lon2: Latitude/longitude of the second point(s), in degrees.
            Must be broadcastable against ``lat1``/``lon1``.
        earth_radius: Sphere radius in kilometres (default 6371).

    Returns:
        A ``pandas.DataFrame`` with a fresh default index holding the
        distances in metres. One-dimensional inputs give a single column
        with one row per coordinate pair; scalar inputs give a 1x1 frame.
        NaN coordinates yield NaN distances.
    """
    # Convert each argument on its own so scalars and arrays can be mixed;
    # stacking them into one list only works for equally shaped inputs.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )
    a = (np.sin((lat2 - lat1) / 2.0) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2)
    # Rounding can push ``a`` marginally outside [0, 1] for (near-)antipodal
    # points, which would make arcsin return NaN. NaN inputs stay NaN.
    a = np.clip(a, 0.0, 1.0)
    km = earth_radius * 2 * np.arcsin(np.sqrt(a))
    m = km * 1000
    # A 0-d result (scalar inputs) cannot be passed to the DataFrame
    # constructor, so promote it to a 1-element array first.
    return pd.DataFrame(np.atleast_1d(m))
# Coordinates of the previous fix of the same user, aligned to df1's own
# index (NaN for each user's first fix). Shifting within the groupby keeps
# every result on its own row even when rows of different users are
# interleaved or not sorted by user; taking the positional `.values` of a
# groupby-apply result only lines up when the frame is already grouped.
prev = df1.groupby('user')[['latitude', 'longitude']].shift()
df1['distance'] = haversine(df1['latitude'],
                            df1['longitude'],
                            prev['latitude'],
                            prev['longitude']).values.ravel()
# A user's first fix has no predecessor; report its distance as 0.
df1['distance'] = df1['distance'].fillna(0)
user longitude latitude distance
0 A -122.419576 37.777270 0.000000
1 A -122.419646 37.777592 36.352012
2 A -122.419758 37.778148 62.550525
3 A -122.419817 37.778292 16.847806
4 A -122.419619 37.778330 17.952766
5 A -122.418466 37.778462 102.412611
6 A -122.418412 37.779322 95.822233
7 A -122.418605 37.780251 104.633961
8 A -122.418305 37.780272 26.506241
9 A -122.416556 37.780560 157.000401
10 A -122.416331 37.780594 20.156826
11 A -122.416084 37.780620 21.870313
12 A -122.415880 37.780645 18.136963
13 A -122.414432 37.780849 129.286601
14 A -122.414002 37.782282 163.749922
15 A -122.414550 37.784647 267.416687
16 B -122.285133 37.812340 0.000000
17 B -122.284033 37.812068 101.203952
18 B -122.283980 37.812280 24.028959
19 B -122.283695 37.813124 97.046376
20 B -122.283647 37.813238 13.411732
21 B -122.283604 37.813365 14.631208
22 B -122.283352 37.814018 75.875008
23 B -122.283266 37.814183 19.864639
24 B -122.283100 37.814162 14.797045
25 B -122.282982 37.814145 10.537113
26 B -122.282161 37.814009 73.658945
27 B -122.282977 37.811831 252.587420
现在我想编写一个函数:如果某个 GPS 点与其前一个点的距离小于 50 米,则删除该点(即两点中的后一个)。无论与前一个保留点之间的距离是多少,该函数都应始终保留轨迹的最后一个点(point/feature);第一个点也应始终保留。
有什么实现方法吗?
可以插入函数的解决方案如下:
您需要为每个用户保留第一个和最后一个点。这可以通过以下方式实现:
g = df.groupby('user')
# Collect the first and the last row of every user; these endpoints are
# re-attached after the distance filter has been applied.
first_rows = g.head(1)
last_rows = g.tail(1)
df2 = pd.concat([first_rows, last_rows])
也就是
user longitude latitude distance
0 A -122.419576 37.777270 0.000000
16 B -122.285133 37.812340 0.000000
15 A -122.414550 37.784647 267.416687
27 B -122.282977 37.811831 252.587420
然后,确定距离的差异,如果距离小于 50,则删除行并与每个组的第一行和最后一行连接,并按索引排序:
# Remove every row whose precomputed distance to its predecessor is below 50.
df = df.drop(df[df.distance < 50].index)
# Re-attach each user's first/last row and restore the original row order.
df_new = pd.concat([df, df2]).sort_index()
# Endpoints that survived the filter now appear twice. De-duplicate by index
# label rather than by row content, so that distinct points which happen to
# share identical values (e.g. a back-and-forth track) are not lost.
df_new = df_new[~df_new.index.duplicated(keep='first')]
给出
user longitude latitude distance
0 A -122.419576 37.777270 0.000000
2 A -122.419758 37.778148 62.550525
5 A -122.418466 37.778462 102.412611
6 A -122.418412 37.779322 95.822233
7 A -122.418605 37.780251 104.633961
9 A -122.416556 37.780560 157.000401
13 A -122.414432 37.780849 129.286601
14 A -122.414002 37.782282 163.749922
15 A -122.414550 37.784647 267.416687
16 B -122.285133 37.812340 0.000000
17 B -122.284033 37.812068 101.203952
19 B -122.283695 37.813124 97.046376
22 B -122.283352 37.814018 75.875008
26 B -122.282161 37.814009 73.658945
27 B -122.282977 37.811831 252.587420
这不是最优雅的函数,但确实可行:
def Drop_values(df, threshold=50):
    """Drop GPS points that lie closer than ``threshold`` to their predecessor.

    A row is removed when its precomputed ``distance`` value is below
    ``threshold``, except that the first and the last row of every ``user``
    group are always kept. Distances are not recomputed after rows are
    removed, so the ``distance`` column of the result still refers to each
    row's original predecessor.

    Args:
        df: DataFrame with at least the columns ``user`` and ``distance``.
            It is not modified.
        threshold: Smallest ``distance`` a non-endpoint row needs in order to
            be kept, in the same unit as the ``distance`` column (default 50).

    Returns:
        A new DataFrame with the kept rows, sorted by index.
    """
    g = df.groupby('user')
    # First/last row of every user, identified by index label.
    is_endpoint = (df.index.isin(g.head(1).index)
                   | df.index.isin(g.tail(1).index))
    # NaN distances compare as False here and are therefore kept.
    too_close = (df['distance'] < threshold).to_numpy()
    # Select with one boolean mask instead of concat + drop_duplicates():
    # content-based de-duplication would also delete distinct rows that
    # merely share identical values, including a user's last point.
    return df[~too_close | is_endpoint].sort_index()