Python pandas/lists 算法性能

Python pandas/lists Algorithm Performance

我编写了一个包含 2 个 pandas 数据框的简短脚本,一个包含城市资产的经纬度,另一个包含城市内便利设施的经纬度。

资产数据框有超过 160,000 行。对于不同的便利设施(全科医生服务、药房、学校等),我的代码最终将遍历一系列 csv 文件,并为每类便利设施的各个位置创建经纬度数据框。

我目前正在遍历每项资产,使用 geopy.distance.distance() 计算该资产到每个设施的距离列表,并取其中的最小值。例如,对于包含 200 家药店位置的 csv 文件,我想找到每项资产到最近药店的距离。

目前我使用了列表推导式(list comprehension,共两次)。这是我的代码:

# Amenity coordinates: only the 'lat' and 'long' columns are read from the CSV.
GP_Lats_lons = pd.read_csv(
    'GPHealthCentreClinic.csv',
    usecols=['lat', 'long'],
)

# Asset coordinates, fetched with `query` over the database connection `conn`.
assets_df = pd.read_sql_query(sql=query, con=conn)

def get_distance(lat, lon):
    """Return the distance in km from (lat, lon) to the closest amenity.

    Every row of the module-level ``GP_Lats_lons`` frame is measured against
    the given point with ``distance.distance`` (presumably ``geopy.distance``;
    the import is not shown here) and the smallest value is returned.

    Raises:
        ValueError: if ``GP_Lats_lons`` has no rows (``min`` of an empty list).
    """
    origin = (lat, lon)
    km_to_each = [
        distance.distance(origin, (amenity['lat'], amenity['long'])).km
        for _, amenity in GP_Lats_lons.iterrows()
    ]
    return min(km_to_each)

# One get_distance() call per asset row; the results become a new column.
assets_df['nearest_gp_km'] = [
    get_distance(asset['Latitude'], asset['Longitude'])
    for _, asset in assets_df[['Latitude', 'Longitude']].iterrows()
]

有没有人能从数据结构和算法的角度,就如何优化上述代码提供一些思路?列表推导式是好方法还是坏方法?在 pd.DataFrame.apply() 中使用 lambda 函数会是更好的方法吗?

谢谢

答案较长,先说结论(TL;DR):

  • 使用组合(而不是排列)来生成需要计算距离的位置对
  • 使用经 numba 编译的 haversine 函数代替 geopy.distance
  • 快 10 倍以上

设置

import requests
import pandas as pd
searchendpoint = "https://directory.spineservices.nhs.uk/ORD/2-0-0/organisations"


def _get_json(response):
    """Return the decoded JSON body of *response*, raising on an HTTP error status."""
    # Fail loudly on 4xx/5xx instead of hitting a confusing KeyError further down.
    response.raise_for_status()
    return response.json()


# get all healthcare facilities in Herefordshire (postcode prefixes HR1..HR9)
dfhc = pd.concat(
    [
        pd.json_normalize(
            _get_json(
                requests.get(
                    searchendpoint,
                    params={"PostCode": f"HR{i}", "Status": "Active"},
                    timeout=30,  # never hang forever on an unresponsive server
                )
            )["Organisations"]
        )
        for i in range(1, 10)
    ]
).reset_index(drop=True)

# get geo data for all postcodes associated with healthcare facilities
# API batch restriction of 100 post codes per call
# The unique postcode list is loop-invariant, so it is computed once up front
# instead of once per batch.
unique_postcodes = dfhc.PostCode.unique().tolist()
dfgeo = (
    pd.concat(
        [
            pd.json_normalize(
                _get_json(
                    requests.post(
                        "http://api.postcodes.io/postcodes",
                        json={"postcodes": unique_postcodes[i:i + 100]},
                        timeout=30,
                    )
                ),
                record_path="result",
            )
            for i in range(0, len(unique_postcodes), 100)
        ]
    )
    # strip the "result." prefix that json_normalize puts on nested fields
    .rename(columns=lambda c: c.replace("result.", ""))
    .reset_index(drop=True)
    # coord is a (longitude, latitude) tuple for each row
    .assign(coord=lambda dfa: dfa.longitude.combine(dfa.latitude, lambda x, y: (x, y)))
)
# Rows without a postcode value are set aside (presumably lookups that returned
# no result); only rows with a postcode are kept in dfgeo.
dfgeo_missed = dfgeo.loc[dfgeo.postcode.isna()]
dfgeo = dfgeo.loc[~dfgeo.postcode.isna()]

numba/geopy/haversine 距离

import geopy.distance
from numba import jit
import numpy as np

# a few ways to calculate distance between two (lon,lat) pairs
@jit(nopython=True)
def haversine_jit(x,y):
    """Return the great-circle distance in km between two points (numba-compiled).

    Args:
        x: start point as a ``(longitude, latitude)`` pair in degrees.
        y: end point as a ``(longitude, latitude)`` pair in degrees.

    Returns:
        The haversine distance in kilometres, using an earth radius of 6373 km.

    The ``coord`` tuples built for ``dfgeo`` are ``(longitude, latitude)``, so
    index 0 is the longitude and index 1 the latitude. Reading them the other
    way round swaps the axes and gives wrong distances.
    """
    # approximate radius of earth in km
    R = 6373.0

    s_lng = np.deg2rad(x[0])
    s_lat = np.deg2rad(x[1])
    e_lng = np.deg2rad(y[0])
    e_lat = np.deg2rad(y[1])

    # haversine of the central angle between the two points
    d = np.sin((e_lat - s_lat)/2)**2 + \
        np.cos(s_lat)*np.cos(e_lat) * \
        np.sin((e_lng - s_lng)/2)**2

    return 2 * R * np.arcsin(np.sqrt(d))

def haversine(x,y):
    """Return the great-circle distance in km between two points.

    Args:
        x: start point as a ``(longitude, latitude)`` pair in degrees.
        y: end point as a ``(longitude, latitude)`` pair in degrees.

    Returns:
        The haversine distance in kilometres, using an earth radius of 6373 km.

    The ``coord`` tuples built for ``dfgeo`` are ``(longitude, latitude)``, so
    index 0 is the longitude and index 1 the latitude. Reading them the other
    way round swaps the axes and gives wrong distances.
    """
    # approximate radius of earth in km
    R = 6373.0

    s_lng = np.deg2rad(x[0])
    s_lat = np.deg2rad(x[1])
    e_lng = np.deg2rad(y[0])
    e_lat = np.deg2rad(y[1])

    # haversine of the central angle between the two points
    d = np.sin((e_lat - s_lat)/2)**2 + \
        np.cos(s_lat)*np.cos(e_lat) * \
        np.sin((e_lng - s_lng)/2)**2

    return 2 * R * np.arcsin(np.sqrt(d))


def geopykm(x,y):
    """Return the geopy distance in km between two points.

    Args:
        x: start point as a ``(longitude, latitude)`` pair in degrees.
        y: end point as a ``(longitude, latitude)`` pair in degrees.

    Returns:
        The distance in kilometres as computed by ``geopy.distance.distance``.
    """
    # geopy reads a tuple as (latitude, longitude), while the coord tuples
    # built for dfgeo are (longitude, latitude), so swap before the call.
    return geopy.distance.distance((x[1], x[0]), (y[1], y[0])).km



优化

import itertools

# optimisation - use just combinations not permutations of locations,
# so each unordered pair of postcodes is listed once
dfcombis = (
    pd.DataFrame(itertools.combinations(dfgeo["postcode"].values, 2))
    # geo columns for the first postcode of each pair (column 0)
    .merge(
        dfgeo[["postcode", "coord", "longitude", "latitude"]],
        left_on=0,
        right_on="postcode",
    )
    # geo columns for the second postcode (column 1); the clashing column
    # names pick up the default _x / _y suffixes here
    .merge(
        dfgeo[["postcode", "coord", "longitude", "latitude"]],
        left_on=1,
        right_on="postcode",
    )
    # the raw pair columns are no longer needed
    .drop(columns=[0, 1])
)

def testit(df, calc=geopykm, col="km"):
    """Return a copy of *df* with a distance column added.

    Args:
        df: frame holding ``coord_x`` and ``coord_y`` columns.
        calc: callable applied element-wise to each (coord_x, coord_y) pair.
        col: name of the column that receives the results.
    """
    distances = df["coord_x"].combine(df["coord_y"], calc)
    return df.assign(**{col: distances})

# IPython %timeit magics: time the pairwise distance calculation with each
# implementation in turn (geopykm is the default, then the plain haversine,
# then the numba-jitted haversine).
%timeit dfx = testit(dfcombis)
%timeit dfx = testit(dfcombis, calc=haversine)
%timeit dfx = testit(dfcombis, calc=haversine_jit)


# Build dfx once more outside %timeit, using the jitted haversine, so the
# result is available as a regular variable.
dfx = testit(dfcombis, calc=haversine_jit, col="km")

计时

1.77 s ± 63.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
280 ms ± 16.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
125 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

用法

# a to b is the same as b to a, so the reversed pairs are concatenated too
# some locations have multiple facilities, so a to a is included at 0 km
dfnb = pd.concat(
    [
        # pairs that are at most 10 km apart
        dfx.loc[dfx["km"] <= 10, ["postcode_x", "postcode_y", "km"]],
        # the same pairs with the two postcode columns swapped
        dfx.loc[dfx["km"] <= 10, ["postcode_y", "postcode_x", "km"]].rename(
            columns={"postcode_x": "postcode_y", "postcode_y": "postcode_x"}
        ),
        # every facility postcode paired with itself
        pd.DataFrame(
            {
                "postcode_x": dfhc.PostCode.unique(),
                "postcode_y": dfhc.PostCode.unique(),
                "km": 0,
            }
        ),
    ]
).reset_index(drop=True)

# finally some analysis, find nearest pharmacies to GP surgery
(
    dfnb
    # left side: facilities whose PrimaryRoleId is RO180 or RO96
    .merge(
        dfhc.loc[
            dfhc.PrimaryRoleId.isin(["RO180", "RO96"]),
            ["Name", "PostCode", "PrimaryRoleDescription"],
        ],
        left_on="postcode_x",
        right_on="PostCode",
    )
    # right side: facilities whose PrimaryRoleId is RO182 or RO177
    .merge(
        dfhc.loc[
            dfhc.PrimaryRoleId.isin(["RO182", "RO177"]),
            ["Name", "PostCode", "PrimaryRoleDescription"],
        ],
        left_on="postcode_y",
        right_on="PostCode",
    )
    # ascending km within each Name_x, so the first entry per group is the closest
    .sort_values(by=["Name_x", "km"])
    .groupby(["Name_x"], as_index=False)
    .first()
)
Name_x postcode_x postcode_y km PostCode_x PrimaryRoleDescription_x Name_y PostCode_y PrimaryRoleDescription_y
0 22A KING STREET HR4 9DA HR4 9AA 0.213861 HR4 9DA PRIMARY CARE TRUST SITE BOOTS UK LIMITED HR4 9AA PHARMACY
1 ALTON STREET SURGERY HR9 5AB HR9 5AB 0 HR9 5AB PRIMARY CARE TRUST SITE ALTON STREET SURGERY HR9 5AB PRESCRIBING COST CENTRE
2 AUBREY STREET HR4 0BU HR4 9AA 0.148447 HR4 0BU PRIMARY CARE TRUST SITE BOOTS UK LIMITED HR4 9AA PHARMACY
3 AYLESTONE HILL SURGERY HR1 1HR HR4 9AA 1.46984 HR1 1HR BRANCH SURGERY BOOTS UK LIMITED HR4 9AA PHARMACY
4 BARRS COURT SCHOOL HR1 1EQ HR4 9AA 1.27244 HR1 1EQ PRIMARY CARE TRUST SITE BOOTS UK LIMITED HR4 9AA PHARMACY
5 BELMONT ABBEY HR2 9RP HR2 9RP 0 HR2 9RP PRIMARY CARE TRUST SITE CYPS - LINDEN CENTRE HR2 9RP PRESCRIBING COST CENTRE
6 BELMONT HEALTH CENTRE HR2 7XT HR2 7XT 0 HR2 7XT PRIMARY CARE TRUST SITE BELMONT MEDICAL CENTRE HR2 7XT PRESCRIBING COST CENTRE
7 BLACKMARSTON SCHOOL HR2 7NX HR2 7JE 0.975908 HR2 7NX PRIMARY CARE TRUST SITE ASDA PHARMACY HR2 7JE PHARMACY
8 BOBBLESTOCK SURGERY HR4 9LP HR4 9AA 3.5643 HR4 9LP BRANCH SURGERY BOOTS UK LIMITED HR4 9AA PHARMACY
9 BODENHAM SURGERY HR1 3JU HR6 8LR 9.71357 HR1 3JU PRIMARY CARE TRUST SITE BOOTS UK LIMITED HR6 8LR PHARMACY
10 DENTAL ACCESS CENTRE/HEALTH PROMOTION HR2 7JE HR2 7JE 0 HR2 7JE PRIMARY CARE TRUST SITE ASDA PHARMACY HR2 7JE PHARMACY
11 ETNAM STREET MENTAL HEALTH RESOURCE CENTRE HR6 8AN HR6 8LR 0.557963 HR6 8AN PRIMARY CARE TRUST SITE BOOTS UK LIMITED HR6 8LR PHARMACY
12 KINGTON COURT HEALTH AND SOCIAL CARE CENTRE HR5 3BX HR5 3BJ 0.649622 HR5 3BX PRIMARY CARE TRUST SITE KINGTON PHARMACY HR5 3BJ PHARMACY
13 KINGTON SURGERY HR5 3EA HR5 3EA 0 HR5 3EA PRIMARY CARE TRUST SITE KINGTON MEDICAL PRACTICE HR5 3EA PRESCRIBING COST CENTRE