如何自动从数据框列进行自然对数计算?

How can I make my natural logarithm calculation from dataframe columns automatic?

我必须创建新列来计算数据集中其他列的自然对数。列(特征)太多,我想让这个过程自动化,但我试过的 for 循环没有用。这是我称为 'features' 的列的列表:

# Columns whose natural logarithm is to be computed.
# Entries that are commented out are currently excluded from the list.
features = [
    "price_seat",
    "days_length_of_stay",
    "days_to_departure",
    "distance",
    "unit_cost_brute",
    "unit_cost_clip",
    "unit_cost_mean",
    "unit_cost",
    "org_country_gdp_per_capita",
    "dst_country_gdp_per_capita",
    "competing_airline",
    # "yield",
    "price_seat_cluster",
    "yield_cluster",
    "low_cost",
    # "PAX",
    # "REVENUE",
    "LOCAL_PAX",
    "BEHIND_PAX",
    "BEYOND_PAX",
    "BRIDGE_PAX",
    "LOCAL_REVENUE",
    "BEHIND_REVENUE",
    "BEYOND_REVENUE",
    "BRIDGE_REVENUE",
    "REVENUE_WITH_TAXES",
    "LOCAL_REVENUE_WITH_TAXES",
    "BRIDGE_REVENUE_WITH_TAXES",
    "BEHIND_REVENUE_WITH_TAXES",
    "BEYOND_REVENUE_WITH_TAXES",
    "PERIOD",
    "n_flights_month",
    "avg_flights_month",
    "flights_month",
    # "pax_flight",
    "revenue_flight",
    # "revenue_pax",
    "WTI",
    "Brent",
    "Jet_fuel",
    "OilPrice_USD_bbl",
    "FuelPrice_USD_USgal",
    "Density",
    "Cf_USD_kg",
    "d_fr24",
    "distance_fr",
]

这是我使用过的代码,它有效:

 # Hand-written version: one withColumn call per log-transformed column.
 df = (
     df9
     .withColumn('ln_price_seat', F.log('price_seat'))
     .withColumn('ln_days_length_of_stay', F.log('days_length_of_stay'))
     .withColumn('ln_days_to_departure', F.log('days_to_departure'))
     .withColumn('ln_distance', F.log('distance'))
     .withColumn('ln_unit_cost_brute', F.log('unit_cost_brute'))
     .withColumn('ln_unit_cost_clip', F.log('unit_cost_clip'))
     .withColumn('ln_unit_cost_mean', F.log('unit_cost_mean'))
 )

但对于这么多特征来说,这种写法太过“手动”了,而且我将来可能会更改这些特征,所以我需要一种能自动处理的方法。最重要的是,我的数据框非常大,大约 50M 或更多。在此之前,我写出了下面这个函数:

def get_log_features(self, df):
    """Outer-join natural-log versions of the feature columns onto ``df``.

    Selects the feature columns together with the key columns
    (``org_airport``, ``dst_airport``, ``d_year``, ``d_month``), adds one
    ``ln_<name>`` column per feature, drops the raw feature columns and
    outer-joins the result back to ``df`` on the key columns.

    Args:
        df: DataFrame that contains every feature column and the four key
            columns.

    Returns:
        ``df`` outer-joined with the ``ln_<name>`` columns.
    """
    features = [
        'price_seat',
        'days_length_of_stay',
        'days_to_departure',
        'distance',
        'unit_cost_brute',
        'unit_cost_clip',
        'unit_cost_mean',
        'unit_cost',
        'org_country_gdp_per_capita',
        'dst_country_gdp_per_capita',
        'competing_airline',
        'price_seat_cluster',
        'yield_cluster',
        'low_cost',
        'LOCAL_PAX',
        'BEHIND_PAX',
        'BEYOND_PAX',
        'BRIDGE_PAX',
        'LOCAL_REVENUE',
        'BEHIND_REVENUE',
        'BEYOND_REVENUE',
        'BRIDGE_REVENUE',
        'REVENUE_WITH_TAXES',
        'LOCAL_REVENUE_WITH_TAXES',
        'BRIDGE_REVENUE_WITH_TAXES',
        'BEHIND_REVENUE_WITH_TAXES',
        'BEYOND_REVENUE_WITH_TAXES',
        'PERIOD',
        'n_flights_month',
        'avg_flights_month',
        'flights_month',
        'revenue_flight',
        'WTI',
        'Brent',
        'Jet_fuel',
        'OilPrice_USD_bbl',
        'FuelPrice_USD_USgal',
        'Density',
        'Cf_USD_kg',
        'd_fr24',
        'distance_fr',
    ]
    # Columns used both to carry row identity through the projection and as
    # the join condition below.
    key_cols = ['org_airport', 'dst_airport', 'd_year', 'd_month']

    features_for_log = features
    df_log = df.select(*features_for_log, *key_cols)

    for new_col in features_for_log:
        df_log = df_log.withColumn('ln_' + new_col, F.log(F.col(new_col)))

    # Keep only the key columns and the new ln_* columns.
    df_log = df_log.drop(*features_for_log)

    # NOTE(review): if the four key columns do not uniquely identify a row,
    # this join matches every row with every other row sharing the same key
    # and multiplies the row count -- confirm key uniqueness with the caller.
    df = df.join(df_log, key_cols, how='outer')

    # The joined frame was previously built and then discarded; return it so
    # the caller actually receives the log columns.
    return df

但是调用这个函数需要几个小时,计算成本太高。因此我考虑直接在原始数据框上追加(append)特征列表中各列的自然对数列,这样开销可能更小。

你有什么建议吗?

最简单、最快的方法就是您已经描述的方法:把对数列直接添加到数据框中:

# Keep every existing column and append one aliased log column per feature,
# all in a single select.
cols = [F.col(name) for name in df.columns]
ln_cols = [F.log(name).alias(f"ln_{name}") for name in features_for_log]
df = df.select(*cols, *ln_cols)

我这样做了并且成功了:

from pyspark.sql.types import *

from pyspark.sql import functions as F

class GetLogFeatures:
    """Append natural-log versions of feature columns to a DataFrame."""

    # Columns that are log-transformed when the caller passes no explicit list.
    DEFAULT_FEATURES = (
        'price_seat',
        'days_length_of_stay',
        'days_to_departure',
        'distance',
        'unit_cost_brute',
        'unit_cost_clip',
        'unit_cost_mean',
        'unit_cost',
        'org_country_gdp_per_capita',
        'dst_country_gdp_per_capita',
        'competing_airline',
        'price_seat_cluster',
        'yield_cluster',
        'low_cost',
        'LOCAL_PAX',
        'BEHIND_PAX',
        'BEYOND_PAX',
        'BRIDGE_PAX',
        'LOCAL_REVENUE',
        'BEHIND_REVENUE',
        'BEYOND_REVENUE',
        'BRIDGE_REVENUE',
        'REVENUE_WITH_TAXES',
        'LOCAL_REVENUE_WITH_TAXES',
        'BRIDGE_REVENUE_WITH_TAXES',
        'BEHIND_REVENUE_WITH_TAXES',
        'BEYOND_REVENUE_WITH_TAXES',
        'PERIOD',
        'n_flights_month',
        'avg_flights_month',
        'flights_month',
        'revenue_flight',
        'WTI',
        'Brent',
        'Jet_fuel',
        'OilPrice_USD_bbl',
        'FuelPrice_USD_USgal',
        'Density',
        'Cf_USD_kg',
        'd_fr24',
        'distance_fr',
    )

    def __init__(self, spark):
        """Keep a reference to ``spark`` on the instance.

        Args:
            spark: Stored as ``self._spark_``; presumably a SparkSession
                (TODO confirm). It is not read by ``get_log_features``.
        """
        self._spark_ = spark

    def get_log_features(self, df, features=None):
        """Return ``df`` with an ``ln_<name>`` column added per feature.

        Args:
            df: DataFrame containing every column named in ``features``.
            features: Column names to log-transform. A single string is
                treated as one column name. Defaults to
                ``DEFAULT_FEATURES`` when omitted or ``None``.

        Returns:
            The DataFrame with one ``ln_<name>`` column per feature, each
            holding ``F.log`` of the corresponding source column. With an
            empty ``features`` the input is returned unchanged.
        """
        if features is None:
            features = self.DEFAULT_FEATURES
        elif isinstance(features, str):
            # A bare string would otherwise be iterated character by character.
            features = [features]

        # NOTE(review): the PySpark docs warn that many chained withColumn
        # calls can build a large query plan; a single select() is the
        # documented alternative if this becomes slow. withColumn is kept
        # here because it replaces an existing column of the same name.
        for new_col in features:
            df = df.withColumn('ln_' + new_col, F.log(F.col(new_col)))

        return df