对 pandas DataFrame 中的列向量化分段(阶梯)函数
Vectorize step-wise function for column in pandas dataframe
我有一个稍微复杂的函数,它按照预定义的分段(阶梯)逻辑为给定数据分配质量级别;该逻辑既取决于固定边界,也取决于基于真实值的相对边界。下面的函数 'get_quality()' 对每一行执行此操作,但通过 pandas 的 DataFrame.apply 调用时,在大型数据集上非常慢,所以我想把这个计算向量化。显然,对于内部的 if 逻辑,我可以使用类似 df.groupby(pd.cut(df.ground_truth, [-np.inf, 10.0, 20.0, 50.0, np.inf]))
的写法,然后在每个组内(基于该组的边界)再做类似的子分组;但最后一个 bisect 的边界取决于每一行给定的 real/ground_truth 值,这种情况我该如何处理?
使用 df['quality'] = np.vectorize(get_quality)(df['measured'], df['ground_truth'])
已经快很多了,但是是否有真正的向量化方法来计算相同的 'quality' 列?
import pandas as pd
import numpy as np
from bisect import bisect
quality_levels = ['WayTooLow', 'TooLow', 'OK', 'TooHigh', 'WayTooHigh']
# Note: to make the vertical borders always lead towards the 'better' score
# we use a small epsilon around them
eps = 0.000001

# Fixed bands as (inclusive upper bound of real_value, inner half-width,
# outer half-width). Real values above the last bound use relative widths.
_FIXED_BANDS = (
    (10.0, 2.0, 4.0),
    (20.0, 6.0, 14.0),
    (50.0, 20.0, 45.0),
)


def _band_half_widths(real_value):
    """Return the (inner, outer) tolerance half-widths for *real_value*."""
    for upper_bound, inner, outer in _FIXED_BANDS:
        if real_value <= upper_bound:
            return inner, outer
    # Above the last fixed band the tolerances scale with the real value.
    return 0.25 * real_value, 0.5 * real_value


def get_quality(measured_value, real_value):
    """Classify a measurement against its ground truth.

    The difference ``measured_value - real_value`` is located among four
    borders ``[-outer, -inner, +inner, +outer]`` (each widened outwards by
    ``eps``), and the resulting position selects one of ``quality_levels``.

    Args:
        measured_value: The measured number.
        real_value: The ground-truth number; it also selects which set of
            borders applies (fixed up to 50.0, relative above that).

    Returns:
        One of the strings in ``quality_levels``.
    """
    diff = measured_value - real_value
    inner, outer = _band_half_widths(real_value)
    # Widening each border by eps makes a difference that lies exactly on a
    # nominal border fall into the better (closer to 'OK') level.
    borders = [-outer - eps, -inner - eps, inner + eps, outer + eps]
    return quality_levels[bisect(borders, diff)]
# Build a frame of N random integer pairs in [0, 100) and classify each row
# with the row-wise get_quality(). The data is unseeded, so the printed
# counts differ from run to run.
N = 100000
df = pd.DataFrame({'ground_truth': np.random.randint(0, 100, N),
                   'measured': np.random.randint(0, 100, N)})
df['quality'] = df.apply(
    lambda row: get_quality(row['measured'], row['ground_truth']), axis=1)
print(df.head())
# The column created above is 'quality'; there is no 'quality2' column.
print(df['quality'].value_counts())
# ground_truth measured quality
#0 51 1 WayTooLow
#1 7 25 WayTooHigh
#2 38 95 WayTooHigh
#3 76 32 WayTooLow
#4 0 18 WayTooHigh
#OK 30035
#WayTooHigh 24257
#WayTooLow 18998
#TooLow 14593
#TooHigh 12117
这可以通过 np.select 实现。
import numpy as np
quality_levels = ['WayTooLow', 'TooLow', 'OK', 'TooHigh', 'WayTooHigh']


def get_quality_vectorized(df):
    """Add a 'quality' column to *df* using whole-column operations.

    Reads the 'ground_truth' and 'measured' columns, picks the inner/outer
    tolerance for every row from its ground truth, and maps the difference
    ``measured - ground_truth`` to one of ``quality_levels``. Uses the
    module-level ``eps`` to widen each border.

    Args:
        df: DataFrame with 'ground_truth' and 'measured' columns.

    Returns:
        The same DataFrame object, modified in place with a 'quality' column.
    """
    # Prepare the first 4 conditions, to match the 4 sets of boundaries.
    # np.select takes the first matching condition, so the trailing True is
    # the catch-all for ground truths above 50.
    gt = df['ground_truth']
    conds = [gt <= 10, gt <= 20, gt <= 50, True]
    lo = np.select(conds, [2, 6, 20, 0.25 * gt])
    hi = np.select(conds, [4, 14, 45, 0.5 * gt])

    # Prepare the inner conditions, to match the quality levels. The first
    # four levels get an explicit condition; the last level is the default.
    diff = df['measured'] - df['ground_truth']
    level_conds = [
        diff < -hi - eps,
        diff < -lo - eps,
        diff < lo + eps,
        diff < hi + eps,
    ]
    # A string default is required: np.select's implicit default is the
    # integer 0, which cannot be combined with string choices on NumPy 2.x.
    df['quality'] = np.select(level_conds, quality_levels[:-1],
                              default=quality_levels[-1])
    return df
我有一个稍微复杂的函数,它按照预定义的分段(阶梯)逻辑为给定数据分配质量级别;该逻辑既取决于固定边界,也取决于基于真实值的相对边界。下面的函数 'get_quality()' 对每一行执行此操作,但通过 pandas 的 DataFrame.apply 调用时,在大型数据集上非常慢,所以我想把这个计算向量化。显然,对于内部的 if 逻辑,我可以使用类似 df.groupby(pd.cut(df.ground_truth, [-np.inf, 10.0, 20.0, 50.0, np.inf]))
的写法,然后在每个组内(基于该组的边界)再做类似的子分组;但最后一个 bisect 的边界取决于每一行给定的 real/ground_truth 值,这种情况我该如何处理?
使用 df['quality'] = np.vectorize(get_quality)(df['measured'], df['ground_truth'])
已经快很多了,但是是否有真正的向量化方法来计算相同的 'quality' 列?
import pandas as pd
import numpy as np
from bisect import bisect
quality_levels = ['WayTooLow', 'TooLow', 'OK', 'TooHigh', 'WayTooHigh']
# Note: to make the vertical borders always lead towards the 'better' score
# we use a small epsilon around them
eps = 0.000001

# Fixed bands as (inclusive upper bound of real_value, inner half-width,
# outer half-width). Real values above the last bound use relative widths.
_FIXED_BANDS = (
    (10.0, 2.0, 4.0),
    (20.0, 6.0, 14.0),
    (50.0, 20.0, 45.0),
)


def _band_half_widths(real_value):
    """Return the (inner, outer) tolerance half-widths for *real_value*."""
    for upper_bound, inner, outer in _FIXED_BANDS:
        if real_value <= upper_bound:
            return inner, outer
    # Above the last fixed band the tolerances scale with the real value.
    return 0.25 * real_value, 0.5 * real_value


def get_quality(measured_value, real_value):
    """Classify a measurement against its ground truth.

    The difference ``measured_value - real_value`` is located among four
    borders ``[-outer, -inner, +inner, +outer]`` (each widened outwards by
    ``eps``), and the resulting position selects one of ``quality_levels``.

    Args:
        measured_value: The measured number.
        real_value: The ground-truth number; it also selects which set of
            borders applies (fixed up to 50.0, relative above that).

    Returns:
        One of the strings in ``quality_levels``.
    """
    diff = measured_value - real_value
    inner, outer = _band_half_widths(real_value)
    # Widening each border by eps makes a difference that lies exactly on a
    # nominal border fall into the better (closer to 'OK') level.
    borders = [-outer - eps, -inner - eps, inner + eps, outer + eps]
    return quality_levels[bisect(borders, diff)]
# Build a frame of N random integer pairs in [0, 100) and classify each row
# with the row-wise get_quality(). The data is unseeded, so the printed
# counts differ from run to run.
N = 100000
df = pd.DataFrame({'ground_truth': np.random.randint(0, 100, N),
                   'measured': np.random.randint(0, 100, N)})
df['quality'] = df.apply(
    lambda row: get_quality(row['measured'], row['ground_truth']), axis=1)
print(df.head())
# The column created above is 'quality'; there is no 'quality2' column.
print(df['quality'].value_counts())
# ground_truth measured quality
#0 51 1 WayTooLow
#1 7 25 WayTooHigh
#2 38 95 WayTooHigh
#3 76 32 WayTooLow
#4 0 18 WayTooHigh
#OK 30035
#WayTooHigh 24257
#WayTooLow 18998
#TooLow 14593
#TooHigh 12117
这可以通过 np.select 实现。
import numpy as np
quality_levels = ['WayTooLow', 'TooLow', 'OK', 'TooHigh', 'WayTooHigh']


def get_quality_vectorized(df):
    """Add a 'quality' column to *df* using whole-column operations.

    Reads the 'ground_truth' and 'measured' columns, picks the inner/outer
    tolerance for every row from its ground truth, and maps the difference
    ``measured - ground_truth`` to one of ``quality_levels``. Uses the
    module-level ``eps`` to widen each border.

    Args:
        df: DataFrame with 'ground_truth' and 'measured' columns.

    Returns:
        The same DataFrame object, modified in place with a 'quality' column.
    """
    # Prepare the first 4 conditions, to match the 4 sets of boundaries.
    # np.select takes the first matching condition, so the trailing True is
    # the catch-all for ground truths above 50.
    gt = df['ground_truth']
    conds = [gt <= 10, gt <= 20, gt <= 50, True]
    lo = np.select(conds, [2, 6, 20, 0.25 * gt])
    hi = np.select(conds, [4, 14, 45, 0.5 * gt])

    # Prepare the inner conditions, to match the quality levels. The first
    # four levels get an explicit condition; the last level is the default.
    diff = df['measured'] - df['ground_truth']
    level_conds = [
        diff < -hi - eps,
        diff < -lo - eps,
        diff < lo + eps,
        diff < hi + eps,
    ]
    # A string default is required: np.select's implicit default is the
    # integer 0, which cannot be combined with string choices on NumPy 2.x.
    df['quality'] = np.select(level_conds, quality_levels[:-1],
                              default=quality_levels[-1])
    return df