Numpy:模糊 'greater_than' 运算符,处理值列表(请求对现有代码的建议)
Numpy: fuzzy 'greater_than' operator, working on a list of values (requesting advice on existing code)
我实现了一个 numpy 函数:
- 作为输入:
- n(行)x m(列)浮点数数组。
- 一个 `threshold`(浮点数)。
- 对每一行:
  - 如果该行的最大值大于或等于 `threshold`,
  - 并且同一行中该最大值之前没有小于或等于 `-threshold` 的最小值,
  - 则该行被标记为 `True`(大于),
  - 否则该行被标记为 `False`(不大于)。
- 返回:一个 n(行)x 1(列)的布尔数组。
我目前的实现可以正常工作(至少对所提供的示例如此),但我远不是 numpy 专家,想知道是否有更高效的方法来处理这个问题(例如,能否避免各种 `transpose` 和 `tile` 调用?)
我很乐意接受任何能让这个函数更高效和/或更易读的建议。
import numpy as np
import pandas as pd
# Sample input: six rows, each holding three successive variations.
threshold = 0.02  # i.e. 2 %
df = pd.DataFrame(
    dict(
        variation_1=[0.01, 0.02, 0.005, -0.02, -0.01, -0.01],
        variation_2=[-0.01, 0.08, 0.08, 0.01, -0.02, 0.01],
        variation_3=[0.005, -0.03, -0.03, 0.002, 0.025, -0.03],
    )
)
# NumPy view of the frame (rows x columns), as consumed by the function below.
data = df.to_numpy()
检查预期结果:
In [75]: df
Out[75]:
variation_1 variation_2 variation_3 # Expecting
0 0.010 -0.01 0.005 # False (no value larger than threshold)
1 0.020 0.08 -0.030 # True (1st value equal to threshold)
2 0.005 0.08 -0.030 # True (2nd value larger than threshold)
3 -0.020 0.01 0.002 # False (no value larger than threshold)
4 -0.010 -0.02 0.025 # False (2nd value lower than -threshold)
5 -0.010 0.01 -0.030 # False (no value larger than threshold)
当前函数。
def greater_than(data: np.ndarray, threshold: float) -> np.ndarray:
    """Flag rows whose maximum reaches ``threshold`` without an earlier deep low.

    A row is flagged ``True`` when both conditions hold:

    * its maximum is greater than or equal to ``threshold``;
    * no value located at or before the first occurrence of that maximum is
      lower than or equal to ``-threshold``.

    Values located after the maximum are ignored.

    Parameters
    ----------
    data : np.ndarray
        2-D array, one row per series to evaluate.
    threshold : float
        Level the row maximum has to reach; ``-threshold`` is the level the
        preceding values must stay strictly above.

    Returns
    -------
    np.ndarray
        1-D boolean array with one entry per row of ``data``.
    """
    # Column index of the first maximum of each row, kept as an (n, 1) column
    # so it can be used directly as a per-row lookup index.
    first_max = np.argmax(data, axis=1)[:, np.newaxis]
    row_max = np.take_along_axis(data, first_max, axis=1)[:, 0]

    # The running minimum read at the position of the maximum is the minimum
    # of everything up to and including that maximum.  This replaces the
    # tiled / transposed index arrays and the masked array.
    running_min = np.minimum.accumulate(data, axis=1)
    min_upto_max = np.take_along_axis(running_min, first_max, axis=1)[:, 0]

    return (row_max >= threshold) & (min_upto_max > -threshold)
结果。
# Run the function on the sample data; it returns one boolean per input row.
res = greater_than(data, threshold)
In [78]:res
Out[78]: array([False, True, True, False, False, False])
提前感谢您的任何建议!
# Cell-wise flags: which values breach the upper / lower bound.
greater = data >= threshold
lesser = data <= -threshold
# Per row: does any value breach the bound at all?
has_greater = greater.any(axis=1)
has_lesser = lesser.any(axis=1)
# Per row: column of the first breach.  argmax on booleans yields 0 when the
# row has no True at all, which is why the any() flags above are needed.
idx_greater = greater.argmax(axis=1)
idx_lesser = lesser.argmax(axis=1)
# A row qualifies when it has an upper breach and either no lower breach or a
# lower breach that only comes after the first upper breach.
# NOTE: the result name `outptut` (sic) is kept as spelt in the snippet.
outptut = has_greater & ((has_lesser & (idx_lesser > idx_greater)) | ~has_lesser)
这段代码能生成您预期的输出,而且应该非常快。另外,我不完全确定自己理解了您的说明,所以如果它对您的实际数据不起作用,请告诉我。
我实现了一个 numpy 函数:
- 作为输入:
- n(行)x m(列)浮点数数组。
- 一个 `threshold`(浮点数)。
- 对每一行:
  - 如果该行的最大值大于或等于 `threshold`,
  - 并且同一行中该最大值之前没有小于或等于 `-threshold` 的最小值,
  - 则该行被标记为 `True`(大于),
  - 否则该行被标记为 `False`(不大于)。
- 返回:一个 n(行)x 1(列)的布尔数组。
我目前的实现可以正常工作(至少对所提供的示例如此),但我远不是 numpy 专家,想知道是否有更高效的方法来处理这个问题(例如,能否避免各种 `transpose` 和 `tile` 调用?)
我很乐意接受任何能让这个函数更高效和/或更易读的建议。
import numpy as np
import pandas as pd
# Sample input: six rows, each holding three successive variations.
threshold = 0.02  # i.e. 2 %
df = pd.DataFrame(
    dict(
        variation_1=[0.01, 0.02, 0.005, -0.02, -0.01, -0.01],
        variation_2=[-0.01, 0.08, 0.08, 0.01, -0.02, 0.01],
        variation_3=[0.005, -0.03, -0.03, 0.002, 0.025, -0.03],
    )
)
# NumPy view of the frame (rows x columns), as consumed by the function below.
data = df.to_numpy()
检查预期结果:
In [75]: df
Out[75]:
variation_1 variation_2 variation_3 # Expecting
0 0.010 -0.01 0.005 # False (no value larger than threshold)
1 0.020 0.08 -0.030 # True (1st value equal to threshold)
2 0.005 0.08 -0.030 # True (2nd value larger than threshold)
3 -0.020 0.01 0.002 # False (no value larger than threshold)
4 -0.010 -0.02 0.025 # False (2nd value lower than -threshold)
5 -0.010 0.01 -0.030 # False (no value larger than threshold)
当前函数。
def greater_than(data: np.ndarray, threshold: float) -> np.ndarray:
    """Flag rows whose maximum reaches ``threshold`` without an earlier deep low.

    A row is flagged ``True`` when both conditions hold:

    * its maximum is greater than or equal to ``threshold``;
    * no value located at or before the first occurrence of that maximum is
      lower than or equal to ``-threshold``.

    Values located after the maximum are ignored.

    Parameters
    ----------
    data : np.ndarray
        2-D array, one row per series to evaluate.
    threshold : float
        Level the row maximum has to reach; ``-threshold`` is the level the
        preceding values must stay strictly above.

    Returns
    -------
    np.ndarray
        1-D boolean array with one entry per row of ``data``.
    """
    # Column index of the first maximum of each row, kept as an (n, 1) column
    # so it can be used directly as a per-row lookup index.
    first_max = np.argmax(data, axis=1)[:, np.newaxis]
    row_max = np.take_along_axis(data, first_max, axis=1)[:, 0]

    # The running minimum read at the position of the maximum is the minimum
    # of everything up to and including that maximum.  This replaces the
    # tiled / transposed index arrays and the masked array.
    running_min = np.minimum.accumulate(data, axis=1)
    min_upto_max = np.take_along_axis(running_min, first_max, axis=1)[:, 0]

    return (row_max >= threshold) & (min_upto_max > -threshold)
结果。
# Run the function on the sample data; it returns one boolean per input row.
res = greater_than(data, threshold)
In [78]:res
Out[78]: array([False, True, True, False, False, False])
提前感谢您的任何建议!
# Cell-wise flags: which values breach the upper / lower bound.
greater = data >= threshold
lesser = data <= -threshold
# Per row: does any value breach the bound at all?
has_greater = greater.any(axis=1)
has_lesser = lesser.any(axis=1)
# Per row: column of the first breach.  argmax on booleans yields 0 when the
# row has no True at all, which is why the any() flags above are needed.
idx_greater = greater.argmax(axis=1)
idx_lesser = lesser.argmax(axis=1)
# A row qualifies when it has an upper breach and either no lower breach or a
# lower breach that only comes after the first upper breach.
# NOTE: the result name `outptut` (sic) is kept as spelt in the snippet.
outptut = has_greater & ((has_lesser & (idx_lesser > idx_greater)) | ~has_lesser)
这段代码能生成您预期的输出,而且应该非常快。另外,我不完全确定自己理解了您的说明,所以如果它对您的实际数据不起作用,请告诉我。