我有一个 DataFrame,需要在列之间执行计算。我的函数 do_something 可以向量化吗?
I have a DataFrame and need to perform calculations between columns. Can my function do_something be vectorised?
我有一个 DataFrame 并且需要在列之间执行计算。我的函数 do_something
可以向量化吗?
第['1min', '2min', '5min', '15min', '30min', '1hour', '2hour', '4hour', '1day', '2day', '7day',]
列需要依次与价格和上一列的值进行比较。如果列小于 price
且小于 previous column
,则为 min_sig
分配列的值,并为 min_bar
分配列的名称。如果它 does not match
,如果它是 '1min' 列,则 min_sig
和 min_bar
被分配值 False
,而其他列 interrupt
循环.
我的代码可以达到我想要的效果,函数generate_data()
可以向量化优化吗?
我的代码如下:
import pandas as pd
import numpy as np
def generate_data():
code = ['a', 'b', 'c', 'd']
price = [72, 50.8, 77.8, 54.6]
min1 = [69.78, 49.21, 79.75, 56.21]
min2 = [69.9, 49.22, 79.4, 55.85]
min5 = [73.36, 51.81, 74.78, 52]
min15 = [79.07, 56.25, 67.86, 46.9]
min30 = [77.1, 54.86, 70.38, 48.91]
hour1 = [75.12, 53.49, 72.84, 51.29]
hour2 = [74.1, 52.75, 73.51, 51.79]
hour4 = [72.18, 51.69, 77.83, 55.96]
day1 = [78.13, 56.76, 73.47, 52.37]
day2 = [80.42, 58.72, 71.88, 51.78]
day7 = [110.79, 84.6, 83.73, 65.48]
dict1 = {'code': code, 'price': price, '1min': min1, '2min': min2, '5min': min5, '15min': min15, '30min': min30,
'1hour': hour1, '2hour': hour2, '4hour': hour4, '1day': day1, '2day': day2, '7day': day7, }
df = pd.DataFrame(dict1)
df['min_bar'] = np.NAN
df['min_sig'] = np.NAN
col = ['code', 'price', 'min_bar', 'min_sig', '1min', '2min', '5min', '15min', '30min', '1hour', '2hour', '4hour',
'1day', '2day', '7day', ]
df = df[col]
return df
def do_something(a):
list1 = ['1min', '2min', '5min', '15min', '30min', '1hour', '2hour', '4hour',
'1day', '2day', '7day', ]
for i in range(len(list1)):
bar = list1[i]
if i == 0:
if a['price'] >= a[bar]:
a['min_sig'] = a[bar]
a['min_bar'] = bar
else:
a['min_sig'] = False
a['min_bar'] = False
break
else:
if a['min_sig'] >= a[bar]:
a['min_sig'] = a[bar]
a['min_bar'] = bar
else:
break
return a
def main():
df = generate_data()
print('Dataframe before running generate_data():')
print(df)
df = df.apply(do_something, axis=1)
print('The result after running is the result I want:')
print(df)
if __name__ == '__main__':
main()
Dataframe before running generate_data():
code price min_bar min_sig 1min ... 2hour 4hour 1day 2day 7day
0 a 72.0 NaN NaN 69.78 ... 74.10 72.18 78.13 80.42 110.79
1 b 50.8 NaN NaN 49.21 ... 52.75 51.69 56.76 58.72 84.60
2 c 77.8 NaN NaN 79.75 ... 73.51 77.83 73.47 71.88 83.73
3 d 54.6 NaN NaN 56.21 ... 51.79 55.96 52.37 51.78 65.48
[4 rows x 15 columns]
The result after running is the result I want:
code price min_bar min_sig 1min ... 2hour 4hour 1day 2day 7day
0 a 72.0 1min 69.78 69.78 ... 74.10 72.18 78.13 80.42 110.79
1 b 50.8 1min 49.21 49.21 ... 52.75 51.69 56.76 58.72 84.60
2 c 77.8 False False 79.75 ... 73.51 77.83 73.47 71.88 83.73
3 d 54.6 False False 56.21 ... 51.79 55.96 52.37 51.78 65.48
[4 rows x 15 columns]
%timeit df.apply(do_something,axis=1)
4.88 ms ± 50.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
IIUC,如果第一个值不大于价格,您似乎想用 False 屏蔽 idxmin
和 min
。
您可以使用 numpy 一次获得两个操作:
m = np.argmin(df[list1].values, axis=1)
(pd.DataFrame({'min_bar': np.take(list1, m),
'min_sig': np.take(df[list1].values, m)})
.mask(df['price'].lt(df[list1[0]]), False)
)
(然后加入或分配给原来的df)
输出:
min_bar min_sig
0 1min 69.78
1 1min 69.78
2 False False
3 False False
使用pandas
不过这需要搜索两次最小值
m = df['price'].lt(df[list1[0]])
df['min_bar'] = df[list1].idxmin(axis=1).mask(m, False)
df['min_sig'] = df[list1].min(axis=1).mask(m, False)
我有一个 DataFrame 并且需要在列之间执行计算。我的函数 do_something
可以向量化吗?
第['1min', '2min', '5min', '15min', '30min', '1hour', '2hour', '4hour', '1day', '2day', '7day',]
列需要依次与价格和上一列的值进行比较。如果列小于 price
且小于 previous column
,则为 min_sig
分配列的值,并为 min_bar
分配列的名称。如果它 does not match
,如果它是 '1min' 列,则 min_sig
和 min_bar
被分配值 False
,而其他列 interrupt
循环.
我的代码可以达到我想要的效果,函数generate_data()
可以向量化优化吗?
我的代码如下:
import pandas as pd
import numpy as np
def generate_data():
code = ['a', 'b', 'c', 'd']
price = [72, 50.8, 77.8, 54.6]
min1 = [69.78, 49.21, 79.75, 56.21]
min2 = [69.9, 49.22, 79.4, 55.85]
min5 = [73.36, 51.81, 74.78, 52]
min15 = [79.07, 56.25, 67.86, 46.9]
min30 = [77.1, 54.86, 70.38, 48.91]
hour1 = [75.12, 53.49, 72.84, 51.29]
hour2 = [74.1, 52.75, 73.51, 51.79]
hour4 = [72.18, 51.69, 77.83, 55.96]
day1 = [78.13, 56.76, 73.47, 52.37]
day2 = [80.42, 58.72, 71.88, 51.78]
day7 = [110.79, 84.6, 83.73, 65.48]
dict1 = {'code': code, 'price': price, '1min': min1, '2min': min2, '5min': min5, '15min': min15, '30min': min30,
'1hour': hour1, '2hour': hour2, '4hour': hour4, '1day': day1, '2day': day2, '7day': day7, }
df = pd.DataFrame(dict1)
df['min_bar'] = np.NAN
df['min_sig'] = np.NAN
col = ['code', 'price', 'min_bar', 'min_sig', '1min', '2min', '5min', '15min', '30min', '1hour', '2hour', '4hour',
'1day', '2day', '7day', ]
df = df[col]
return df
def do_something(a):
list1 = ['1min', '2min', '5min', '15min', '30min', '1hour', '2hour', '4hour',
'1day', '2day', '7day', ]
for i in range(len(list1)):
bar = list1[i]
if i == 0:
if a['price'] >= a[bar]:
a['min_sig'] = a[bar]
a['min_bar'] = bar
else:
a['min_sig'] = False
a['min_bar'] = False
break
else:
if a['min_sig'] >= a[bar]:
a['min_sig'] = a[bar]
a['min_bar'] = bar
else:
break
return a
def main():
df = generate_data()
print('Dataframe before running generate_data():')
print(df)
df = df.apply(do_something, axis=1)
print('The result after running is the result I want:')
print(df)
if __name__ == '__main__':
main()
Dataframe before running generate_data():
code price min_bar min_sig 1min ... 2hour 4hour 1day 2day 7day
0 a 72.0 NaN NaN 69.78 ... 74.10 72.18 78.13 80.42 110.79
1 b 50.8 NaN NaN 49.21 ... 52.75 51.69 56.76 58.72 84.60
2 c 77.8 NaN NaN 79.75 ... 73.51 77.83 73.47 71.88 83.73
3 d 54.6 NaN NaN 56.21 ... 51.79 55.96 52.37 51.78 65.48
[4 rows x 15 columns]
The result after running is the result I want:
code price min_bar min_sig 1min ... 2hour 4hour 1day 2day 7day
0 a 72.0 1min 69.78 69.78 ... 74.10 72.18 78.13 80.42 110.79
1 b 50.8 1min 49.21 49.21 ... 52.75 51.69 56.76 58.72 84.60
2 c 77.8 False False 79.75 ... 73.51 77.83 73.47 71.88 83.73
3 d 54.6 False False 56.21 ... 51.79 55.96 52.37 51.78 65.48
[4 rows x 15 columns]
%timeit df.apply(do_something,axis=1)
4.88 ms ± 50.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
IIUC,如果第一个值不大于价格,您似乎想用 False 屏蔽 idxmin
和 min
。
您可以使用 numpy 一次获得两个操作:
m = np.argmin(df[list1].values, axis=1)
(pd.DataFrame({'min_bar': np.take(list1, m),
'min_sig': np.take(df[list1].values, m)})
.mask(df['price'].lt(df[list1[0]]), False)
)
(然后加入或分配给原来的df)
输出:
min_bar min_sig
0 1min 69.78
1 1min 69.78
2 False False
3 False False
使用pandas
不过这需要搜索两次最小值
m = df['price'].lt(df[list1[0]])
df['min_bar'] = df[list1].idxmin(axis=1).mask(m, False)
df['min_sig'] = df[list1].min(axis=1).mask(m, False)