Pandas groupby apply is very slow: looping over every group > applying a function > adding the results as a new column
I have stock data in which I am trying to find out whether the price has decreased for 5 consecutive days. Sample data:
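(Something along these lines; the values below are made up purely for illustration, and the real frame has one row per Symbol per trading day with at least a Symbol and a close column.)
import pandas as pd

df = pd.DataFrame({
    "Symbol": ["ABC"] * 6 + ["XYZ"] * 6,
    "date":   list(pd.date_range("2020-11-01", periods=6)) * 2,
    "close":  [110.0, 108.5, 107.2, 105.9, 104.0, 104.5,
               55.0, 56.1, 54.3, 53.2, 52.8, 52.0],
})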
I group by Symbol:
df_s = df.groupby(['Symbol'])
The function I want to apply to each group is:
def strictly_decreasing(L):
    # returns True if L (a list) is strictly decreasing, e.g. [7, 5, 3, 2] -> True
    return all(x > y for x, y in zip(L, L[1:]))

def strick_dec(group):
    dec = []
    for i in range(0, len(group) - 4):
        chec = group["close"][i:i+5]
        dec.append(strictly_decreasing(list(chec)))
    dummy = [False, False, False, False]  # because we skipped the last 4 values
    final = dec + dummy  # can also be used as dummy + dec, up to you
    group['Strictly_decreasing'] = final  # adding the results as a new column
    return group
df_new = df_s.apply(strick_dec)
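For reference, the helper behaves as described in its comment:
strictly_decreasing([7, 5, 3, 2])   # True
strictly_decreasing([7, 5, 5, 2])   # False, equal neighbours are not strictly decreasing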
How can we speed this process up? It takes far too much time.
It may never be fast, and pandas methods will only speed it up a little, but the nested loop is not really needed. Try something like this:
import datetime
import pandas
import random
import itertools

# Create some test data
now = datetime.datetime.now()
df = pandas.DataFrame(
    itertools.chain.from_iterable(
        [
            [
                {
                    "symbol": "".join(symbol),
                    "date": now + pandas.Timedelta(-i, unit="D"),
                    "close": random.randint(10, 100) + random.random(),
                    "volume": random.randint(20000, 1000000),
                }
                for i in range(60)
            ]
            for symbol in itertools.combinations("ABCDEFGHIJKLM", 4)
        ]
    )
)
def check_decreasing(group: pandas.DataFrame, column: str = "close") -> pandas.DataFrame:
    # Add shifted columns holding the previous values of close
    for i in range(4, 0, -1):
        group[f"{column}_minus_{i}"] = group[f"{column}"].shift(i)
    # Use Series.is_monotonic_decreasing to check whether the values are decreasing
    window_columns = [f"{column}_minus_{i}" for i in range(4, 0, -1)] + [column]
    group["is_monotonic_decreasing"] = group[window_columns].apply(
        lambda row: row.is_monotonic_decreasing, axis=1
    )
    # Remove the shifted columns (no longer needed)
    group = group.drop(columns=[f"{column}_minus_{i}" for i in range(4, 0, -1)])
    # Return the group
    return group
# Fix some rows for testing (random data will not always produce a run); this creates an artificial monotonic decrease in the first 10 rows
for i in range(10):
    df.at[i, "close"] = 100 - i * 5
# Apply the function
df = df.groupby("symbol").apply(check_decreasing, column="close")
Output:
symbol date close volume \
0 ABCD 2020-11-30 09:00:16.102408 100.000000 631890
1 ABCD 2020-11-29 09:00:16.102408 95.000000 717153
2 ABCD 2020-11-28 09:00:16.102408 90.000000 248423
3 ABCD 2020-11-27 09:00:16.102408 85.000000 987648
4 ABCD 2020-11-26 09:00:16.102408 80.000000 613279
... ... ... ... ...
42895 JKLM 2020-10-06 09:00:16.102408 31.103065 740687
42896 JKLM 2020-10-05 09:00:16.102408 75.330438 794853
42897 JKLM 2020-10-04 09:00:16.102408 47.115309 279714
42898 JKLM 2020-10-03 09:00:16.102408 15.527207 972621
42899 JKLM 2020-10-02 09:00:16.102408 60.094327 765083
is_monotonic_decreasing
0 False
1 False
2 False
3 False
4 True
... ...
42895 False
42896 False
42897 False
42898 False
42899 False
[42900 rows x 5 columns]
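If the row-wise apply inside check_decreasing is still too slow, the same kind of flag can be computed without any apply at all by comparing group-wise shifted values directly. This is only a sketch under the same assumptions as the test data above (a symbol column and a close column); the helper name strictly_decreasing_flag is made up, and unlike Series.is_monotonic_decreasing it requires strict decreases, matching the question's strictly_decreasing:
import pandas as pd

def strictly_decreasing_flag(df: pd.DataFrame, column: str = "close", window: int = 5) -> pd.Series:
    # True where this row and the previous (window - 1) rows of the same symbol are strictly decreasing
    grouped = df.groupby("symbol")[column]
    shifted = [grouped.shift(i) for i in range(window)]  # shift(0) is the row itself
    flag = pd.Series(True, index=df.index)
    for newer, older in zip(shifted, shifted[1:]):
        # comparisons against NaN are False, so the first rows of each group come out False automatically
        flag &= newer < older
    return flag

# df["is_monotonic_decreasing"] = strictly_decreasing_flag(df, column="close")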