Python - 提高 value_counts 的速度
Python - Improve Speed on value_counts
我正在处理一些原始数据,并希望统计每条链(由 'chain_id' 列中的唯一标识符 'c' 标识)中某些指标('Stat' 列)的出现次数;结果被保存到一个字典,然后映射到一个新列(下面未显示)。
不过我希望:
- 提高循环速度,我必须在 34k 行上从最初的 ~3 提高到 ~10 it/s。
- 改进 try/except 语句的结构。注意并非每条链的 value_counts() 输出中都会包含例如 'Kick' 或 'Mark' 等指标,因此缺失的指标必须记为 0。
我已经在 SOF 上搜索了其他方法,但现有的答案都不适用。请忽略 for 循环的缩进,编辑器不允许我更正它。
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
# Sample event log. `s` holds one 'Stat' label per row, in row order; the
# entries are laid out here by the chain each run of rows belongs to.
s = [
    # chain 'Hitout' (1 row)
    'Hitout',
    # chain 'RI-1' (4 rows)
    'Kick', 'Disposal', 'Centre Clearance', 'Tackle',
    # chain 'Hitout' (2 rows)
    'Hitout', 'Hitout To Advantage',
    # chain 'RI-7' (20 rows)
    'Free Against', 'Contested Possession', 'Free For', 'Handball',
    'Disposal', 'Effective Disposal', 'Stoppage Clearance',
    'Uncontested Possession', 'Kick', 'Effective Kick', 'Disposal',
    'Effective Disposal', 'Mark', 'Uncontested Possession', 'F 50 Mark',
    'Mark On Lead', 'Kick', 'Disposal', 'Shot At Goal', 'Behind',
    # chain 'CA-27' (9 rows)
    'Kick In', 'One Percenter', 'Kick', 'Effective Kick', 'Disposal',
    'Effective Disposal', 'Rebound 50', 'Spoil', 'One Percenter',
]

# `x` holds the chain identifier of each row, built from the run lengths
# noted above so that it stays aligned with `s`.
x = (
    ['Hitout']
    + ['RI-1'] * 4
    + ['Hitout'] * 2
    + ['RI-7'] * 20
    + ['CA-27'] * 9
)

# One row per event: the chain it belongs to and the stat recorded.
df = pd.DataFrame({'chain_id': x, 'Stat': s})
# Fill the per-chain metric dictionaries (chain_count, ki_count, ...), keyed
# by chain id, with how often each tracked stat occurs in that chain.
#
# Stat label -> dictionary that receives that stat's per-chain count.
stat_targets = {
    'Disposal': chain_count,
    'Kick': ki_count,
    'Handball': hb_count,
    'Mark': m_count,
    'Goal': goal_count,
    'Behind': behind_count,
    'Contested Possession': cp_count,
    'Uncontested Possession': up_count,
    'Tackle': t_count,
}

# Count every (chain_id, Stat) pair in a single pass instead of filtering the
# whole frame and calling value_counts() once per chain.
# NOTE(review): hoisting this out of the loop assumes time(c) does not modify
# df -- confirm against its definition.
pair_counts = df.groupby(['chain_id', 'Stat']).size().to_dict()

for c in tqdm(chains):
    if c == 'Hitout':
        # The 'Hitout' id gets every metric, including chain_time, set to 0.
        for target in stat_targets.values():
            target[c] = 0
        chain_time[c] = 0
        continue

    for stat, target in stat_targets.items():
        # A stat that never occurs in this chain has no entry in the counts,
        # so it falls back to 0 (this replaces the bare try/except blocks,
        # which also hid every unrelated error).
        target[c] = pair_counts.get((c, stat), 0)
    chain_time[c] = time(c)
# Broadcast each per-chain metric back onto the rows of its chain: every
# output column is the row's chain_id looked up in the matching dictionary.
for column_name, per_chain_values in (
    ('chain_length', chain_count),
    ('chain_hb', hb_count),
    ('chain_ki', ki_count),
    ('chain_m', m_count),
    ('chain_goal', goal_count),
    ('chain_behind', behind_count),
    ('chain_cp', cp_count),
    ('chain_up', up_count),
    ('chain_t', t_count),
    ('chain_time', chain_time),
):
    df[column_name] = df['chain_id'].map(per_chain_values)
已编辑:包括一个示例,输出其当前的工作方式如下
创建一个状态列来隔离处置、手球和其他行为
# Tag each row with the output column its Stat contributes to: 'Disposal'
# rows feed 'chain_length', 'Handball' rows feed 'chain_hb', and every other
# stat is marked 'Notimportant'. These labels follow the definitions used by
# the counting loop (chain_length = Disposal count, chain_hb = Handball
# count); they were previously the other way round, so output printed before
# this fix shows the two columns swapped.
df['status'] = np.select(
    [df['Stat'].eq('Disposal'), df['Stat'].eq('Handball')],
    ['chain_length', 'chain_hb'],
    'Notimportant',
)
找出每个行为的发生频率
# Build one 0/1 indicator column per tracked label, aligned row-for-row with
# df, and attach them in place of the helper 'status' column.
#
# The indicators are computed on the whole column at once: one-hot encoding
# is row-wise, so a per-group groupby(...).apply(...) is not needed, and on
# pandas >= 2.0 that apply is expected to prepend the group keys to the
# index, which would stop the result from joining back onto df.
# reindex() keeps exactly the two tracked columns (dropping 'Notimportant')
# and creates an all-zero column if a label never occurs; astype(float)
# gives float 0.0/1.0 values rather than the bool columns newer pandas
# returns from get_dummies.
s = df.drop(columns=['status']).join(
    pd.get_dummies(df['status'])
    .reindex(columns=['chain_hb', 'chain_length'], fill_value=0)
    .astype(float)
)
使用 Transform 求和并级联总计
# Replace each row's indicators with the totals for its chain: transform
# returns one value per original row, so every row of a chain ends up
# carrying that chain's sum.
_per_chain = s.groupby('chain_id')[['chain_hb', 'chain_length']]
s[['chain_hb', 'chain_length']] = _per_chain.transform('sum')
结果
chain_id Stat chain_hb chain_length
0 Hitout Hitout 0.0 0.0
1 RI-1 Kick 1.0 0.0
2 RI-1 Disposal 1.0 0.0
3 RI-1 Centre Clearance 1.0 0.0
4 RI-1 Tackle 1.0 0.0
5 Hitout Hitout 0.0 0.0
6 Hitout Hitout To Advantage 0.0 0.0
7 RI-7 Free Against 3.0 1.0
8 RI-7 Contested Possession 3.0 1.0
9 RI-7 Free For 3.0 1.0
10 RI-7 Handball 3.0 1.0
11 RI-7 Disposal 3.0 1.0
12 RI-7 Effective Disposal 3.0 1.0
13 RI-7 Stoppage Clearance 3.0 1.0
14 RI-7 Uncontested Possession 3.0 1.0
15 RI-7 Kick 3.0 1.0
16 RI-7 Effective Kick 3.0 1.0
17 RI-7 Disposal 3.0 1.0
18 RI-7 Effective Disposal 3.0 1.0
19 RI-7 Mark 3.0 1.0
20 RI-7 Uncontested Possession 3.0 1.0
21 RI-7 F 50 Mark 3.0 1.0
22 RI-7 Mark On Lead 3.0 1.0
23 RI-7 Kick 3.0 1.0
24 RI-7 Disposal 3.0 1.0
25 RI-7 Shot At Goal 3.0 1.0
26 RI-7 Behind 3.0 1.0
27 CA-27 Kick In 1.0 0.0
28 CA-27 One Percenter 1.0 0.0
29 CA-27 Kick 1.0 0.0
30 CA-27 Effective Kick 1.0 0.0
31 CA-27 Disposal 1.0 0.0
32 CA-27 Effective Disposal 1.0 0.0
33 CA-27 Rebound 50 1.0 0.0
34 CA-27 Spoil 1.0 0.0
35 CA-27 One Percenter 1.0 0.0
我正在处理一些原始数据,并希望统计每条链(由 'chain_id' 列中的唯一标识符 'c' 标识)中某些指标('Stat' 列)的出现次数;结果被保存到一个字典,然后映射到一个新列(下面未显示)。
不过我希望:
- 提高循环速度,我必须在 34k 行上从最初的 ~3 提高到 ~10 it/s。
- 改进 try/except 语句的结构。注意并非每条链的 value_counts() 输出中都会包含例如 'Kick' 或 'Mark' 等指标,因此缺失的指标必须记为 0。
我已经在 SOF 上搜索了其他方法,但现有的答案都不适用。请忽略 for 循环的缩进,编辑器不允许我更正它。
import pandas as pd
from tqdm.notebook import tqdm
# Sample event log. `s` holds one 'Stat' label per row, in row order; the
# entries are laid out here by the chain each run of rows belongs to.
s = [
    # chain 'Hitout' (1 row)
    'Hitout',
    # chain 'RI-1' (4 rows)
    'Kick', 'Disposal', 'Centre Clearance', 'Tackle',
    # chain 'Hitout' (2 rows)
    'Hitout', 'Hitout To Advantage',
    # chain 'RI-7' (20 rows)
    'Free Against', 'Contested Possession', 'Free For', 'Handball',
    'Disposal', 'Effective Disposal', 'Stoppage Clearance',
    'Uncontested Possession', 'Kick', 'Effective Kick', 'Disposal',
    'Effective Disposal', 'Mark', 'Uncontested Possession', 'F 50 Mark',
    'Mark On Lead', 'Kick', 'Disposal', 'Shot At Goal', 'Behind',
    # chain 'CA-27' (9 rows)
    'Kick In', 'One Percenter', 'Kick', 'Effective Kick', 'Disposal',
    'Effective Disposal', 'Rebound 50', 'Spoil', 'One Percenter',
]

# `x` holds the chain identifier of each row, built from the run lengths
# noted above so that it stays aligned with `s`.
x = (
    ['Hitout']
    + ['RI-1'] * 4
    + ['Hitout'] * 2
    + ['RI-7'] * 20
    + ['CA-27'] * 9
)

# One row per event: the chain it belongs to and the stat recorded.
df = pd.DataFrame({'chain_id': x, 'Stat': s})
# Fill the per-chain metric dictionaries (chain_count, ki_count, ...), keyed
# by chain id, with how often each tracked stat occurs in that chain.
#
# Stat label -> dictionary that receives that stat's per-chain count.
stat_targets = {
    'Disposal': chain_count,
    'Kick': ki_count,
    'Handball': hb_count,
    'Mark': m_count,
    'Goal': goal_count,
    'Behind': behind_count,
    'Contested Possession': cp_count,
    'Uncontested Possession': up_count,
    'Tackle': t_count,
}

# Count every (chain_id, Stat) pair in a single pass instead of filtering the
# whole frame and calling value_counts() once per chain.
# NOTE(review): hoisting this out of the loop assumes time(c) does not modify
# df -- confirm against its definition.
pair_counts = df.groupby(['chain_id', 'Stat']).size().to_dict()

for c in tqdm(chains):
    if c == 'Hitout':
        # The 'Hitout' id gets every metric, including chain_time, set to 0.
        for target in stat_targets.values():
            target[c] = 0
        chain_time[c] = 0
        continue

    for stat, target in stat_targets.items():
        # A stat that never occurs in this chain has no entry in the counts,
        # so it falls back to 0 (this replaces the bare try/except blocks,
        # which also hid every unrelated error).
        target[c] = pair_counts.get((c, stat), 0)
    chain_time[c] = time(c)
# Broadcast each per-chain metric back onto the rows of its chain: every
# output column is the row's chain_id looked up in the matching dictionary.
for column_name, per_chain_values in (
    ('chain_length', chain_count),
    ('chain_hb', hb_count),
    ('chain_ki', ki_count),
    ('chain_m', m_count),
    ('chain_goal', goal_count),
    ('chain_behind', behind_count),
    ('chain_cp', cp_count),
    ('chain_up', up_count),
    ('chain_t', t_count),
    ('chain_time', chain_time),
):
    df[column_name] = df['chain_id'].map(per_chain_values)
已编辑:包括一个示例,输出其当前的工作方式如下
创建一个状态列来隔离处置、手球和其他行为
# Tag each row with the output column its Stat contributes to: 'Disposal'
# rows feed 'chain_length', 'Handball' rows feed 'chain_hb', and every other
# stat is marked 'Notimportant'. These labels follow the definitions used by
# the counting loop (chain_length = Disposal count, chain_hb = Handball
# count); they were previously the other way round, so output printed before
# this fix shows the two columns swapped.
df['status'] = np.select(
    [df['Stat'].eq('Disposal'), df['Stat'].eq('Handball')],
    ['chain_length', 'chain_hb'],
    'Notimportant',
)
找出每个行为的发生频率
# Build one 0/1 indicator column per tracked label, aligned row-for-row with
# df, and attach them in place of the helper 'status' column.
#
# The indicators are computed on the whole column at once: one-hot encoding
# is row-wise, so a per-group groupby(...).apply(...) is not needed, and on
# pandas >= 2.0 that apply is expected to prepend the group keys to the
# index, which would stop the result from joining back onto df.
# reindex() keeps exactly the two tracked columns (dropping 'Notimportant')
# and creates an all-zero column if a label never occurs; astype(float)
# gives float 0.0/1.0 values rather than the bool columns newer pandas
# returns from get_dummies.
s = df.drop(columns=['status']).join(
    pd.get_dummies(df['status'])
    .reindex(columns=['chain_hb', 'chain_length'], fill_value=0)
    .astype(float)
)
使用 Transform 求和并级联总计
# Replace each row's indicators with the totals for its chain: transform
# returns one value per original row, so every row of a chain ends up
# carrying that chain's sum.
_per_chain = s.groupby('chain_id')[['chain_hb', 'chain_length']]
s[['chain_hb', 'chain_length']] = _per_chain.transform('sum')
结果
chain_id Stat chain_hb chain_length
0 Hitout Hitout 0.0 0.0
1 RI-1 Kick 1.0 0.0
2 RI-1 Disposal 1.0 0.0
3 RI-1 Centre Clearance 1.0 0.0
4 RI-1 Tackle 1.0 0.0
5 Hitout Hitout 0.0 0.0
6 Hitout Hitout To Advantage 0.0 0.0
7 RI-7 Free Against 3.0 1.0
8 RI-7 Contested Possession 3.0 1.0
9 RI-7 Free For 3.0 1.0
10 RI-7 Handball 3.0 1.0
11 RI-7 Disposal 3.0 1.0
12 RI-7 Effective Disposal 3.0 1.0
13 RI-7 Stoppage Clearance 3.0 1.0
14 RI-7 Uncontested Possession 3.0 1.0
15 RI-7 Kick 3.0 1.0
16 RI-7 Effective Kick 3.0 1.0
17 RI-7 Disposal 3.0 1.0
18 RI-7 Effective Disposal 3.0 1.0
19 RI-7 Mark 3.0 1.0
20 RI-7 Uncontested Possession 3.0 1.0
21 RI-7 F 50 Mark 3.0 1.0
22 RI-7 Mark On Lead 3.0 1.0
23 RI-7 Kick 3.0 1.0
24 RI-7 Disposal 3.0 1.0
25 RI-7 Shot At Goal 3.0 1.0
26 RI-7 Behind 3.0 1.0
27 CA-27 Kick In 1.0 0.0
28 CA-27 One Percenter 1.0 0.0
29 CA-27 Kick 1.0 0.0
30 CA-27 Effective Kick 1.0 0.0
31 CA-27 Disposal 1.0 0.0
32 CA-27 Effective Disposal 1.0 0.0
33 CA-27 Rebound 50 1.0 0.0
34 CA-27 Spoil 1.0 0.0
35 CA-27 One Percenter 1.0 0.0