Pandas groupby、assign 和 to_excel —— 循环/重复执行
Pandas groupby, assign and to_excel - on loop/repeat
我有一个如下所示的数据框
import numpy as np
import pandas as pd
from numpy.random import default_rng
# Seeded generator so the sample frame is reproducible.
rng = default_rng(100)
n_rows = 5

# Value pools for the randomly drawn columns. The dict order fixes both the
# column order and the order in which the generator is consumed.
pools = {
    'customer': list('ACD'),
    'region': list('PQRS'),
    'dumeel': list('QWER'),
    'dumma': 1234,            # an int pool draws from range(1234)
    'target_at50': [0, 1],
    'target_at60': [1, 1],    # always 1
    'target_at70': [0, 0],    # always 0
}

# Sample frame: two fixed columns followed by one random draw per pool.
cdf = pd.DataFrame({
    'Id': [1, 2, 3, 4, 5],
    'year': [2017, 2017, 2018, 2019, 2018],
    **{name: rng.choice(pool, size=n_rows) for name, pool in pools.items()},
})
我的目标是完成以下事情
a) 根据多个条件对列进行分组(如下代码所示)
b) 根据目标列分配默认值。 (例如:如果 target_at50,则赋值 50,如果 target_at60,则赋值 60。如果 target_at70,则赋值 70)
c) 对不同的目标列重复相同的分组条件 (target_at60, target_at70)
d) 将每个目标的每个组语句的结果写入单个 excel 文件中的新 sheet。
我正在尝试类似下面的方法,但它既不高效也不优雅。您可以看到我使用相同的代码,但针对不同的目标列重复了三次 (target_at50、target_at60、target_at70)。
# Hand-written version: one statement per (grouping, target) pair.
# Each statement counts rows per group, sorts by region and Count (both
# descending), tags the rows with the threshold that matches the target
# column, and writes the result to its own sheet.
# NOTE(review): `writer` is not defined in this snippet; presumably an open
# pd.ExcelWriter -- confirm against the surrounding code.
# `sheet_name` is passed by keyword: passing it positionally to to_excel is
# deprecated in recent pandas releases.

# target_at50 -> threshold 50 (sheet1 .. sheet6)
cdf.groupby(['region','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet1')
cdf.groupby(['region','customer','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet2')
cdf.groupby(['region','dumeel','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet3')
cdf.groupby(['region','year','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet4')
cdf.groupby(['region','year','customer','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet5')
cdf.groupby(['region','year','dumeel','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet6')

# target_at60 -> threshold 60 (sheet7 .. sheet12)
cdf.groupby(['region','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet7')
cdf.groupby(['region','customer','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet8')
cdf.groupby(['region','dumeel','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet9')
cdf.groupby(['region','year','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet10')
cdf.groupby(['region','year','customer','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet11')
cdf.groupby(['region','year','dumeel','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet12')

# target_at70 -> threshold 70 (sheet13 .. sheet18)
cdf.groupby(['region','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet13')
cdf.groupby(['region','customer','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet14')
cdf.groupby(['region','dumeel','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet15')
cdf.groupby(['region','year','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet16')
cdf.groupby(['region','year','customer','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet17')
cdf.groupby(['region','year','dumeel','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet18')
虽然以上所有的 group by 语句都可以正常工作,但我想知道是否有任何有效和更好的方法来执行上述任务?
我希望我的输出只有一个 excel 和多个 sheets(18 sheets)(这将有 18 个 groupby 语句的输出)
首先生成要传递给 groupby 的所有组合:
from itertools import combinations
# Columns whose name contains 'target'; each becomes the last grouping key.
targets = cdf.filter(like='target').columns
cols = ['customer', 'year', 'dumeel']

# Every subset of `cols`, from the empty subset up to all of them,
# ordered by subset size.
subsets = [extra
           for size in range(len(cols) + 1)
           for extra in combinations(cols, size)]

# One grouping-key tuple per (target, subset): 'region' first, target last.
tups = [('region', *extra, target)
        for target in targets
        for extra in subsets]
print(tups)
然后在循环中写入聚合值:
import re

# Write one sheet per grouping-key tuple into a single workbook; the file is
# saved when the `with` block exits.
with pd.ExcelWriter('Values.xlsx') as writer:
    for i, val in enumerate(tups, 1):
        # The target column is the last key; its trailing number is the
        # threshold (e.g. 'target_at50' -> 50).
        threshold = int(re.findall(r'\d+', val[-1])[0])
        # groupby needs a list here: a tuple `by` is interpreted by pandas
        # as one single key, not as several column names.
        df = (cdf.groupby(list(val)).size().reset_index(name='Count')
                 .sort_values(by=['region', 'Count'], ascending=False)
                 .assign(threshold=threshold))
        df.to_excel(writer, sheet_name=f'sheet{i}')
我有一个如下所示的数据框
import numpy as np
import pandas as pd
from numpy.random import default_rng
# Seeded generator so the sample frame is reproducible.
rng = default_rng(100)
n_rows = 5

# Value pools for the randomly drawn columns. The dict order fixes both the
# column order and the order in which the generator is consumed.
pools = {
    'customer': list('ACD'),
    'region': list('PQRS'),
    'dumeel': list('QWER'),
    'dumma': 1234,            # an int pool draws from range(1234)
    'target_at50': [0, 1],
    'target_at60': [1, 1],    # always 1
    'target_at70': [0, 0],    # always 0
}

# Sample frame: two fixed columns followed by one random draw per pool.
cdf = pd.DataFrame({
    'Id': [1, 2, 3, 4, 5],
    'year': [2017, 2017, 2018, 2019, 2018],
    **{name: rng.choice(pool, size=n_rows) for name, pool in pools.items()},
})
我的目标是完成以下事情
a) 根据多个条件对列进行分组(如下代码所示)
b) 根据目标列分配默认值。 (例如:如果 target_at50,则赋值 50,如果 target_at60,则赋值 60。如果 target_at70,则赋值 70)
c) 对不同的目标列重复相同的分组条件 (target_at60, target_at70)
d) 将每个目标的每个组语句的结果写入单个 excel 文件中的新 sheet。
我正在尝试类似下面的方法,但它既不高效也不优雅。您可以看到我使用相同的代码,但针对不同的目标列重复了三次 (target_at50、target_at60、target_at70)。
# Hand-written version: one statement per (grouping, target) pair.
# Each statement counts rows per group, sorts by region and Count (both
# descending), tags the rows with the threshold that matches the target
# column, and writes the result to its own sheet.
# NOTE(review): `writer` is not defined in this snippet; presumably an open
# pd.ExcelWriter -- confirm against the surrounding code.
# `sheet_name` is passed by keyword: passing it positionally to to_excel is
# deprecated in recent pandas releases.

# target_at50 -> threshold 50 (sheet1 .. sheet6)
cdf.groupby(['region','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet1')
cdf.groupby(['region','customer','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet2')
cdf.groupby(['region','dumeel','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet3')
cdf.groupby(['region','year','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet4')
cdf.groupby(['region','year','customer','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet5')
cdf.groupby(['region','year','dumeel','target_at50']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=50).to_excel(writer,sheet_name='sheet6')

# target_at60 -> threshold 60 (sheet7 .. sheet12)
cdf.groupby(['region','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet7')
cdf.groupby(['region','customer','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet8')
cdf.groupby(['region','dumeel','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet9')
cdf.groupby(['region','year','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet10')
cdf.groupby(['region','year','customer','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet11')
cdf.groupby(['region','year','dumeel','target_at60']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=60).to_excel(writer,sheet_name='sheet12')

# target_at70 -> threshold 70 (sheet13 .. sheet18)
cdf.groupby(['region','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet13')
cdf.groupby(['region','customer','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet14')
cdf.groupby(['region','dumeel','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet15')
cdf.groupby(['region','year','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet16')
cdf.groupby(['region','year','customer','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet17')
cdf.groupby(['region','year','dumeel','target_at70']).size().reset_index(name='Count').sort_values(by=['region','Count'],ascending=False).assign(threshold=70).to_excel(writer,sheet_name='sheet18')
虽然以上所有的 group by 语句都可以正常工作,但我想知道是否有任何有效和更好的方法来执行上述任务?
我希望我的输出只有一个 excel 和多个 sheets(18 sheets)(这将有 18 个 groupby 语句的输出)
首先生成要传递给 groupby 的所有组合:
from itertools import combinations
# Columns whose name contains 'target'; each becomes the last grouping key.
targets = cdf.filter(like='target').columns
cols = ['customer', 'year', 'dumeel']

# Every subset of `cols`, from the empty subset up to all of them,
# ordered by subset size.
subsets = [extra
           for size in range(len(cols) + 1)
           for extra in combinations(cols, size)]

# One grouping-key tuple per (target, subset): 'region' first, target last.
tups = [('region', *extra, target)
        for target in targets
        for extra in subsets]
print(tups)
然后在循环中写入聚合值:
import re

# Write one sheet per grouping-key tuple into a single workbook; the file is
# saved when the `with` block exits.
with pd.ExcelWriter('Values.xlsx') as writer:
    for i, val in enumerate(tups, 1):
        # The target column is the last key; its trailing number is the
        # threshold (e.g. 'target_at50' -> 50).
        threshold = int(re.findall(r'\d+', val[-1])[0])
        # groupby needs a list here: a tuple `by` is interpreted by pandas
        # as one single key, not as several column names.
        df = (cdf.groupby(list(val)).size().reset_index(name='Count')
                 .sort_values(by=['region', 'Count'], ascending=False)
                 .assign(threshold=threshold))
        df.to_excel(writer, sheet_name=f'sheet{i}')