聚合多列 Pandas
Aggregating multiple columns Pandas
目前我的 csv 是这样的:
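title | field1 | field2 | field3 | field4 |
---|---|---|---|---|
A | A1 | A11 | 553 | 0 |
A | A1 | A12 | 94 | 0 |
A | A1 | A13 | 30 | 0 |
A | A1 | {n/a} | 0 | 9586 |
A | A2 | A21 | 200 | 0 |
A | A2 | {n/a} | 0 | 3950 |
A | A3 | A31 | 35 | 0 |
A | A3 | {n/a} | 0 | 2929 |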
但我希望它看起来像这样:
title | field1 | field2 | field3 | field4 |
---|---|---|---|---|
A | A1 | A11 | 553 | 9586 |
A | A1 | A12 | 94 | 9586 |
A | A1 | A13 | 30 | 9586 |
A | A2 | A21 | 200 | 3950 |
A | A3 | A31 | 35 | 2929 |
这是我的代码:
def fun(df, cols_to_aggregate, cols_order):
    """Aggregate ``df`` per (field1, field2) pair and return chosen columns.

    Args:
        df: DataFrame containing at least ``field1``, ``field2`` and the
            columns named in ``cols_to_aggregate``.
        cols_to_aggregate: Mapping of column name to aggregation, passed
            straight to ``GroupBy.agg``.
        cols_order: List of column names to return, in that order.

    Returns:
        A new DataFrame with one row per (field1, field2) pair, a constant
        ``title`` column set to ``'A'``, restricted to ``cols_order``.
    """
    grouped = df.groupby(['field1', 'field2'], as_index=False)
    aggregated = grouped.agg(cols_to_aggregate)
    # The grouping only keeps the keys and aggregated columns, so the
    # title column is (re)created here.
    aggregated['title'] = 'A'
    return aggregated[cols_order]
def create_csv(df, month_date):
    """Build the aggregated frame by delegating to ``fun``.

    Args:
        df: Input DataFrame handed to ``fun``.
        month_date: Accepted but not used by this function.

    Returns:
        Whatever ``fun`` returns for the fixed aggregation and column order
        below. ``field4`` is summed but is not among the returned columns.
    """
    aggregations = {'field3': 'sum', 'field4': 'sum'}
    output_columns = ['title', 'field1', 'field2', 'field3']
    return fun(df, aggregations, output_columns)
任何帮助将不胜感激,因为我不知道如何将 field4 与所有相关的 field2 匹配。
使用:
def fun(df, cols_to_aggregate, cols_order):
    """Aggregate per (field1, field2), then spread field4 totals per field1.

    Args:
        df: DataFrame containing ``field1``, ``field2``, ``field4`` and the
            columns named in ``cols_to_aggregate``.
        cols_to_aggregate: Mapping of column name to aggregation, passed
            straight to ``GroupBy.agg``; it must produce a ``field4`` column.
        cols_order: List of column names to return, in that order.

    Returns:
        A new DataFrame with one row per (field1, field2) pair, ``title``
        set to ``'A'``, and ``field4`` replaced by the sum of ``field4``
        over all rows sharing the same ``field1``.
    """
    summary = df.groupby(['field1', 'field2'], as_index=False).agg(cols_to_aggregate)
    summary['title'] = 'A'
    # transform keeps the row count, so every row of a field1 group receives
    # that group's field4 total.
    field4_per_field1 = summary.groupby('field1')['field4'].transform('sum')
    summary['field4'] = field4_per_field1
    return summary[cols_order]
def create_csv(df, month_date):
    """Build the aggregated frame, including field4, by delegating to ``fun``.

    Args:
        df: Input DataFrame handed to ``fun``.
        month_date: Accepted but not used by this function.

    Returns:
        Whatever ``fun`` returns for the fixed aggregation and column order
        below.
    """
    aggregations = {'field3': 'sum', 'field4': 'sum'}
    # field4 is listed so it is kept in the returned frame.
    output_columns = ['title', 'field1', 'field2', 'field3', 'field4']
    return fun(df, aggregations, output_columns)
# Print the aggregated frame, keeping only rows whose field2 is not '{n/a}'.
print(create_csv(df, '2015-01').loc[lambda frame: frame['field2'] != '{n/a}'])
title field1 field2 field3 field4
0 A A1 A11 553 9586
1 A A1 A12 94 9586
2 A A1 A13 30 9586
4 A A2 A21 200 3950
6 A A3 A31 35 2929
或者,如果需要每个 field1 的第一个非 0 值,
使用:
def fun(df, cols_to_aggregate, cols_order):
    """Aggregate per (field1, field2), then spread the first field4 per field1.

    Args:
        df: DataFrame containing ``field1``, ``field2``, ``field4`` and the
            columns named in ``cols_to_aggregate``.
        cols_to_aggregate: Mapping of column name to aggregation, passed
            straight to ``GroupBy.agg``; it must produce a ``field4`` column.
        cols_order: List of column names to return, in that order.

    Returns:
        A new DataFrame with one row per (field1, field2) pair, ``title``
        set to ``'A'``, and ``field4`` replaced by the result of the
        ``'first'`` transform of ``field4`` within each ``field1`` group.
    """
    summary = df.groupby(['field1', 'field2'], as_index=False).agg(cols_to_aggregate)
    summary['title'] = 'A'
    # transform keeps the row count, so every row of a field1 group receives
    # that group's 'first' field4 value.
    first_field4 = summary.groupby('field1')['field4'].transform('first')
    summary['field4'] = first_field4
    return summary[cols_order]
def create_csv(df, month_date):
    """Build the aggregated frame, taking the first field4, via ``fun``.

    Args:
        df: Input DataFrame handed to ``fun``.
        month_date: Accepted but not used by this function.

    Returns:
        Whatever ``fun`` returns for the fixed aggregation and column order
        below.
    """
    aggregations = {'field3': 'sum', 'field4': 'first'}
    output_columns = ['title', 'field1', 'field2', 'field3', 'field4']
    return fun(df, aggregations, output_columns)
# Replace zeros in field4 with NaN before aggregating, then print only the
# rows whose field2 is not '{n/a}'.
print(
    create_csv(df.replace({'field4': {0: np.nan}}), '2015-01')
    .loc[lambda frame: frame['field2'] != '{n/a}']
)
title field1 field2 field3 field4
0 A A1 A11 553 9586.0
1 A A1 A12 94 9586.0
2 A A1 A13 30 9586.0
4 A A2 A21 200 3950.0
6 A A3 A31 35 2929.0
目前我的 csv 是这样的:
title | field1 | field2 | field3 | field4 |
---|---|---|---|---|
A | A1 | A11 | 553 | 0 |
A | A1 | A12 | 94 | 0 |
A | A1 | A13 | 30 | 0 |
A | A1 | {n/a} | 0 | 9586 |
A | A2 | A21 | 200 | 0 |
A | A2 | {n/a} | 0 | 3950 |
A | A3 | A31 | 35 | 0 |
A | A3 | {n/a} | 0 | 2929 |
但我希望它看起来像这样:
title | field1 | field2 | field3 | field4 |
---|---|---|---|---|
A | A1 | A11 | 553 | 9586 |
A | A1 | A12 | 94 | 9586 |
A | A1 | A13 | 30 | 9586 |
A | A2 | A21 | 200 | 3950 |
A | A3 | A31 | 35 | 2929 |
这是我的代码:
def fun(df, cols_to_aggregate, cols_order):
    """Aggregate ``df`` per (field1, field2) pair and return chosen columns.

    Args:
        df: DataFrame containing at least ``field1``, ``field2`` and the
            columns named in ``cols_to_aggregate``.
        cols_to_aggregate: Mapping of column name to aggregation, passed
            straight to ``GroupBy.agg``.
        cols_order: List of column names to return, in that order.

    Returns:
        A new DataFrame with one row per (field1, field2) pair, a constant
        ``title`` column set to ``'A'``, restricted to ``cols_order``.
    """
    grouped = df.groupby(['field1', 'field2'], as_index=False)
    aggregated = grouped.agg(cols_to_aggregate)
    # The grouping only keeps the keys and aggregated columns, so the
    # title column is (re)created here.
    aggregated['title'] = 'A'
    return aggregated[cols_order]
def create_csv(df, month_date):
    """Build the aggregated frame by delegating to ``fun``.

    Args:
        df: Input DataFrame handed to ``fun``.
        month_date: Accepted but not used by this function.

    Returns:
        Whatever ``fun`` returns for the fixed aggregation and column order
        below. ``field4`` is summed but is not among the returned columns.
    """
    aggregations = {'field3': 'sum', 'field4': 'sum'}
    output_columns = ['title', 'field1', 'field2', 'field3']
    return fun(df, aggregations, output_columns)
任何帮助将不胜感激,因为我不知道如何将 field4 与所有相关的 field2 匹配。
使用:
def fun(df, cols_to_aggregate, cols_order):
    """Aggregate per (field1, field2), then spread field4 totals per field1.

    Args:
        df: DataFrame containing ``field1``, ``field2``, ``field4`` and the
            columns named in ``cols_to_aggregate``.
        cols_to_aggregate: Mapping of column name to aggregation, passed
            straight to ``GroupBy.agg``; it must produce a ``field4`` column.
        cols_order: List of column names to return, in that order.

    Returns:
        A new DataFrame with one row per (field1, field2) pair, ``title``
        set to ``'A'``, and ``field4`` replaced by the sum of ``field4``
        over all rows sharing the same ``field1``.
    """
    summary = df.groupby(['field1', 'field2'], as_index=False).agg(cols_to_aggregate)
    summary['title'] = 'A'
    # transform keeps the row count, so every row of a field1 group receives
    # that group's field4 total.
    field4_per_field1 = summary.groupby('field1')['field4'].transform('sum')
    summary['field4'] = field4_per_field1
    return summary[cols_order]
def create_csv(df, month_date):
    """Build the aggregated frame, including field4, by delegating to ``fun``.

    Args:
        df: Input DataFrame handed to ``fun``.
        month_date: Accepted but not used by this function.

    Returns:
        Whatever ``fun`` returns for the fixed aggregation and column order
        below.
    """
    aggregations = {'field3': 'sum', 'field4': 'sum'}
    # field4 is listed so it is kept in the returned frame.
    output_columns = ['title', 'field1', 'field2', 'field3', 'field4']
    return fun(df, aggregations, output_columns)
# Print the aggregated frame, keeping only rows whose field2 is not '{n/a}'.
print(create_csv(df, '2015-01').loc[lambda frame: frame['field2'] != '{n/a}'])
title field1 field2 field3 field4
0 A A1 A11 553 9586
1 A A1 A12 94 9586
2 A A1 A13 30 9586
4 A A2 A21 200 3950
6 A A3 A31 35 2929
或者,如果需要每个 field1 的第一个非 0 值,
使用:
def fun(df, cols_to_aggregate, cols_order):
    """Aggregate per (field1, field2), then spread the first field4 per field1.

    Args:
        df: DataFrame containing ``field1``, ``field2``, ``field4`` and the
            columns named in ``cols_to_aggregate``.
        cols_to_aggregate: Mapping of column name to aggregation, passed
            straight to ``GroupBy.agg``; it must produce a ``field4`` column.
        cols_order: List of column names to return, in that order.

    Returns:
        A new DataFrame with one row per (field1, field2) pair, ``title``
        set to ``'A'``, and ``field4`` replaced by the result of the
        ``'first'`` transform of ``field4`` within each ``field1`` group.
    """
    summary = df.groupby(['field1', 'field2'], as_index=False).agg(cols_to_aggregate)
    summary['title'] = 'A'
    # transform keeps the row count, so every row of a field1 group receives
    # that group's 'first' field4 value.
    first_field4 = summary.groupby('field1')['field4'].transform('first')
    summary['field4'] = first_field4
    return summary[cols_order]
def create_csv(df, month_date):
    """Build the aggregated frame, taking the first field4, via ``fun``.

    Args:
        df: Input DataFrame handed to ``fun``.
        month_date: Accepted but not used by this function.

    Returns:
        Whatever ``fun`` returns for the fixed aggregation and column order
        below.
    """
    aggregations = {'field3': 'sum', 'field4': 'first'}
    output_columns = ['title', 'field1', 'field2', 'field3', 'field4']
    return fun(df, aggregations, output_columns)
# Replace zeros in field4 with NaN before aggregating, then print only the
# rows whose field2 is not '{n/a}'.
print(
    create_csv(df.replace({'field4': {0: np.nan}}), '2015-01')
    .loc[lambda frame: frame['field2'] != '{n/a}']
)
title field1 field2 field3 field4
0 A A1 A11 553 9586.0
1 A A1 A12 94 9586.0
2 A A1 A13 30 9586.0
4 A A2 A21 200 3950.0
6 A A3 A31 35 2929.0