在 Pandas 中聚合多列

Aggregating multiple columns in Pandas

目前我的 csv 是这样的:

title field1 field2 field3 field4
A A1 A11 553 0
A A1 A12 94 0
A A1 A13 30 0
A A1 {n/a} 0 9586
A A2 A21 200 0
A A2 {n/a} 0 3950
A A3 A31 35 0
A A3 {n/a} 0 2929

但我希望它看起来像这样:

title field1 field2 field3 field4
A A1 A11 553 9586
A A1 A12 94 9586
A A1 A13 30 9586
A A2 A21 200 3950
A A3 A31 35 2929

这是我的代码:

def fun(df, cols_to_aggregate, cols_order):
    """Aggregate *df* per (field1, field2) pair and return selected columns.

    Args:
        df: DataFrame containing at least 'field1', 'field2' and every
            column named in ``cols_to_aggregate``.
        cols_to_aggregate: Mapping of column name to aggregation, passed
            straight to ``GroupBy.agg``.
        cols_order: Column labels to keep, in the order they should appear.

    Returns:
        A new DataFrame with one row per (field1, field2) pair, a constant
        'title' column set to 'A', restricted to ``cols_order``.
    """
    grouped = df.groupby(['field1', 'field2'], as_index=False)
    summary = grouped.agg(cols_to_aggregate).assign(title='A')
    return summary[cols_order]


def create_csv(df, month_date):
    """Build the aggregated table by delegating to ``fun``.

    Args:
        df: Source DataFrame handed to ``fun``.
        month_date: Accepted but not used by this function.

    Returns:
        Whatever ``fun`` returns for the sum aggregations below, limited to
        the 'title', 'field1', 'field2' and 'field3' columns.
    """
    aggregations = {'field3': 'sum', 'field4': 'sum'}
    column_order = ['title', 'field1', 'field2', 'field3']
    return fun(df, aggregations, column_order)

任何帮助将不胜感激,因为我不知道如何将 field4 与所有相关的 field2 匹配。

使用:

def fun(df, cols_to_aggregate, cols_order):
    """Aggregate per (field1, field2), then spread field4 totals per field1.

    Args:
        df: DataFrame containing at least 'field1', 'field2' and every
            column named in ``cols_to_aggregate`` (which must yield a
            'field4' column).
        cols_to_aggregate: Mapping of column name to aggregation, passed
            straight to ``GroupBy.agg``.
        cols_order: Column labels to keep, in the order they should appear.

    Returns:
        A new DataFrame with one row per (field1, field2) pair, a constant
        'title' column set to 'A', and 'field4' replaced by the sum of
        'field4' over all rows sharing the same 'field1'.
    """
    aggregated = df.groupby(['field1', 'field2'], as_index=False).agg(cols_to_aggregate)
    aggregated['title'] = 'A'
    # transform keeps the original row count, so every row in a field1
    # group receives that group's field4 total.
    field4_per_field1 = aggregated.groupby('field1')['field4']
    aggregated['field4'] = field4_per_field1.transform('sum')
    return aggregated[cols_order]


def create_csv(df, month_date):
    """Build the aggregated table, including 'field4', by calling ``fun``.

    Args:
        df: Source DataFrame handed to ``fun``.
        month_date: Accepted but not used by this function.

    Returns:
        Whatever ``fun`` returns for the sum aggregations below, limited to
        the 'title', 'field1', 'field2', 'field3' and 'field4' columns.
    """
    aggregations = {'field3': 'sum', 'field4': 'sum'}
    # 'field4' is listed here so it is kept in the returned frame.
    column_order = ['title', 'field1', 'field2', 'field3', 'field4']
    return fun(df, aggregations, column_order)

# Show the result without the '{n/a}' rows of field2.
print(
    create_csv(df, '2015-01')
    .loc[lambda frame: frame['field2'] != '{n/a}']
)
  title field1 field2  field3  field4
0     A     A1    A11     553    9586
1     A     A1    A12      94    9586
2     A     A1    A13      30    9586
4     A     A2    A21     200    3950
6     A     A3    A31      35    2929

或者,如果需要每个 field1 的第一个非 0 值,请使用:

def fun(df, cols_to_aggregate, cols_order):
    """Aggregate per (field1, field2), then spread the first field4 per field1.

    Args:
        df: DataFrame containing at least 'field1', 'field2' and every
            column named in ``cols_to_aggregate`` (which must yield a
            'field4' column).
        cols_to_aggregate: Mapping of column name to aggregation, passed
            straight to ``GroupBy.agg``.
        cols_order: Column labels to keep, in the order they should appear.

    Returns:
        A new DataFrame with one row per (field1, field2) pair, a constant
        'title' column set to 'A', and 'field4' replaced by the value that
        ``transform('first')`` yields for the row's 'field1' group.
    """
    aggregated = df.groupby(['field1', 'field2'], as_index=False).agg(cols_to_aggregate)
    aggregated['title'] = 'A'
    # transform keeps the original row count, so every row in a field1
    # group receives the same 'first' value.
    field4_per_field1 = aggregated.groupby('field1')['field4']
    aggregated['field4'] = field4_per_field1.transform('first')
    return aggregated[cols_order]


def create_csv(df, month_date):
    """Build the aggregated table using 'first' for field4 via ``fun``.

    Args:
        df: Source DataFrame handed to ``fun``.
        month_date: Accepted but not used by this function.

    Returns:
        Whatever ``fun`` returns when 'field3' is summed and 'field4' is
        aggregated with 'first', limited to the 'title', 'field1',
        'field2', 'field3' and 'field4' columns.
    """
    aggregations = {'field3': 'sum', 'field4': 'first'}
    column_order = ['title', 'field1', 'field2', 'field3', 'field4']
    return fun(df, aggregations, column_order)

# Replace 0 in field4 with NaN before aggregating, then show the result
# without the '{n/a}' rows of field2.
print(
    create_csv(df.replace({'field4': {0: np.nan}}), '2015-01')
    .loc[lambda frame: frame['field2'] != '{n/a}']
)
  title field1 field2  field3  field4
0     A     A1    A11     553  9586.0
1     A     A1    A12      94  9586.0
2     A     A1    A13      30  9586.0
4     A     A2    A21     200  3950.0
6     A     A3    A31      35  2929.0