Pandas 多条件分组

Pandas groupby with multiple conditions

我正在尝试创建通话记录摘要。一共有 4 种情况:

  1. 某个 phone 只有一条通话记录,而且有结果:我们选择该记录的持续时间、状态和 outcome_record 的值
  2. 同一 phone 有多条通话记录且有结果:我们选择有结果的记录中持续时间最长的那一条的状态、持续时间和 outcome_record
  3. 某个 phone 只有一条通话记录,没有结果:我们选择其持续时间和状态的值。outcome_record 将是 None
  4. 同一 phone 有多条通话记录且都没有结果:我们选择持续时间最长的那条记录的状态和持续时间。outcome_record 将是 None

我尝试的做法是循环遍历各个分组,但是在处理大量数据时速度非常慢。我想我需要使用 pandas 方法而不是循环。如何使用 pandas 方法在多个条件下实现相同的效果?谢谢

import pandas as pd
def get_summarized_call_logs_df(df):
    """Summarize call logs into one row per phone.

    The representative call of each phone is chosen as follows:

    * if at least one of its calls has ``outcome == True``, the longest of
      those calls is used and ``outcome_record`` is that call's ``id``;
    * otherwise the longest call overall is used and ``outcome_record`` is
      missing (NaN).

    When several candidate calls share the maximum duration, the one that
    appears first in ``df`` wins.

    Args:
        df: Call logs with the columns ``id``, ``phone``, ``outcome``,
            ``status`` and ``duration``.

    Returns:
        A DataFrame with the columns ``phone``, ``has_outcome``, ``status``,
        ``duration`` and ``outcome_record`` (always in that order), one row
        per phone, ordered by each phone's first appearance in ``df``.
        An empty input yields an empty frame with those same columns.
    """
    columns = ["phone", "has_outcome", "status", "duration", "outcome_record"]
    if df.empty:
        # Keep the schema stable so callers can rely on the column names.
        return pd.DataFrame(columns=columns)

    calls = df[["id", "phone", "status", "duration"]].assign(
        has_outcome=df["outcome"].eq(True),
        # Original row position: explicit tie-breaker so the earliest row wins.
        _row=range(len(df)),
    )

    # Within each phone, put outcome rows first, then longer durations, then
    # earlier rows; the first row per phone is therefore the one to report.
    best = calls.sort_values(
        ["has_outcome", "duration", "_row"],
        ascending=[False, False, True],
    ).drop_duplicates("phone", keep="first")

    # A left merge keeps the left key order, i.e. first appearance in ``df``.
    first_seen = calls[["phone"]].drop_duplicates()
    summary = first_seen.merge(best, on="phone", how="left")

    # Only phones that actually have an outcome report an outcome record.
    summary["outcome_record"] = summary["id"].where(summary["has_outcome"])
    return summary[columns]

if __name__ == "__main__":
    # Sample call logs, written as compact tuples and expanded into the
    # record dicts that the summarizer expects.
    fields = ("id", "phone", "outcome", "status", "duration")
    rows = [
        (1, "123", True, "sale", 1550),
        (2, "123", False, "failed", 3),
        (3, "123", False, "no_ring", 5),
        (4, "456", True, "call_back", 550),
        (5, "456", True, "sale", 2500),
        (6, "456", False, "no_ring", 5),
        (7, "789", False, "no_pick", 4),
        (8, "741", False, "try_again", 25),
        (9, "741", False, "try_again", 10),
        (10, "741", False, "no_ring", 5),
    ]
    data = [dict(zip(fields, row)) for row in rows]
    df = pd.DataFrame(data)
    # Collapse the logs to one summary row per phone and show the result.
    new_df = get_summarized_call_logs_df(df)
    print(new_df)

它应该产生如下输出:

  phone  has_outcome     status  duration  outcome_record
0   123         True       sale      1550             1.0
1   456         True       sale      2500             5.0
2   789        False    no_pick         4             NaN
3   741        False  try_again        25             NaN

我认为你可以简化逻辑。如果先按 'phone'、再按 'outcome' 和 'duration' 对行进行排序,则只需按 'phone' 删除重复项并保留每个分组排序后的最后一行,如下所示:

# Sort keys: phone first, then outcome, then duration.
cols = ['phone', 'outcome', 'duration']
ordered = df.sort_values(by=cols)
# Keep only the final row of every phone in the sorted frame.
new_df = ordered.drop_duplicates(subset='phone', keep='last')
print(new_df)

# Output:
   id phone  outcome     status  duration
0   1   123     True       sale      1550
4   5   456     True       sale      2500
7   8   741    False  try_again        25
6   7   789    False    no_pick         4

根据 @user10375196 的建议,可以这样得到预期结果:

# Switch to the column names used in the expected summary.
new_df = new_df.rename(columns={'id': 'outcome_record', 'outcome': 'has_outcome'})
# Phones whose kept row has no outcome must not report an outcome record.
without_outcome = new_df['has_outcome'] == False
new_df.loc[without_outcome, "outcome_record"] = None
# Renumber the rows from zero, discarding the old index.
new_df = new_df.reset_index(drop=True)
print(new_df)

# Output:
   outcome_record phone  has_outcome     status  duration
0             1.0   123         True       sale      1550
1             5.0   456         True       sale      2500
2             NaN   741        False  try_again        25
3             NaN   789        False    no_pick         4

这里再提供一种基于 convtools 的流式处理替代方案(不需要将全部输入数据放入内存):

from convtools import conversion as c

# Sample call logs: one dict per call with id, phone, outcome, status, duration.
# fmt: off
data = [
    {"id": 1, "phone": "123", "outcome": True, "status": "sale", "duration": 1550},
    {"id": 2, "phone": "123", "outcome": False, "status": "failed", "duration": 3},
    {"id": 3, "phone": "123", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 4, "phone": "456", "outcome": True, "status": "call_back", "duration": 550},
    {"id": 5, "phone": "456", "outcome": True, "status": "sale", "duration": 2500},
    {"id": 6, "phone": "456", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 7, "phone": "789", "outcome": False, "status": "no_pick", "duration": 4},
    {"id": 8, "phone": "741", "outcome": False, "status": "try_again", "duration": 25},
    {"id": 9, "phone": "741", "outcome": False, "status": "try_again", "duration": 10},
    {"id": 10, "phone": "741", "outcome": False, "status": "no_ring", "duration": 5},
]
# fmt: on

# Reducer: the row with the maximum "duration" within each group.
# NOTE(review): no `where=` condition is passed here, so this presumably
# considers every row of a phone, not only rows with an outcome. If a call
# without outcome were longer than the longest call with outcome, the
# status/duration/outcome_record below would come from that no-outcome row.
# The sample data does not exercise this case - confirm the intended rule.
max_duration_call_log = c.ReduceFuncs.MaxRow(c.item("duration"))

# Reducer: whether the group has at least one row where "outcome" is truthy.
has_outcome = c.ReduceFuncs.Count(where=c.item("outcome")) > 0

# Build a converter that groups the input rows by "phone" and emits one
# summary dict per phone.
converter = (
    c.group_by(c.item("phone"))
    .aggregate(
        {
            "phone": c.item("phone"),
            "has_outcome": has_outcome,
            "status": max_duration_call_log.item("status"),
            "duration": max_duration_call_log.item("duration"),
            # Report the id of the max-duration row only when the phone has
            # an outcome; otherwise None.
            "outcome_record": c.if_(
                has_outcome,
                max_duration_call_log.item("id"),
                None,
            ),
        }
    )
    # this step generates and compiles ad hoc function
    .gen_converter()
)

# Sanity check: the converter reproduces the expected summary for the sample
# data, with phones in the order they first appear in the input.
# fmt: off
assert converter(data) == [
    {'phone': '123', 'has_outcome': True, 'status': 'sale', 'duration': 1550, 'outcome_record': 1},
    {'phone': '456', 'has_outcome': True, 'status': 'sale', 'duration': 2500, 'outcome_record': 5},
    {'phone': '789', 'has_outcome': False, 'status': 'no_pick', 'duration': 4, 'outcome_record': None},
    {'phone': '741', 'has_outcome': False, 'status': 'try_again', 'duration': 25, 'outcome_record': None},
]
# fmt: on