使用 pandas 按特定字段的最大值合并两个数据集

combining 2 data sets with pandas on specific fields by max value

嘿,我正在重构我项目中的一些旧遗留代码。

Here is the old code:

def find_most_session(campaigns, key, search):
    """Locate a row of ``campaigns`` whose ``'campaign_id'`` equals ``key``.

    Args:
        campaigns: Sequence of dict-like rows; each row is read through its
            ``'campaign_id'`` key and, for matching rows in search mode, its
            ``'sessions'`` key.
        key: Campaign id to look for.
        search: When truthy, return the index of the matching row with the
            largest ``'sessions'`` value (the earliest such row on ties;
            only values strictly greater than 0 qualify). When falsy,
            return the index of the first matching row.

    Returns:
        The integer position of the selected row, or ``None`` when no row
        qualifies.
    """
    matching = (
        (position, row)
        for position, row in enumerate(campaigns)
        if row['campaign_id'] == key
    )

    if not search:
        # First hit wins; the generator stops scanning right there.
        first_hit = next(matching, None)
        return None if first_hit is None else first_hit[0]

    best_position = None
    best_sessions = 0
    for position, row in matching:
        # Strict comparison keeps the earliest row among equal maxima.
        if row['sessions'] > best_sessions:
            best_position, best_sessions = position, row['sessions']
    return best_position

# Copy provider revenue onto the best-session campaign row.
#
# `merged` tracks campaign ids already seen: the first item carrying a given
# id is only recorded in `merged`; any later item with the same id triggers
# the revenue lookup below.
#
# NOTE(review): the sample provider rows carry 'revenue' as strings
# (e.g. '48.0000'). Under Python 3 the `> 0` comparison would raise
# TypeError for such values -- confirm the runtime / convert upstream.
for item in items:
    campaign_id = item['campaign_id']
    if campaign_id not in merged:
        merged[campaign_id] = item
        continue

    campaign_index = find_most_session(campaign_data, campaign_id, True)
    revenue_index = find_most_session(provider_data, campaign_id, False)
    if campaign_index is None:
        # No campaign row to annotate.
        continue

    campaign = campaign_data[campaign_index]
    provider_revenue = None
    if revenue_index is not None:
        provider_revenue = provider_data[revenue_index]['revenue']

    # Keep the provider figure only when it exists, is positive and differs
    # from the revenue already stored on the campaign row; otherwise 0.
    if (
        revenue_index is not None
        and provider_revenue > 0
        and provider_revenue != campaign['revenue']
    ):
        campaign['taboola_revenue'] = provider_revenue
    else:
        campaign['taboola_revenue'] = 0

Here is some example data:

campaign_data = [{u'sessions': 365.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4535814', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 361}, {u'sessions': 7.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4620856', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 4}, {u'sessions': 8.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4621240', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 8}, {u'sessions': 5.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4676111', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 5}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4686333', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2}, {u'sessions': 15.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4710945', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 15}, {u'sessions': 7.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4740661', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 6}, {u'sessions': 1.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4740727', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1}, {u'sessions': 1392.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4771091', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1405}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4771908', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2}, {u'sessions': 44.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4772767', u'source': u'taboola', u'device': u'desktop', u'data_date': 
datetime.date(2020, 6, 15), u'clicks': 45}, {u'sessions': 4.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4806006', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 5}, {u'sessions': 12.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4823520', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 12}, {u'sessions': 4.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4823652', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 4}, {u'sessions': 1.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4844285', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1}, {u'sessions': 1407.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4844343', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1453}, {u'sessions': 7.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4863932', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 8}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4863940', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 15090}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4864293', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2476}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4864408', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2}]

provider_data = [{u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 5172, u'campaign_id': u'(no campaign)', u'revenue': '48.0000'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 31676, u'campaign_id': u'(no campaign)', u'revenue': '70.8700'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 849, u'campaign_id': u'(no campaign)', u'revenue': '0.7500'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 292, u'campaign_id': u'(no campaign)', u'revenue': '0.1000'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 2573, u'campaign_id': u'(no campaign)', u'revenue': '2.5600'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 28244, u'campaign_id': u'(no campaign)', u'revenue': '48.8200'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 9422, u'campaign_id': u'(no campaign)', u'revenue': '11.9000'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 8682, u'campaign_id': u'(no campaign)', u'revenue': '12.0300'}, {u'last_update': datetime.datetime(2020, 6, 16, 16, 0), u'sessions': 3740, u'campaign_id': u'(no campaign)', u'revenue': '25.5300'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 43476, u'campaign_id': u'(no campaign)', u'revenue': '181.4400'}, {u'last_update': datetime.datetime(2020, 6, 16, 16, 0), u'sessions': 47393, u'campaign_id': u'(no campaign)', u'revenue': '139.7100'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 16044, u'campaign_id': u'(no campaign)', u'revenue': '54.2700'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 16793, u'campaign_id': u'(no campaign)', u'revenue': '60.8400'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 24350, u'campaign_id': u'(no campaign)', u'revenue': '68.8100'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 6997, u'campaign_id': u'(no campaign)', 
u'revenue': '38.6300'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 1482, u'campaign_id': u'(no campaign)', u'revenue': '0.6600'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 28287, u'campaign_id': u'(no campaign)', u'revenue': '83.1400'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 13732, u'campaign_id': u'(no campaign)', u'revenue': '24.9500'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 34987, u'campaign_id': u'(no campaign)', u'revenue': '142.6300'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 3181, u'campaign_id': u'(no campaign)', u'revenue': '11.0400'}] 

问题是,这段代码目前的时间复杂度是 O(n^2);如今我的数据已有大约 20 万行,运行一次大约需要 30 分钟。

我确信可以用 pandas 以更高效的方式得到相同的结果。

我已经重写了 find_most_session 函数:

def find_most_session(campaigns, key, search):
    """Return the index of the row matching ``key``, or ``None`` if absent.

    pandas-based lookup that follows the contract of the loop-based
    ``find_most_session``: it honours ``search`` and reports a missing
    campaign as ``None`` instead of raising ``IndexError``.

    Args:
        campaigns: Rows accepted by ``pd.DataFrame`` (e.g. a list of dicts)
            exposing ``'campaign_id'`` and ``'sessions'`` columns.
        key: Campaign id to look for.
        search: When truthy, pick the matching row with the largest
            ``'sessions'`` value (earliest row on ties; only values strictly
            greater than 0 qualify). When falsy, pick the first matching
            row.

    Returns:
        The index label of the selected row (its position for list input),
        or ``None`` when no row qualifies.

    Note:
        The DataFrame is rebuilt on every call, so calling this inside a
        loop over campaigns stays quadratic overall.
    """
    df = pd.DataFrame(campaigns)
    if df.empty:
        # An empty frame has no 'campaign_id' column to filter on.
        return None

    matches = df[df['campaign_id'] == key]
    if matches.empty:
        return None

    if not search:
        return matches.index.tolist()[0]

    ranked = matches.loc[matches['sessions'] > 0, 'sessions']
    if ranked.empty:
        return None
    # argmax returns the first occurrence of the maximum, which makes the
    # tie-break deterministic (an unstable sort would not be).
    return ranked.index.tolist()[int(ranked.to_numpy().argmax())]

但我很难理解如何重写代码的主体("main")部分。如果你们能帮我找到思路,哪怕只是指点我如何把这段代码改写得更高效,我将不胜感激。

# Best-session entry per campaign: after an ascending sort on 'sessions',
# the last entry of each campaign_id group carries the highest value.
# NOTE: GroupBy.last() takes the last non-null value of each column.
dfbest = (
    df.sort_values('sessions')
    .groupby('campaign_id')
    .last()
    .reset_index()
)

这会为每个广告系列只保留会话数最多的那一行,相当于 find_most_session(..., ..., True)。

由于 find_most_session(..., ..., False) 不查找最大会话数,只返回第一条匹配的记录,其等效写法为:

# One entry per campaign_id, taking the first value seen in each group
# (no ranking by sessions). The reset_index step turns campaign_id back into
# a regular column -- suggested by Maxim Kogan.
providers_unique = (
    df.groupby('campaign_id')
    .first()
    .reset_index()
)

现在您想合并这两个数据框,把提供者的收入放入相应的广告系列中。您可以使用 join,但我不确定它是否足够节省内存。

如果我是你,我会创建一个字典:

# Lookup table: campaign_id -> provider revenue.
area_dict = {
    camp_id: revenue
    for camp_id, revenue in zip(
        providers_unique.campaign_id, providers_unique.revenue
    )
}

然后将字典映射到您的广告系列:

def apply_dict(camp_id):
    """Return the revenue stored in ``area_dict`` for ``camp_id``.

    Args:
        camp_id: Campaign id used as the lookup key.

    Returns:
        The value mapped to ``camp_id`` in the module-level ``area_dict``,
        or ``0`` when the id is missing from the mapping or cannot be used
        as a key.
    """
    try:
        return area_dict[camp_id]
    except (KeyError, TypeError):
        # KeyError: the id is not in the mapping.
        # TypeError: the id is invalid as a key (unhashable).
        # Anything else (e.g. NameError if area_dict was never built) is a
        # programming error and is allowed to propagate.
        return 0

# Run apply_dict on every campaign_id in dfbest and store the results in the
# 'provider_revenue' column.
dfbest['provider_revenue'] = dfbest['campaign_id'].apply(apply_dict)