使用 pandas 按特定字段的最大值合并 2 个数据集
combining 2 data sets with pandas on specific fields by max value
嘿,我正在重构我项目中的一些旧遗留代码。
Here is the old code :
def find_most_session(campaigns, key, search):
    """Return the index of a row in ``campaigns`` whose campaign_id is ``key``.

    Args:
        campaigns: Sequence of dict-like rows. Every row needs a
            'campaign_id' entry; matching rows also need 'sessions' when
            ``search`` is true.
        key: Campaign id to look for.
        search: When true, scan all rows and pick the matching row with the
            most sessions. When false, stop at the first matching row.

    Returns:
        The index of the selected row, or ``None`` when nothing matches.
        With ``search`` true the running maximum starts at 0, so a matching
        row is only selected if its sessions are greater than 0; on ties
        the earliest row wins.
    """
    if not search:
        # First match wins; no session comparison at all.
        return next(
            (index for index, value in enumerate(campaigns)
             if value['campaign_id'] == key),
            None,
        )

    sessions = 0
    index_found = None
    for index, value in enumerate(campaigns):
        if value['campaign_id'] == key and value['sessions'] > sessions:
            sessions = value['sessions']
            index_found = index
    return index_found
# Walk over the items: the first item seen for a campaign id is stored in
# `merged`; for every later item with the same id, the campaign row with the
# most sessions gets a 'taboola_revenue' value taken from the provider data.
# NOTE(review): the nesting was reconstructed from a paste that lost its
# indentation; the final `else` is assumed to pair with the `in merged` test.
for item in items:
    campaign_id = item['campaign_id']
    if campaign_id not in merged:
        merged[campaign_id] = item
        continue

    campaign_index = find_most_session(campaign_data, campaign_id, True)
    if campaign_index is None:
        # No campaign row to annotate for this id.
        continue
    revenue_index = find_most_session(provider_data, campaign_id, False)

    # Default to 0; only a positive provider revenue that differs from the
    # campaign's own revenue is copied over.
    taboola_revenue = 0
    if revenue_index is not None:
        provider_revenue = provider_data[revenue_index]['revenue']
        # NOTE(review): the sample provider rows store revenue as strings;
        # confirm it is numeric here, otherwise `> 0` raises TypeError on
        # Python 3.
        if (provider_revenue > 0
                and provider_revenue != campaign_data[campaign_index]['revenue']):
            taboola_revenue = provider_revenue
    campaign_data[campaign_index]['taboola_revenue'] = taboola_revenue
Here is some data example :
campaign_data = [{u'sessions': 365.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4535814', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 361}, {u'sessions': 7.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4620856', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 4}, {u'sessions': 8.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4621240', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 8}, {u'sessions': 5.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4676111', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 5}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4686333', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2}, {u'sessions': 15.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4710945', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 15}, {u'sessions': 7.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4740661', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 6}, {u'sessions': 1.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4740727', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1}, {u'sessions': 1392.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4771091', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1405}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4771908', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2}, {u'sessions': 44.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4772767', u'source': u'taboola', u'device': u'desktop', u'data_date': 
datetime.date(2020, 6, 15), u'clicks': 45}, {u'sessions': 4.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4806006', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 5}, {u'sessions': 12.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4823520', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 12}, {u'sessions': 4.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4823652', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 4}, {u'sessions': 1.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4844285', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1}, {u'sessions': 1407.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4844343', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1453}, {u'sessions': 7.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4863932', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 8}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4863940', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 15090}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4864293', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2476}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4864408', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2}]
provider_data = [{u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 5172, u'campaign_id': u'(no campaign)', u'revenue': '48.0000'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 31676, u'campaign_id': u'(no campaign)', u'revenue': '70.8700'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 849, u'campaign_id': u'(no campaign)', u'revenue': '0.7500'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 292, u'campaign_id': u'(no campaign)', u'revenue': '0.1000'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 2573, u'campaign_id': u'(no campaign)', u'revenue': '2.5600'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 28244, u'campaign_id': u'(no campaign)', u'revenue': '48.8200'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 9422, u'campaign_id': u'(no campaign)', u'revenue': '11.9000'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 8682, u'campaign_id': u'(no campaign)', u'revenue': '12.0300'}, {u'last_update': datetime.datetime(2020, 6, 16, 16, 0), u'sessions': 3740, u'campaign_id': u'(no campaign)', u'revenue': '25.5300'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 43476, u'campaign_id': u'(no campaign)', u'revenue': '181.4400'}, {u'last_update': datetime.datetime(2020, 6, 16, 16, 0), u'sessions': 47393, u'campaign_id': u'(no campaign)', u'revenue': '139.7100'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 16044, u'campaign_id': u'(no campaign)', u'revenue': '54.2700'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 16793, u'campaign_id': u'(no campaign)', u'revenue': '60.8400'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 24350, u'campaign_id': u'(no campaign)', u'revenue': '68.8100'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 6997, u'campaign_id': u'(no campaign)', 
u'revenue': '38.6300'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 1482, u'campaign_id': u'(no campaign)', u'revenue': '0.6600'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 28287, u'campaign_id': u'(no campaign)', u'revenue': '83.1400'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 13732, u'campaign_id': u'(no campaign)', u'revenue': '24.9500'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 34987, u'campaign_id': u'(no campaign)', u'revenue': '142.6300'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 3181, u'campaign_id': u'(no campaign)', u'revenue': '11.0400'}]
问题是,当前实现的时间复杂度为 O(n^2),而我的数据如今已有大约 20 万行,运行一次大约需要 30 分钟。
我确信用 pandas 可以高效地得到相同的结果。
我已经重写了 find_most_session 函数:
def find_most_session(campaigns, key, search):
    """Return the index of a row in ``campaigns`` whose campaign_id is ``key``.

    A DataFrame is built from ``campaigns`` on every call.

    Args:
        campaigns: List of dict rows with 'campaign_id' (and 'sessions' when
            ``search`` is true).
        key: Campaign id to look for.
        search: When true, return the index of the matching row with the
            highest 'sessions' value (first one on ties). When false,
            return the index of the first matching row.

    Returns:
        The row index as an int, or ``None`` when ``campaigns`` is empty or
        no row has the requested campaign id.
    """
    df = pd.DataFrame(campaigns)
    # An empty input yields a frame without columns; treat it as "not found".
    if 'campaign_id' not in df.columns:
        return None

    matches = df[df['campaign_id'] == key]
    if matches.empty:
        return None
    if not search:
        return int(matches.index[0])
    return int(matches['sessions'].idxmax())
但我不太清楚该如何重写代码的 "main" 部分。
如果你们能帮我找到/理解,甚至只是指点我如何把这段代码重写得更高效,我将不胜感激。
# best_sessions_campaigns
# For every campaign_id keep the complete row with the most sessions.
# drop_duplicates keeps whole rows, whereas groupby(...).last() takes the last
# non-null value column by column and can mix values from different rows.
# Rows with missing sessions are sorted first so they are never preferred.
dfbest = (
    df.sort_values('sessions', na_position='first')
    .drop_duplicates('campaign_id', keep='last')
    .reset_index(drop=True)
)
这会为每个广告系列只保留会话数最多的那一行。
这相当于 find_most_session(..., ..., True)
由于 find_most_session(..., ..., False)
并不查找最大的会话数,因此其等效写法为:
# Keep the first complete row seen for every campaign_id.
# groupby(...).first() would take the first non-null value per column instead,
# which can combine values from different rows.
# NOTE(review): `df` here is presumably the provider DataFrame - confirm.
providers_unique = (
    df.drop_duplicates('campaign_id', keep='first')
    .reset_index(drop=True)
)  # Thanks Maxim Kogan
现在您需要合并两个数据框,把提供者的收入(revenue)放入对应的广告系列中。您可以使用 join,但我不确定它是否足够节省内存。
如果我是你,我会创建一个字典:
# Lookup table mapping each campaign_id to its provider revenue.
area_dict = dict(
    zip(providers_unique['campaign_id'], providers_unique['revenue'])
)
然后将字典映射到您的广告系列:
def apply_dict(camp_id):
    """Return the revenue stored for ``camp_id`` in ``area_dict``.

    Returns 0 when the id is not a key of ``area_dict`` or cannot be used
    as a dictionary key at all.
    """
    try:
        return area_dict[camp_id]
    except (KeyError, TypeError):
        # KeyError: the id was not found. TypeError: the id is unhashable,
        # i.e. invalid. Other exceptions are no longer swallowed.
        return 0
# Resolve every campaign_id through apply_dict and store the result in a new
# 'provider_revenue' column.
dfbest['provider_revenue'] = dfbest['campaign_id'].apply(apply_dict)
嘿,我正在重构我项目中的一些旧遗留代码。
Here is the old code :
def find_most_session(campaigns, key, search):
    """Return the index of a row in ``campaigns`` whose campaign_id is ``key``.

    Args:
        campaigns: Sequence of dict-like rows. Every row needs a
            'campaign_id' entry; matching rows also need 'sessions' when
            ``search`` is true.
        key: Campaign id to look for.
        search: When true, scan all rows and pick the matching row with the
            most sessions. When false, stop at the first matching row.

    Returns:
        The index of the selected row, or ``None`` when nothing matches.
        With ``search`` true the running maximum starts at 0, so a matching
        row is only selected if its sessions are greater than 0; on ties
        the earliest row wins.
    """
    if not search:
        # First match wins; no session comparison at all.
        return next(
            (index for index, value in enumerate(campaigns)
             if value['campaign_id'] == key),
            None,
        )

    sessions = 0
    index_found = None
    for index, value in enumerate(campaigns):
        if value['campaign_id'] == key and value['sessions'] > sessions:
            sessions = value['sessions']
            index_found = index
    return index_found
# Walk over the items: the first item seen for a campaign id is stored in
# `merged`; for every later item with the same id, the campaign row with the
# most sessions gets a 'taboola_revenue' value taken from the provider data.
# NOTE(review): the nesting was reconstructed from a paste that lost its
# indentation; the final `else` is assumed to pair with the `in merged` test.
for item in items:
    campaign_id = item['campaign_id']
    if campaign_id not in merged:
        merged[campaign_id] = item
        continue

    campaign_index = find_most_session(campaign_data, campaign_id, True)
    if campaign_index is None:
        # No campaign row to annotate for this id.
        continue
    revenue_index = find_most_session(provider_data, campaign_id, False)

    # Default to 0; only a positive provider revenue that differs from the
    # campaign's own revenue is copied over.
    taboola_revenue = 0
    if revenue_index is not None:
        provider_revenue = provider_data[revenue_index]['revenue']
        # NOTE(review): the sample provider rows store revenue as strings;
        # confirm it is numeric here, otherwise `> 0` raises TypeError on
        # Python 3.
        if (provider_revenue > 0
                and provider_revenue != campaign_data[campaign_index]['revenue']):
            taboola_revenue = provider_revenue
    campaign_data[campaign_index]['taboola_revenue'] = taboola_revenue
Here is some data example :
campaign_data = [{u'sessions': 365.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4535814', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 361}, {u'sessions': 7.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4620856', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 4}, {u'sessions': 8.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4621240', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 8}, {u'sessions': 5.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4676111', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 5}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4686333', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2}, {u'sessions': 15.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4710945', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 15}, {u'sessions': 7.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4740661', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 6}, {u'sessions': 1.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4740727', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1}, {u'sessions': 1392.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4771091', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1405}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4771908', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2}, {u'sessions': 44.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4772767', u'source': u'taboola', u'device': u'desktop', u'data_date': 
datetime.date(2020, 6, 15), u'clicks': 45}, {u'sessions': 4.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4806006', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 5}, {u'sessions': 12.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4823520', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 12}, {u'sessions': 4.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4823652', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 4}, {u'sessions': 1.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4844285', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1}, {u'sessions': 1407.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4844343', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 1453}, {u'sessions': 7.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4863932', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 8}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4863940', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 15090}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4864293', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2476}, {u'sessions': 2.0, u'revenue': 0.0, u'site_id': 1, u'campaign_id': u'4864408', u'source': u'taboola', u'device': u'desktop', u'data_date': datetime.date(2020, 6, 15), u'clicks': 2}]
provider_data = [{u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 5172, u'campaign_id': u'(no campaign)', u'revenue': '48.0000'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 31676, u'campaign_id': u'(no campaign)', u'revenue': '70.8700'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 849, u'campaign_id': u'(no campaign)', u'revenue': '0.7500'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 292, u'campaign_id': u'(no campaign)', u'revenue': '0.1000'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 2573, u'campaign_id': u'(no campaign)', u'revenue': '2.5600'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 28244, u'campaign_id': u'(no campaign)', u'revenue': '48.8200'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 9422, u'campaign_id': u'(no campaign)', u'revenue': '11.9000'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 8682, u'campaign_id': u'(no campaign)', u'revenue': '12.0300'}, {u'last_update': datetime.datetime(2020, 6, 16, 16, 0), u'sessions': 3740, u'campaign_id': u'(no campaign)', u'revenue': '25.5300'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 43476, u'campaign_id': u'(no campaign)', u'revenue': '181.4400'}, {u'last_update': datetime.datetime(2020, 6, 16, 16, 0), u'sessions': 47393, u'campaign_id': u'(no campaign)', u'revenue': '139.7100'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 16044, u'campaign_id': u'(no campaign)', u'revenue': '54.2700'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 16793, u'campaign_id': u'(no campaign)', u'revenue': '60.8400'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 24350, u'campaign_id': u'(no campaign)', u'revenue': '68.8100'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 6997, u'campaign_id': u'(no campaign)', 
u'revenue': '38.6300'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 1482, u'campaign_id': u'(no campaign)', u'revenue': '0.6600'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 28287, u'campaign_id': u'(no campaign)', u'revenue': '83.1400'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 13732, u'campaign_id': u'(no campaign)', u'revenue': '24.9500'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 34987, u'campaign_id': u'(no campaign)', u'revenue': '142.6300'}, {u'last_update': datetime.datetime(2020, 6, 16, 17, 0), u'sessions': 3181, u'campaign_id': u'(no campaign)', u'revenue': '11.0400'}]
问题是,当前实现的时间复杂度为 O(n^2),而我的数据如今已有大约 20 万行,运行一次大约需要 30 分钟。
我确信用 pandas 可以高效地得到相同的结果。
我已经重写了 find_most_session 函数:
def find_most_session(campaigns, key, search):
    """Return the index of a row in ``campaigns`` whose campaign_id is ``key``.

    A DataFrame is built from ``campaigns`` on every call.

    Args:
        campaigns: List of dict rows with 'campaign_id' (and 'sessions' when
            ``search`` is true).
        key: Campaign id to look for.
        search: When true, return the index of the matching row with the
            highest 'sessions' value (first one on ties). When false,
            return the index of the first matching row.

    Returns:
        The row index as an int, or ``None`` when ``campaigns`` is empty or
        no row has the requested campaign id.
    """
    df = pd.DataFrame(campaigns)
    # An empty input yields a frame without columns; treat it as "not found".
    if 'campaign_id' not in df.columns:
        return None

    matches = df[df['campaign_id'] == key]
    if matches.empty:
        return None
    if not search:
        return int(matches.index[0])
    return int(matches['sessions'].idxmax())
但我不太清楚该如何重写代码的 "main" 部分。 如果你们能帮我找到/理解,甚至只是指点我如何把这段代码重写得更高效,我将不胜感激。
# best_sessions_campaigns
# For every campaign_id keep the complete row with the most sessions.
# drop_duplicates keeps whole rows, whereas groupby(...).last() takes the last
# non-null value column by column and can mix values from different rows.
# Rows with missing sessions are sorted first so they are never preferred.
dfbest = (
    df.sort_values('sessions', na_position='first')
    .drop_duplicates('campaign_id', keep='last')
    .reset_index(drop=True)
)
这会为每个广告系列只保留会话数最多的那一行。
这相当于 find_most_session(..., ..., True)
由于 find_most_session(..., ..., False)
并不查找最大的会话数,因此其等效写法为:
# Keep the first complete row seen for every campaign_id.
# groupby(...).first() would take the first non-null value per column instead,
# which can combine values from different rows.
# NOTE(review): `df` here is presumably the provider DataFrame - confirm.
providers_unique = (
    df.drop_duplicates('campaign_id', keep='first')
    .reset_index(drop=True)
)  # Thanks Maxim Kogan
现在您需要合并两个数据框,把提供者的收入(revenue)放入对应的广告系列中。您可以使用 join,但我不确定它是否足够节省内存。
如果我是你,我会创建一个字典:
# Lookup table mapping each campaign_id to its provider revenue.
area_dict = dict(
    zip(providers_unique['campaign_id'], providers_unique['revenue'])
)
然后将字典映射到您的广告系列:
def apply_dict(camp_id):
    """Return the revenue stored for ``camp_id`` in ``area_dict``.

    Returns 0 when the id is not a key of ``area_dict`` or cannot be used
    as a dictionary key at all.
    """
    try:
        return area_dict[camp_id]
    except (KeyError, TypeError):
        # KeyError: the id was not found. TypeError: the id is unhashable,
        # i.e. invalid. Other exceptions are no longer swallowed.
        return 0
# Resolve every campaign_id through apply_dict and store the result in a new
# 'provider_revenue' column.
dfbest['provider_revenue'] = dfbest['campaign_id'].apply(apply_dict)