有没有办法让这段代码消耗更少的内存?
Is there a way how we can make this code less memory consuming?
我创建了一个逻辑,即计算有多少人购买了相同的产品。
它有效,但确实很低效(运行时一直内存不足)。
因此,我希望有人有一个比我的更少内存消耗的逻辑。
这是我所做的:
df: # Please note: below you can find the code to duplicate this.
Order_Number Country Product
1 Ger [A,B]
2 NL [A,B,C]
3 USA [C,D]
4 NL [B,C,D]
5 GER [A]
我想知道有多少客户购买了相同的产品(显然至少有两种产品):
list_df = [df]
# Example for two products bought together
for X in list_df:
    # Build every unordered pair of products per order, then count how often
    # each pair occurs across all orders.  explode() flattens the per-order
    # pair lists into one row per pair.
    # NOTE(review): combinations() needs a real sequence of product codes per
    # order (e.g. ['A', 'B']); with a string like '[A,B]' it would pair up
    # characters, including '[' and ','.
    pairs = pd.Series(
        [list(itertools.combinations(order, 2)) for order in X.Product]
    ).explode().dropna()
    # rename_axis/reset_index(name=...) replaces the fragile
    # to_frame()/rename(index=str, columns={0: ...}) chain, which depends on
    # the unnamed-series column being labelled 0.
    Products_DF = (
        pairs.value_counts()
             .rename_axis('Product')
             .reset_index(name='Occurrence')
    )
    Products_DF['Product_Combinations'] = 2  # Only counting for 2 products here
    # NOTE(review): this assigns by index alignment, but the combination rows
    # do not correspond 1:1 to orders — the Country column is unreliable as-is;
    # a per-country groupby is needed for a meaningful value.
    Products_DF['Country'] = X['Country']
    # DataFrame.append() was removed in pandas 2.0; concat is the supported way.
    main_dataframe = pd.concat([main_dataframe, Products_DF], ignore_index=True)
    del Products_DF
然后,我对 3、4、5、6、7 个产品一起购买的组合重复了上面的逻辑,并将所有信息附加到我的 main_dataframe 中。
结果是一个数据框,包含国家、一起购买的产品和出现的次数。正如下面数据的输出。
非常感谢!
PS 我也对 PySpark 解决方案持开放态度(感谢一切!)
完整示例:
import pandas as pd
import itertools

# Reproducible example data.  Product must be a real list of product codes —
# with strings like '[A,B]' combinations() would iterate over the characters
# (including '[' and ',') instead of over the products.
df = {'Order_Number': ['1', '2', '3', '4', '5'],
      'Country': ['Ger', 'NL', 'USA', 'NL', 'Ger'],
      'Product': [['A', 'B'], ['A', 'B', 'C'], ['C', 'D'], ['B', 'C', 'D'], ['A']]}
# Creates pandas DataFrame.
df = pd.DataFrame(df)
df = [df]  # sorry, this is legacy in my code
main_dataframe = pd.DataFrame()
# Example for two products bought together
for X in df:
    frames = []
    # Count pair occurrences separately per country so the Country column
    # actually describes the counted rows (the previous index-aligned
    # assignment produced misaligned values).
    for country, orders in X.groupby('Country')['Product']:
        # One row per (unordered) pair of products bought in the same order.
        pairs = pd.Series(
            [list(itertools.combinations(order, 2)) for order in orders]
        ).explode().dropna()
        if pairs.empty:
            continue
        counts = (pairs.value_counts()
                       .rename_axis('Product')
                       .reset_index(name='Occurrence'))
        counts['Product_Combinations'] = 2  # Only counting for 2 products here
        counts['Country'] = country
        frames.append(counts)
    # DataFrame.append() was removed in pandas 2.0; concat is the replacement.
    main_dataframe = pd.concat([main_dataframe, *frames], ignore_index=True)
最终清理编辑。我用 500 个独特的产品测试了 1e6 个订单,并在大约 3 分钟内得到了结果。
我确实通过要求最低订单数量和在 3 个产品组之后停止来进行过滤。这些可以在函数调用中更改:
memo = find_product_groups(df, min_unique_orders=10, max_group_size=3)
输出
testing 500 products in 1000000.0 orders where there are between 1 and 10 products per order
made test data in 72.29691982269287 seconds
counted_groups in 199.78216004371643 seconds
showing first five 3-member group sorted by decreasing number of orders
('product_259', 'product_263', 'product_435') 11
('product_252', 'product_432', 'product_63') 11
('product_114', 'product_139', 'product_156') 11
('product_101', 'product_11', 'product_179') 11
('product_115', 'product_301', 'product_45') 11
代码
import pandas as pd
import collections
import time #just for timing
import numpy as np #just for creating the test data
# Fixed seed so the synthetic benchmark data (and the output shown above)
# is reproducible from run to run.
np.random.seed(1)
def init_data(num_products=500, num_orders=1e6, max_order_size=10):
    """Create a DataFrame of synthetic orders for benchmarking.

    Parameters
    ----------
    num_products : int, default 500
        Number of distinct product names to draw from.
    num_orders : int or float, default 1e6
        Number of orders (rows) to generate; coerced to int.
    max_order_size : int, default 10
        Upper bound on the number of products in a single order.

    Returns
    -------
    pandas.DataFrame
        A single 'Product' column holding one array of unique product
        names per order.
    """
    start_create_data = time.time()
    print('testing', num_products, 'products in', num_orders, 'orders where there are between 1 and', max_order_size, 'products per order')
    products = ['product_' + str(p) for p in range(num_products)]
    # Order sizes drawn uniformly in [1, max_order_size], rounded to integers.
    order_sizes = np.round(np.random.uniform(low=1, high=max_order_size, size=int(num_orders)))
    # replace=False keeps the products within a single order unique.
    order_list = [np.random.choice(products, int(order_size), replace=False)
                  for order_size in order_sizes]
    # Create pandas DataFrame
    df = pd.DataFrame({'Product': order_list})
    print('made test data in', time.time() - start_create_data, 'seconds')
    return df
def find_product_groups(df, min_unique_orders=1, max_group_size=None):
    """Find groups of products that co-occur in the same orders.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'Product' column holding an iterable of product names
        per row (order).
    min_unique_orders : int, default 1
        A candidate group is only kept (and extended) if at least this many
        orders contain every product in it.
    max_group_size : int or None, default None
        If set, groups are not grown beyond this many products.

    Returns
    -------
    dict
        Maps a tuple of product names (in sorted order) to the set of df
        index labels of the orders containing all of those products.
    """
    # Memo of single-product groups: (product,) -> set of order index labels.
    memo = collections.defaultdict(set)
    for ind, order in df['Product'].items():
        for product in order:
            memo[(product,)].add(ind)
    # Products in one canonical order; every group tuple stays sorted, so
    # each group can only ever be built once.
    unique_prods = sorted(p[0] for p in memo.keys())
    # O(1) position lookup instead of an O(n) list.index() per popped group.
    prod_pos = {prod: i for i, prod in enumerate(unique_prods)}
    # Build up the groups from 1-smaller groups, updating the memo.
    groups = list(memo.keys())
    while groups:
        group = groups.pop()
        # Skip if the max_group_size is set and this group is already max size.
        if max_group_size and len(group) == max_group_size:
            continue
        # Only extend with products that sort after the group's last member,
        # so each combination is generated exactly once.
        for prod in unique_prods[prod_pos[group[-1]] + 1:]:
            new_inds = memo[group].intersection(memo[(prod,)])
            # Only add the new group if there are enough orders.
            if len(new_inds) >= min_unique_orders:
                new_group = (*group, prod)
                memo[new_group] = new_inds
                groups.append(new_group)
    return memo
# Build the benchmark data, then time only the group counting itself.
df = init_data()
start_count_groups = time.time()
memo = find_product_groups(df, min_unique_orders=10, max_group_size=3)
print('counted_groups in', time.time() - start_count_groups, 'seconds')
print('showing first five 3-member group sorted by decreasing number of orders')
# Keep only full 3-member groups and rank them by how many orders contain them.
triples = [(group, order_inds) for group, order_inds in memo.items() if len(group) >= 3]
triples.sort(key=lambda pair: len(pair[1]), reverse=True)
for group, order_inds in triples[:5]:
    print(group, len(order_inds))
我创建了一个逻辑,用来计算有多少人购买了相同的产品。它有效,但确实很低效(运行时一直内存不足)。
因此,我希望有人有一个比我的更少内存消耗的逻辑。
这是我所做的:
df: # Please note: below you can find the code to duplicate this.
Order_Number Country Product
1 Ger [A,B]
2 NL [A,B,C]
3 USA [C,D]
4 NL [B,C,D]
5 GER [A]
我想知道有多少客户购买了相同的产品(显然至少有两种产品):
list_df = [df]
# Example for two products bought together
for X in list_df:
    # Build every unordered pair of products per order, then count how often
    # each pair occurs across all orders.  explode() flattens the per-order
    # pair lists into one row per pair.
    # NOTE(review): combinations() needs a real sequence of product codes per
    # order (e.g. ['A', 'B']); with a string like '[A,B]' it would pair up
    # characters, including '[' and ','.
    pairs = pd.Series(
        [list(itertools.combinations(order, 2)) for order in X.Product]
    ).explode().dropna()
    # rename_axis/reset_index(name=...) replaces the fragile
    # to_frame()/rename(index=str, columns={0: ...}) chain, which depends on
    # the unnamed-series column being labelled 0.
    Products_DF = (
        pairs.value_counts()
             .rename_axis('Product')
             .reset_index(name='Occurrence')
    )
    Products_DF['Product_Combinations'] = 2  # Only counting for 2 products here
    # NOTE(review): this assigns by index alignment, but the combination rows
    # do not correspond 1:1 to orders — the Country column is unreliable as-is;
    # a per-country groupby is needed for a meaningful value.
    Products_DF['Country'] = X['Country']
    # DataFrame.append() was removed in pandas 2.0; concat is the supported way.
    main_dataframe = pd.concat([main_dataframe, Products_DF], ignore_index=True)
    del Products_DF
然后,我对 3、4、5、6、7 个产品一起购买的组合重复了上面的逻辑,并将所有信息附加到我的 main_dataframe 中。
结果是一个数据框,包含国家、一起购买的产品和出现的次数。正如下面数据的输出。
非常感谢!
PS 我也对 PySpark 解决方案持开放态度(感谢一切!)
完整示例:
import pandas as pd
import itertools

# Reproducible example data.  Product must be a real list of product codes —
# with strings like '[A,B]' combinations() would iterate over the characters
# (including '[' and ',') instead of over the products.
df = {'Order_Number': ['1', '2', '3', '4', '5'],
      'Country': ['Ger', 'NL', 'USA', 'NL', 'Ger'],
      'Product': [['A', 'B'], ['A', 'B', 'C'], ['C', 'D'], ['B', 'C', 'D'], ['A']]}
# Creates pandas DataFrame.
df = pd.DataFrame(df)
df = [df]  # sorry, this is legacy in my code
main_dataframe = pd.DataFrame()
# Example for two products bought together
for X in df:
    frames = []
    # Count pair occurrences separately per country so the Country column
    # actually describes the counted rows (the previous index-aligned
    # assignment produced misaligned values).
    for country, orders in X.groupby('Country')['Product']:
        # One row per (unordered) pair of products bought in the same order.
        pairs = pd.Series(
            [list(itertools.combinations(order, 2)) for order in orders]
        ).explode().dropna()
        if pairs.empty:
            continue
        counts = (pairs.value_counts()
                       .rename_axis('Product')
                       .reset_index(name='Occurrence'))
        counts['Product_Combinations'] = 2  # Only counting for 2 products here
        counts['Country'] = country
        frames.append(counts)
    # DataFrame.append() was removed in pandas 2.0; concat is the replacement.
    main_dataframe = pd.concat([main_dataframe, *frames], ignore_index=True)
最终清理编辑。我用 500 个独特的产品测试了 1e6 个订单,并在大约 3 分钟内得到了结果。
我确实通过要求最低订单数量和在 3 个产品组之后停止来进行过滤。这些可以在函数调用中更改:
memo = find_product_groups(df, min_unique_orders=10, max_group_size=3)
输出
testing 500 products in 1000000.0 orders where there are between 1 and 10 products per order
made test data in 72.29691982269287 seconds
counted_groups in 199.78216004371643 seconds
showing first five 3-member group sorted by decreasing number of orders
('product_259', 'product_263', 'product_435') 11
('product_252', 'product_432', 'product_63') 11
('product_114', 'product_139', 'product_156') 11
('product_101', 'product_11', 'product_179') 11
('product_115', 'product_301', 'product_45') 11
代码
import pandas as pd
import collections
import time #just for timing
import numpy as np #just for creating the test data
# Fixed seed so the synthetic benchmark data (and the output shown above)
# is reproducible from run to run.
np.random.seed(1)
def init_data(num_products=500, num_orders=1e6, max_order_size=10):
    """Create a DataFrame of synthetic orders for benchmarking.

    Parameters
    ----------
    num_products : int, default 500
        Number of distinct product names to draw from.
    num_orders : int or float, default 1e6
        Number of orders (rows) to generate; coerced to int.
    max_order_size : int, default 10
        Upper bound on the number of products in a single order.

    Returns
    -------
    pandas.DataFrame
        A single 'Product' column holding one array of unique product
        names per order.
    """
    start_create_data = time.time()
    print('testing', num_products, 'products in', num_orders, 'orders where there are between 1 and', max_order_size, 'products per order')
    products = ['product_' + str(p) for p in range(num_products)]
    # Order sizes drawn uniformly in [1, max_order_size], rounded to integers.
    order_sizes = np.round(np.random.uniform(low=1, high=max_order_size, size=int(num_orders)))
    # replace=False keeps the products within a single order unique.
    order_list = [np.random.choice(products, int(order_size), replace=False)
                  for order_size in order_sizes]
    # Create pandas DataFrame
    df = pd.DataFrame({'Product': order_list})
    print('made test data in', time.time() - start_create_data, 'seconds')
    return df
def find_product_groups(df, min_unique_orders=1, max_group_size=None):
    """Find groups of products that co-occur in the same orders.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'Product' column holding an iterable of product names
        per row (order).
    min_unique_orders : int, default 1
        A candidate group is only kept (and extended) if at least this many
        orders contain every product in it.
    max_group_size : int or None, default None
        If set, groups are not grown beyond this many products.

    Returns
    -------
    dict
        Maps a tuple of product names (in sorted order) to the set of df
        index labels of the orders containing all of those products.
    """
    # Memo of single-product groups: (product,) -> set of order index labels.
    memo = collections.defaultdict(set)
    for ind, order in df['Product'].items():
        for product in order:
            memo[(product,)].add(ind)
    # Products in one canonical order; every group tuple stays sorted, so
    # each group can only ever be built once.
    unique_prods = sorted(p[0] for p in memo.keys())
    # O(1) position lookup instead of an O(n) list.index() per popped group.
    prod_pos = {prod: i for i, prod in enumerate(unique_prods)}
    # Build up the groups from 1-smaller groups, updating the memo.
    groups = list(memo.keys())
    while groups:
        group = groups.pop()
        # Skip if the max_group_size is set and this group is already max size.
        if max_group_size and len(group) == max_group_size:
            continue
        # Only extend with products that sort after the group's last member,
        # so each combination is generated exactly once.
        for prod in unique_prods[prod_pos[group[-1]] + 1:]:
            new_inds = memo[group].intersection(memo[(prod,)])
            # Only add the new group if there are enough orders.
            if len(new_inds) >= min_unique_orders:
                new_group = (*group, prod)
                memo[new_group] = new_inds
                groups.append(new_group)
    return memo
# Build the benchmark data, then time only the group counting itself.
df = init_data()
start_count_groups = time.time()
memo = find_product_groups(df, min_unique_orders=10, max_group_size=3)
print('counted_groups in', time.time() - start_count_groups, 'seconds')
print('showing first five 3-member group sorted by decreasing number of orders')
# Keep only full 3-member groups and rank them by how many orders contain them.
triples = [(group, order_inds) for group, order_inds in memo.items() if len(group) >= 3]
triples.sort(key=lambda pair: len(pair[1]), reverse=True)
for group, order_inds in triples[:5]:
    print(group, len(order_inds))