Python 中的多处理使用 Pool
Multiprocessing in Python using Pool
我想使用 Python 在 DataFrame 上并行化一个函数。我看了教程,找到了一些代码,并已根据我的需要进行了调整。当我执行 map 函数时,程序冻结了。代码看起来没有问题,请问是什么原因?
import pandas as pd
import numpy as np
from multiprocessing import cpu_count, Pool
attributes1 = pd.read_csv('attributes1.csv')
def replace_data(data):
    """Replace every literal occurrence of 'in.' with 'inch' in column 1.

    Mutates ``data`` in place and also returns it so the function composes
    with ``pool.map`` / ``pd.concat``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose second column (position 1) holds strings.

    Returns
    -------
    pd.DataFrame
        The same frame, with the substitution applied.
    """
    # Vectorized replacement instead of a per-row iloc loop (which does
    # Python-level work per element). regex=False keeps the literal-string
    # semantics of str.replace used by the original loop, so '.' is NOT a
    # regex wildcard here.
    data.iloc[:, 1] = data.iloc[:, 1].str.replace('in.', 'inch', regex=False)
    return data
num_partitions = 10       # number of chunks the dataframe is split into
num_cores = cpu_count()   # worker processes: one per CPU core


def parallelize_dataframe(df, func):
    """Apply ``func`` to ``df`` in parallel across a process pool.

    Splits ``df`` into ``num_partitions`` chunks, maps ``func`` over the
    chunks in worker processes, and concatenates the results.

    Parameters
    ----------
    df : pd.DataFrame
    func : callable
        Top-level (picklable) function taking and returning a DataFrame.

    Returns
    -------
    pd.DataFrame
        Concatenation of the per-chunk results.
    """
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    try:
        result = pd.concat(pool.map(func, df_split))
    finally:
        # Always release the worker processes, even if func raises.
        pool.close()
        pool.join()
    return result


# The __main__ guard is required on platforms that use the 'spawn' start
# method (Windows, macOS): each worker re-imports this module, and without
# the guard every import would create another pool — the reported "freeze".
if __name__ == '__main__':
    df1 = parallelize_dataframe(attributes1, replace_data)
您可能正在使用 Windows。要修复这个问题,您需要调用 freeze_support:
from multiprocessing import cpu_count, Pool, freeze_support
...
if __name__ == '__main__':
freeze_support()
df1 = parallelize_dataframe(attributes1, replace_data)
这只是 Windows 用户的问题。首先,我创建了另一个 .py 文件,我们将其命名为 helpy.py,其中我有 replace_data 函数
def replace_data(data):
    """Swap the literal text 'in.' for 'inch' in every cell of column 1.

    The frame is modified in place and returned for convenience.
    """
    col = 1
    for row in range(len(data.index)):
        cell = data.iloc[row, col]
        data.iloc[row, col] = cell.replace('in.', 'inch')
    return data
然后我将我的函数导入到我的主 .py 文件中。
import pandas as pd
import numpy as np
from multiprocessing import cpu_count, Pool
from helpy import replace_data
attributes1 = pd.read_csv('attributes1.csv')
num_partitions = 10       # how many pieces to split the dataframe into
num_cores = cpu_count()   # one worker process per available core


def parallelize_dataframe(df, func):
    """Map ``func`` over ``num_partitions`` chunks of ``df`` in a process
    pool and stitch the results back into a single DataFrame."""
    chunks = np.array_split(df, num_partitions)
    workers = Pool(num_cores)
    combined = pd.concat(workers.map(func, chunks))
    workers.close()
    workers.join()
    return combined


if __name__ == '__main__':
    df1 = parallelize_dataframe(attributes1, replace_data)
我也加了if __name__ == '__main__':
现在程序运行流畅
我想使用 Python 在 DataFrame 上并行化一个函数。我看了教程,找到了一些代码,并已根据我的需要进行了调整。当我执行 map 函数时,程序冻结了。代码看起来没有问题,请问是什么原因?
import pandas as pd
import numpy as np
from multiprocessing import cpu_count, Pool
attributes1 = pd.read_csv('attributes1.csv')
def replace_data(data):
    """Replace every literal occurrence of 'in.' with 'inch' in column 1.

    Mutates ``data`` in place and also returns it so the function composes
    with ``pool.map`` / ``pd.concat``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose second column (position 1) holds strings.

    Returns
    -------
    pd.DataFrame
        The same frame, with the substitution applied.
    """
    # Vectorized replacement instead of a per-row iloc loop (which does
    # Python-level work per element). regex=False keeps the literal-string
    # semantics of str.replace used by the original loop, so '.' is NOT a
    # regex wildcard here.
    data.iloc[:, 1] = data.iloc[:, 1].str.replace('in.', 'inch', regex=False)
    return data
num_partitions = 10       # number of chunks the dataframe is split into
num_cores = cpu_count()   # worker processes: one per CPU core


def parallelize_dataframe(df, func):
    """Apply ``func`` to ``df`` in parallel across a process pool.

    Splits ``df`` into ``num_partitions`` chunks, maps ``func`` over the
    chunks in worker processes, and concatenates the results.

    Parameters
    ----------
    df : pd.DataFrame
    func : callable
        Top-level (picklable) function taking and returning a DataFrame.

    Returns
    -------
    pd.DataFrame
        Concatenation of the per-chunk results.
    """
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    try:
        result = pd.concat(pool.map(func, df_split))
    finally:
        # Always release the worker processes, even if func raises.
        pool.close()
        pool.join()
    return result


# The __main__ guard is required on platforms that use the 'spawn' start
# method (Windows, macOS): each worker re-imports this module, and without
# the guard every import would create another pool — the reported "freeze".
if __name__ == '__main__':
    df1 = parallelize_dataframe(attributes1, replace_data)
您可能正在使用 Windows。要修复这个问题,您需要调用 freeze_support:
from multiprocessing import cpu_count, Pool, freeze_support
...
if __name__ == '__main__':
freeze_support()
df1 = parallelize_dataframe(attributes1, replace_data)
这只是 Windows 用户的问题。首先,我创建了另一个 .py 文件,我们将其命名为 helpy.py,其中我有 replace_data 函数
def replace_data(data):
    """Swap the literal text 'in.' for 'inch' in every cell of column 1.

    The frame is modified in place and returned for convenience.
    """
    col = 1
    for row in range(len(data.index)):
        cell = data.iloc[row, col]
        data.iloc[row, col] = cell.replace('in.', 'inch')
    return data
然后我将我的函数导入到我的主 .py 文件中。
import pandas as pd
import numpy as np
from multiprocessing import cpu_count, Pool
from helpy import replace_data
attributes1 = pd.read_csv('attributes1.csv')
num_partitions = 10       # how many pieces to split the dataframe into
num_cores = cpu_count()   # one worker process per available core


def parallelize_dataframe(df, func):
    """Map ``func`` over ``num_partitions`` chunks of ``df`` in a process
    pool and stitch the results back into a single DataFrame."""
    chunks = np.array_split(df, num_partitions)
    workers = Pool(num_cores)
    combined = pd.concat(workers.map(func, chunks))
    workers.close()
    workers.join()
    return combined


if __name__ == '__main__':
    df1 = parallelize_dataframe(attributes1, replace_data)
我也加了if __name__ == '__main__':
现在程序运行流畅