Need help trying to get a Python multiprocess pool working
I have a database table I am reading rows from (in this case ~360k rows), putting the pyodbc.row objects into a list for later use, and then writing them out with this script.
from hashlib import sha1
import multiprocessing
import datetime, os
import pyodbc
import math
import traceback, sys
source_rows = []
processors = 8
def split(a, n):
    # Split a into n contiguous chunks whose lengths differ by at most one
    # (relies on Python 2 integer division)
    k, m = len(a) / n, len(a) % n
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in xrange(n))
def sqn(*args):
    # SHA-1 of the joined, upper-cased arguments (not called in this script)
    return sha1('}#EUCLID#{'.join([str(arg.upper().strip()) for arg in args]).encode()).hexdigest().upper()
def sDB_read():
    t1 = datetime.datetime.now()
    # Initialize source database
    src_driver = '{SQL Server Native Client 11.0}'
    src_server = 'SCCMSV2SS010'
    src_database = 'STAGE'
    src_trusted_conn = 'yes'
    src_uid = 'myUserID'
    src_pwd = 'myPwd'
    if src_trusted_conn == 'yes':
        src_conn_str = 'Driver=' + src_driver + ';Server=' + src_server + ';Database=' + src_database + ';Trusted_Connection=' + src_trusted_conn + ';PacketSize=32767;'
    else:
        src_conn_str = 'Driver=' + src_driver + ';Server=' + src_server + ';Database=' + src_database + ';UID=' + src_uid + ';PWD=' + src_pwd + ';PacketSize=32767;'
    sql = 'SELECT [AgentID] ,[BootDevice00] ,[BuildNumber00] ,[BuildType00] ,[Caption00] ,[CodeSet00] ,[CountryCode00] ,[CSDVersion00] ,[CurrentTimeZone00] ,[Debug00] ,[Description00] ,[Distributed00] ,[ForegroundApplicationBoost00] ,[FreePhysicalMemory00] ,[FreeSpaceInPagingFiles00] ,[FreeVirtualMemory00] ,[InstallDate00] ,[InstanceKey] ,[LastBootUpTime00] ,[LocalDateTime00] ,[Locale00] ,[MachineID] ,[Manufacturer00] ,[MaxNumberOfProcesses00] ,[MaxProcessMemorySize00] ,[Name00] ,[NumberOfLicensedUsers00] ,[NumberOfProcesses00] ,[NumberOfUsers00] ,[OperatingSystemSKU00] ,[Organization00] ,[OSArchitecture00] ,[OSLanguage00] ,[OSProductSuite00] ,[OSType00] ,[OtherTypeDescription00] ,[PlusProductID00] ,[PlusVersionNumber00] ,[Primary00] ,[ProductType00] ,[RegisteredUser00] ,[RevisionID] ,[rowversion] ,[SerialNumber00] ,[ServicePackMajorVersion00] ,[ServicePackMinorVersion00] ,[SizeStoredInPagingFiles00] ,[Status00] ,[SystemDevice00] ,[SystemDirectory00] ,[TimeKey] ,[TotalSwapSpaceSize00] ,[TotalVirtualMemorySize00] ,[TotalVisibleMemorySize00] ,[Version00] ,[WindowsDirectory00] FROM [STAGE].[dbo].[Operating_System_DATA]'
    src_db_conn = pyodbc.connect(src_conn_str)
    src_db_conn.autocommit = False
    src_db_cursor = src_db_conn.cursor()
    src_db_cursor.execute(sql)
    # Convert each pyodbc Row into a plain dict so the rows can be pickled
    # and shipped to worker processes
    source_rows = [{c[0]: v for (c, v) in zip(row.cursor_description, row)} for row in src_db_cursor.fetchall()]
    t2 = datetime.datetime.now()
    print('\nWe Read ' + str(len(source_rows)) + ' rows in ' + str(t2 - t1))
    return source_rows
def tDB_write():
    print('\nPOOL: Received ' + str(len(source_rows)) + ' rows to work on')
    t1 = datetime.datetime.now()
    # Initialize target database
    targ_driver = '{SQL Server Native Client 11.0}'
    targ_server = 'SCCMSV2SS010'
    targ_database = 'STAGE'
    targ_trusted_conn = 'yes'
    targ_uid = 'myUserID'
    targ_pwd = 'myPwd'
    if targ_trusted_conn == 'yes':
        targ_conn_str = 'Driver=' + targ_driver + ';Server=' + targ_server + ';Database=' + targ_database + ';Trusted_Connection=' + targ_trusted_conn + ';PacketSize=32767;'
    else:
        targ_conn_str = 'Driver=' + targ_driver + ';Server=' + targ_server + ';Database=' + targ_database + ';UID=' + targ_uid + ';PWD=' + targ_pwd + ';PacketSize=32767;'
    targ_db_conn = pyodbc.connect(targ_conn_str)
    targ_db_conn.autocommit = False
    targ_db_cursor = targ_db_conn.cursor()
    table = 'Operating_System_DATA_INSERT'
    for sourceitems in source_rows:
        for source_row in sourceitems:
            try:
                columns = ', '.join(source_row.keys())
                placeholders = ', '.join(['?'] * len(source_row))
                sql = 'INSERT INTO {} ( {} ) VALUES ( {} )'.format(table, columns, placeholders)
                # Bind the values as query parameters; building the execute()
                # call as a string and eval()ing it is fragile and unnecessary
                targ_db_cursor.execute(sql, source_row.values())
            except Exception as e:
                traceback.print_exc(file=sys.stdout)
    targ_db_conn.commit()
    t2 = datetime.datetime.now()
    print('\nWe Wrote ' + str(len(source_rows)) + ' rows in ' + str(t2 - t1))
    return
if __name__ == '__main__':
    print('\nStarting multiprocessing pool..')
    pool = multiprocessing.Pool(processes=processors)
    source_rows = sDB_read()
    print(type(source_rows))
    targetsize = len(source_rows)
    print('\nProcessing ' + str(targetsize) + ' rows')
    chunksize = math.ceil(len(source_rows) / processors)
    print('Splitting into ' + str(processors) + ' chunks with ' + str(chunksize) + ' rows each')
    source_rows = list(split(source_rows, processors))
    write_pool_outputs = pool.map(tDB_write(), source_rows)
    print('\nClosing multiprocessing pool..')
    pool.close()
    pool.join()
Which gives the following output:
c:\Python27>.\python27.exe .\multitest.py
Starting multiprocessing pool..
We Read 361554 rows in 0:00:14.064000
<type 'list'>
Processing 361554 rows
Splitting into 8 chunks with 45194.0 rows each
POOL: Received 8 rows to work on
I think I am just not clear on how to split/chunk the list so that every worker gets an equal share of rows to process. Whether I try splitting the list manually or passing the unaltered list straight to pool.map, only one worker ever spins up and gets all the rows. Can someone show me the right way to do this?
I think you may be missing the chunksize in the pool.map call:
map(func, iterable[, chunksize])
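Here is a minimal sketch of that pattern (the worker name and data are illustrative, not taken from your script). Two other things to check besides chunksize: pool.map needs the function object itself (tDB_write() with parentheses calls it once in the parent and hands its return value, None, to map), and the worker must take a parameter, since pool.map calls it once per item of the iterable:

import multiprocessing

def write_row(row):
    # Called once per item; chunksize only controls how many items are
    # shipped to a worker process per task, not what the function receives
    return row * 2

if __name__ == '__main__':
    rows = list(range(100000))  # stand-in for the list of row dicts
    processors = 8
    pool = multiprocessing.Pool(processes=processors)
    chunksize = len(rows) // processors
    # Pass write_row itself (no parentheses); the pool pickles the rows out
    # to the workers in batches of chunksize
    results = pool.map(write_row, rows, chunksize)
    pool.close()
    pool.join()

If you keep your split() pre-chunking instead, each item of the iterable is already a whole chunk, so the worker should accept a list of rows as its argument and chunksize can simply be omitted.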
I explained chunksize here:
Hope that helps.