Threading queue hangs in Python
I am trying to make my parser multi-threaded via a queue. It seems to work, but my queue is hanging. I would appreciate it if someone could tell me how to fix this, since I have rarely written multi-threaded code.
This code reads from the queue:
from silk import *
import json
import datetime
import pandas
import Queue
from threading import Thread

l = []
q = Queue.Queue()

def parse_record():
    d = {}
    while not q.empty():
        rec = q.get()
        d['timestamp'] = rec.stime.strftime("%Y-%m-%d %H:%M:%S")
        # ... many ops like this
        d['dport'] = rec.dport
        l.append(d)  # l is global
And this fills the queue:
def parse_records():
    ffile = '/tmp/query.rwf'
    flows = SilkFile(ffile, READ)
    numthreads = 2
    # fill queue
    for rec in flows:
        q.put(rec)
    # work on Queue
    for i in range(numthreads):
        t = Thread(target=parse_record)
        t.daemon = True
        t.start()
    # blocking
    q.join()
    # never reached
    data_df = pandas.DataFrame.from_records(l)
    return data_df
I only call parse_records() in my main. It never terminates.
...if empty() returns False it doesn’t guarantee that a subsequent call to get() will not block.
At the very least you should use get_nowait, or you risk losing data. More importantly, though, the join only releases once every queued item has been marked as done with a Queue.task_done call:
If a join() is currently blocking, it will resume when all items have been processed (meaning that a task_done() call was received for every item that had been put() into the queue).
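To see that contract in isolation, here is a minimal, runnable sketch (a hypothetical worker() draining plain integers, so it needs nothing beyond the standard library): join() returns exactly when every put() item has been matched by a task_done().

import Queue
from threading import Thread

q = Queue.Queue()

def worker():
    while True:
        try:
            item = q.get_nowait()  # raises Queue.Empty rather than blocking
        except Queue.Empty:
            return
        # ... process item here ...
        q.task_done()  # exactly one task_done() per item taken off the queue

for n in range(100):
    q.put(n)
for _ in range(2):
    t = Thread(target=worker)
    t.daemon = True
    t.start()
q.join()  # releases only after task_done() has been called 100 times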
As a side note, l.append(d) is not atomic and should be protected with a lock. A corrected version of the worker:
from silk import *
import json
import datetime
import pandas
import Queue
from threading import Thread, Lock

l = []
l_lock = Lock()
q = Queue.Queue()

def parse_record():
    while 1:
        try:
            rec = q.get_nowait()  # raises Queue.Empty instead of blocking
        except Queue.Empty:
            return
        d = {}  # fresh dict per record; reusing one dict would alias every row
        d['timestamp'] = rec.stime.strftime("%Y-%m-%d %H:%M:%S")
        # ... many ops like this
        d['dport'] = rec.dport
        with l_lock:  # a Lock is used directly as a context manager, not called
            l.append(d)  # l is global
        q.task_done()  # lets q.join() release once all records are done
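With these changes the driver from the question works unchanged: q.join() returns once task_done() has been called for every queued record, each worker then exits through its Queue.Empty branch, and parse_records() finally reaches the DataFrame construction.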
You can shorten your code considerably by using a thread pool from the standard library.
from silk import *
import json
import datetime
import pandas
import multiprocessing.pool

def parse_record(rec):
    d = {}
    d['timestamp'] = rec.stime.strftime("%Y-%m-%d %H:%M:%S")
    # ... many ops like this
    d['dport'] = rec.dport
    return d

def parse_records():
    ffile = '/tmp/query.rwf'
    flows = SilkFile(ffile, READ)
    pool = multiprocessing.pool.ThreadPool(2)  # a thread pool with the Pool API
    # map() blocks until every record has been parsed
    data_df = pandas.DataFrame.from_records(pool.map(parse_record, flows))
    pool.close()
    return data_df
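multiprocessing.pool.ThreadPool provides the same interface as Pool but runs its workers in threads, so the SiLK record objects never have to be pickled across process boundaries. Since pool.map blocks until every record has been parsed and returns the results in input order, no queue, lock, or join() bookkeeping is needed.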