Does spaCy support multiple GPUs?
I'd like to know whether spaCy supports using multiple GPUs through mpi4py.

I'm currently using spaCy's nlp.pipe for named entity recognition on a high-performance computing cluster that supports the MPI protocol and has many GPUs. The docs say here that I need to tell cupy which GPU to use, but with mpi4py I'm not sure whether the following is valid (should I import spacy only after selecting the cupy device?):
from mpi4py import MPI
import cupy

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

if rank == 0:
    data = ["His friend Nicolas J. Smith is here with Bart Simpon and Fred."*100]
else:
    data = None

unit = comm.scatter(data, root=0)

with cupy.cuda.Device(rank):
    import spacy
    from thinc.api import set_gpu_allocator, require_gpu
    set_gpu_allocator("pytorch")
    require_gpu(rank)
    nlp = spacy.load('en_core_web_lg')
    nlp.add_pipe("merge_entities")
    tmp_list = []
    for doc in nlp.pipe(unit):
        res = " ".join([t.text if not t.ent_type_ else t.ent_type_ for t in doc])
        tmp_list.append(res)

result = comm.gather(tmp_list, root=0)
if comm.rank == 0:
    print(result)
else:
    result = None
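Side note on the scatter itself: I realize comm.scatter expects the root's list to contain exactly one element per rank, so the single-element data above would only work with one process. With more ranks I would first split my texts into one sublist per rank, roughly like this sketch (the corpus here is a placeholder):

from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

if rank == 0:
    texts = ["placeholder document %d" % i for i in range(1000)]
    # one sublist per rank, so scatter receives exactly `size` elements
    chunks = [texts[i::size] for i in range(size)]
else:
    chunks = None

unit = comm.scatter(chunks, root=0)  # each rank gets one sublist of texts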
Or, if I have 4 GPUs on the same machine and don't want to use MPI, can I do the following:
from joblib import Parallel, delayed
import cupy

rank = 0

def chunker(iterable, total_length, chunksize):
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    "Flatten a list of lists to a combined list"
    return [item for sublist in list_of_lists for item in sublist]

def process_chunk(texts):
    with cupy.cuda.Device(rank):
        import spacy
        from thinc.api import set_gpu_allocator, require_gpu
        set_gpu_allocator("pytorch")
        require_gpu(rank)
        preproc_pipe = []
        for doc in nlp.pipe(texts, batch_size=20):
            preproc_pipe.append(lemmatize_pipe(doc))
        rank += 1
        return preproc_pipe

def preprocess_parallel(texts, chunksize=100):
    executor = Parallel(n_jobs=4, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = (do(chunk) for chunk in chunker(texts, len(texts), chunksize=chunksize))
    result = executor(tasks)
    return flatten(result)

preprocess_parallel(texts=["His friend Nicolas J. Smith is here with Bart Simpon and Fred."*100], chunksize=1000)
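One thing I already suspect is wrong with this version: rank lives at module level, and with the multiprocessing backend each worker process gets its own copy of the parent's globals, so rank += 1 in one worker is invisible to the others (and without a global declaration the earlier read of rank in the function would raise UnboundLocalError anyway). A toy sketch of the globals problem, unrelated to spaCy:

import multiprocessing as mp

counter = 0  # module-level global; each worker process gets an independent copy

def bump(_):
    global counter
    counter += 1
    return counter

if __name__ == '__main__':
    with mp.Pool(4) as pool:
        # typically prints [1, 1, 1, 1]: no worker ever sees another's increment
        print(pool.map(bump, range(4)))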
I think I've figured out how to do this:

The key is getting cupy to switch to the right GPU inside each worker process.
import multiprocessing as mp
mp.set_start_method('spawn', force=True)  # 'spawn' so each worker starts with a fresh CUDA context

from joblib import Parallel, delayed
import cupy
import spacy
from thinc.api import set_gpu_allocator, require_gpu

def chunker(iterable, total_length, chunksize):
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    "Flatten a list of lists to a combined list"
    return [item for sublist in list_of_lists for item in sublist]

def process_entity(doc):
    # Rebuild each sentence, replacing every entity token with its entity label
    super_word_ls = []
    for s in doc.sents:
        word_ls = []
        for t in s:
            if not t.ent_type_:
                if t.text.strip() != "":
                    word_ls.append(t.text)
            else:
                word_ls.append(t.ent_type_)
        if len(word_ls) > 0:
            super_word_ls.append(" ".join(word_ls))
    return " ".join(super_word_ls)

def process_chunk(texts, rank):
    print(rank)  # which GPU this worker was assigned
    with cupy.cuda.Device(rank):  # bind this worker's cupy allocations to GPU `rank`
        set_gpu_allocator("pytorch")
        require_gpu(rank)
        nlp = spacy.load("en_core_web_trf")  # load the model after the device is selected
        preproc_pipe = []
        for doc in nlp.pipe(texts, batch_size=20):
            preproc_pipe.append(process_entity(doc))
        return preproc_pipe

def preprocess_parallel(texts, chunksize=100):
    executor = Parallel(n_jobs=2, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = []
    gpus = list(range(0, cupy.cuda.runtime.getDeviceCount()))
    rank = 0
    for chunk in chunker(texts, len(texts), chunksize=chunksize):
        tasks.append(do(chunk, rank))
        rank = (rank + 1) % len(gpus)  # round-robin chunks across the available GPUs
    result = executor(tasks)
    return flatten(result)

if __name__ == '__main__':
    print(preprocess_parallel(texts=["His friend Nicolas J. Smith is here with Bart Simpon and Fred."]*100, chunksize=50))
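Two caveats worth noting: process_chunk reloads en_core_web_trf for every chunk, which is expensive, and n_jobs should generally match the number of GPUs you want to keep busy. A possible variant I have not benchmarked (process_chunk_pinned is a hypothetical name) is to hide all but one device from each worker via CUDA_VISIBLE_DEVICES, so every worker simply uses device 0; this has to happen before anything initializes CUDA in that process, which is why the imports live inside the function:

import os

def process_chunk_pinned(texts, rank):
    # Hypothetical variant of process_chunk: restrict this worker to one GPU
    # before CUDA is initialized, then treat it as device 0 everywhere.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
    import spacy
    from thinc.api import set_gpu_allocator, require_gpu
    set_gpu_allocator("pytorch")
    require_gpu(0)  # the only visible device is now GPU 0
    nlp = spacy.load("en_core_web_trf")
    return [process_entity(doc) for doc in nlp.pipe(texts, batch_size=20)]

The upside is that device numbering inside each worker is always 0, so nothing in the pipeline needs to know its global rank.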