Does spaCy support multiple GPUs?

I would like to know whether spaCy supports multiple GPUs through mpi4py.

I am currently doing named entity recognition with spaCy's nlp.pipe on a high-performance computing cluster that supports the MPI protocol and has many GPUs. It says here that I need to specify the GPU to use with cupy, but with mpi4py I am not sure whether the following is valid (and should I import spacy only after the cupy device has been selected?):


from mpi4py import MPI
import cupy

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

if rank == 0:
    texts = ["His friend Nicolas J. Smith is here with Bart Simpon and Fred."] * 100
    # scatter expects exactly one item per rank, so split the texts into `size` chunks
    data = [texts[i::size] for i in range(size)]
else:
    data = None

unit = comm.scatter(data, root=0)  # each rank receives one chunk (a list of texts)

with cupy.cuda.Device(rank):
    import spacy
    from thinc.api import set_gpu_allocator, require_gpu
    set_gpu_allocator("pytorch")
    require_gpu(rank)
    nlp = spacy.load('en_core_web_lg')
    nlp.add_pipe("merge_entities")
    tmp_list = []
    for doc in nlp.pipe(unit):
        res = " ".join([t.text if not t.ent_type_ else t.ent_type_ for t in doc])
        tmp_list.append(res)

result = comm.gather(tmp_list, root=0)

if comm.rank == 0:
    print(result)  # gather returns None on every non-root rank

Or, if I have 4 GPUs on the same machine and do not want to use MPI, can I do the following:

from joblib import Parallel, delayed
import cupy

rank = 0

def chunker(iterable, total_length, chunksize):
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    "Flatten a list of lists to a combined list"
    return [item for sublist in list_of_lists for item in sublist]

def process_chunk(texts):
    # note: every worker reads the same module-level `rank`, so as written
    # all jobs would end up on GPU 0
    with cupy.cuda.Device(rank):
        import spacy
        from thinc.api import set_gpu_allocator, require_gpu
        set_gpu_allocator("pytorch")
        require_gpu(rank)
        nlp = spacy.load('en_core_web_lg')  # load the model after selecting the GPU
        preproc_pipe = []
        for doc in nlp.pipe(texts, batch_size=20):
            preproc_pipe.append(lemmatize_pipe(doc))  # lemmatize_pipe is defined elsewhere
        return preproc_pipe

def preprocess_parallel(texts, chunksize=100):
    executor = Parallel(n_jobs=4, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = (do(chunk) for chunk in chunker(texts, len(texts), chunksize=chunksize))
    result = executor(tasks)
    return flatten(result)

preprocess_parallel(texts=["His friend Nicolas J. Smith is here with Bart Simpon and Fred."] * 100, chunksize=1000)

I think I have figured out how to do this:

The key is to make cupy use the new GPU in each worker process.
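The essential pattern, isolated from the full script below (a minimal sketch using the same calls; error handling omitted):

import spacy
import cupy
from thinc.api import set_gpu_allocator, require_gpu

rank = 0  # index of the GPU this particular worker should use

with cupy.cuda.Device(rank):             # make cupy operate on GPU `rank`
    set_gpu_allocator("pytorch")         # route GPU memory allocation through PyTorch
    require_gpu(rank)                    # point Thinc/spaCy at the same GPU
    nlp = spacy.load("en_core_web_trf")  # the model must be loaded inside this block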

import multiprocessing as mp
mp.set_start_method('spawn', force=True)
from joblib import Parallel, delayed
from itertools import cycle
import cupy
import spacy
from thinc.api import set_gpu_allocator, require_gpu


def chunker(iterable, total_length, chunksize):
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    "Flatten a list of lists to a combined list"
    return [item for sublist in list_of_lists for item in sublist]

def process_entity(doc):
    "Replace every entity token with its entity label, sentence by sentence"
    super_word_ls = []
    for s in doc.sents:
        word_ls = []
        for t in s:
            if not t.ent_type_:
                if (t.text.strip()!=""):
                    word_ls.append(t.text)
            else:
                word_ls.append(t.ent_type_)
        if len(word_ls)>0:
            super_word_ls.append(" ".join(word_ls))
    return " ".join(super_word_ls)

def process_chunk(texts, rank):
    print(rank)
    with cupy.cuda.Device(rank):
        set_gpu_allocator("pytorch")
        require_gpu(rank)
        nlp = spacy.load("en_core_web_trf")  # load the model after binding this worker to its GPU
        preproc_pipe = []
        for doc in nlp.pipe(texts, batch_size=20):
            preproc_pipe.append(process_entity(doc))
        return preproc_pipe


def preprocess_parallel(texts, chunksize=100):
    executor = Parallel(n_jobs=2, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = []
    gpus = list(range(0, cupy.cuda.runtime.getDeviceCount()))
    rank = 0
    for chunk in chunker(texts, len(texts), chunksize=chunksize):
        tasks.append(do(chunk, rank))
        rank = (rank + 1) % len(gpus)  # assign chunks to the GPUs round-robin
    result = executor(tasks)
    return flatten(result)

if __name__ == '__main__':
    print(preprocess_parallel(texts = ["His friend Nicolas J. Smith is here with Bart Simpon and Fred."]*100, chunksize=50))
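A note on the 'spawn' start method forced at the top: CUDA is generally not safe to use in a forked child process, so each worker is started fresh and builds its own CUDA context on the GPU it is assigned.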