How to use "model.trt" in Python

I have a PyTorch model that I exported to ONNX and converted to a TensorRT engine with the following command:

trtexec --onnx=model.onnx --batch=400 --saveEngine=model.trt

All of this works, but how do I now load this model.trt in Python and run inference on it?

Found the answer thanks to this tutorial.

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda

cuda.init()  # the CUDA driver API must be initialized before creating a context manually
dev = cuda.Device(0)
ctx = dev.make_context()

try:
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    with open("model.trt", 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    with engine.create_execution_context() as context:
        # get sizes of input and output and allocate memory required for input data and for output data
        for binding in engine:
            if engine.binding_is_input(binding):  # we expect only one input
                input_shape = engine.get_binding_shape(binding)
                input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize  # in bytes
                device_input = cuda.mem_alloc(input_size)
            else:  # and one output
                output_shape = engine.get_binding_shape(binding)
                # create page-locked memory buffers (i.e. won't be swapped to disk)
                host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
                device_output = cuda.mem_alloc(host_output.nbytes)

        stream = cuda.Stream()

        # `batch` is your own input data (a numpy array whose shape matches the engine's input binding)
        host_input = np.array(batch, dtype=np.float32, order='C')
        cuda.memcpy_htod_async(device_input, host_input, stream)

        # run inference on the whole batch (the engine was built with --batch=400)
        context.execute_async(batch_size=engine.max_batch_size, bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_output, device_output, stream)
        stream.synchronize()

        # postprocess results
        output_data = host_output.reshape(engine.max_batch_size, output_shape[0]).T

finally:
    ctx.pop()
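
Note that the batch variable in the snippet above is not defined by the script itself; it is whatever input you want to push through the network. A minimal sketch of how it might be prepared (the image size here is purely an assumption, use whatever shape your model actually expects, with the batch dimension matching the --batch=400 passed to trtexec):

import numpy as np

# hypothetical input: 400 RGB images of 224x224 (adjust to your model's real input shape)
batch = np.random.rand(400, 3, 224, 224).astype(np.float32)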

The official documentation has many examples. The basic steps to follow are:

  • ONNX parser: takes a trained model in ONNX format as input and populates a network object in TensorRT
  • Builder: takes a network in TensorRT and generates an engine optimized for the target platform
  • Engine: takes input data, performs inference and emits the inference output
  • Logger: object associated with the builder and engine that captures errors, warnings and other information during the build and inference phases

An example of building the engine looks like this:

import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from onnx import ModelProto
import onnx

import numpy as np

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)

batch_size = 1
explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
inp_shape = [batch_size, 3, 1024, 1024]  # the shape the original author was using; adjust to your model


def build_engine(onnx_path, shape = inp_shape):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_builder_config() as config, \
    builder.create_network(explicit_batch) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        if builder.platform_has_fast_fp16:
            builder.fp16_mode = True
        builder.max_workspace_size = (1 << 30)
        print("parsing")
        with open(onnx_path, 'rb') as model:
            print("onnx found")
            if not parser.parse(model.read()):
                print("parse failed")
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
        # if the parser did not mark any network outputs, mark the last layer's output
        if network.num_outputs == 0:
            last_layer = network.get_layer(network.num_layers - 1)
            network.mark_output(last_layer.get_output(0))
        network.get_input(0).shape = shape

        engine = builder.build_cuda_engine(network)
        return engine


def save_engine(engine, file_name):
    buf = engine.serialize()
    with open(file_name, 'wb') as f:
        f.write(buf)


def load_engine(trt_runtime, plan_path):
    with open(plan_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine


if __name__ == "__main__":
    onnx_path = "./path/to/your/model.onnx"
    engine_name = "./path/to/engine.plan"
    
    model = ModelProto()
    with open(onnx_path, "rb") as f:
        model.ParseFromString(f.read())

    d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
    d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
    d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
    shape = [batch_size, d0, d1, d2]
    print(shape)

    print("trying to build engine")
    engine = build_engine(onnx_path, shape)
    save_engine(engine, engine_name)

    print("finished")

Follow this page for another example and more information.