How to use "model.trt" in Python
I have a PyTorch model that I exported to ONNX and then converted to a TensorRT engine with the following command:
trtexec --onnx=model.onnx --batch=400 --saveEngine=model.trt
All of this works, but how do I now load this model.trt in Python and run inference on it?
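For reference, the PyTorch-to-ONNX export step mentioned above is typically done with torch.onnx.export. Below is a minimal sketch; the ResNet-18 model and the 1×3×224×224 dummy input are placeholders, not the actual model from the question:

import torch
import torchvision

# Placeholder model standing in for the real network (assumption).
net = torchvision.models.resnet18().eval()
# Dummy input whose shape must match what the network expects (assumption).
dummy = torch.randn(1, 3, 224, 224)

torch.onnx.export(
    net, dummy, "model.onnx",
    input_names=["input"], output_names=["output"],
    opset_version=11,
)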
Found an answer based on this tutorial.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda

dev = cuda.Device(0)
ctx = dev.make_context()

try:
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    with open("model.trt", 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    with engine.create_execution_context() as context:
        # get sizes of input and output and allocate memory required for input data and for output data
        for binding in engine:
            if engine.binding_is_input(binding):  # we expect only one input
                input_shape = engine.get_binding_shape(binding)
                input_size = trt.volume(input_shape) * engine.max_batch_size * np.dtype(np.float32).itemsize  # in bytes
                device_input = cuda.mem_alloc(input_size)
            else:  # and one output
                output_shape = engine.get_binding_shape(binding)
                # create page-locked memory buffers (i.e. won't be swapped to disk)
                host_output = cuda.pagelocked_empty(trt.volume(output_shape) * engine.max_batch_size, dtype=np.float32)
                device_output = cuda.mem_alloc(host_output.nbytes)

        stream = cuda.Stream()
        # `batch` holds the preprocessed input data prepared elsewhere
        host_input = np.array(batch, dtype=np.float32, order='C')
        cuda.memcpy_htod_async(device_input, host_input, stream)
        context.execute_async(bindings=[int(device_input), int(device_output)], stream_handle=stream.handle)

        cuda.memcpy_dtoh_async(host_output, device_output, stream)
        stream.synchronize()

        # postprocess results
        output_data = host_output.reshape(engine.max_batch_size, output_shape[0]).T
finally:
    ctx.pop()
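The snippet above assumes `batch` already contains the preprocessed input. A minimal sketch of producing such an array is shown below; the random data is a placeholder for real preprocessing, and the shape follows the answer's size computation, which treats input_shape as excluding the batch dimension:

# Hypothetical input preparation; replace the random data with your real
# preprocessing pipeline. The array must be float32 and C-contiguous.
batch = np.random.rand(engine.max_batch_size, *tuple(input_shape)).astype(np.float32)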
The official documentation has many examples. The basic steps to follow are:
- ONNX parser: takes a trained model in ONNX format as input and populates a network object in TensorRT
- Builder: takes a network in TensorRT and generates an engine optimized for the target platform
- Engine: takes input data, performs inference and emits the inference output
- Logger: an object associated with the builder and engine, used to capture errors, warnings and other information during the build and inference phases
An example of building the engine is shown below:
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from onnx import ModelProto
import onnx
import numpy as np
import matplotlib.pyplot as plt
from time import time

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(TRT_LOGGER)

batch_size = 1
explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
inp_shape = [batch_size, 3, 1024, 1024]  # the shape the author was using

def build_engine(onnx_path, shape=inp_shape):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_builder_config() as config, \
         builder.create_network(explicit_batch) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        if builder.platform_has_fast_fp16:
            builder.fp16_mode = True
        builder.max_workspace_size = (1 << 30)
        #builder.max_workspace_size = (3072 << 20)
        #profile = builder.create_optimization_profile()
        #config.max_workspace_size = (3072 << 20)
        #config.add_optimization_profile(profile)
        print("parsing")
        with open(onnx_path, 'rb') as model:
            print("onnx found")
            if not parser.parse(model.read()):
                print("parse failed")
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
            #parser.parse(model.read())
        last_layer = network.get_layer(network.num_layers - 1)
        # Check if the last layer recognizes its output
        if not last_layer.get_output(0):
            # If not, then mark the output using the TensorRT API
            network.mark_output(last_layer.get_output(0))
        network.get_input(0).shape = shape
        engine = builder.build_cuda_engine(network)
        return engine

def save_engine(engine, file_name):
    buf = engine.serialize()
    with open(file_name, 'wb') as f:
        f.write(buf)

def load_engine(trt_runtime, plan_path):
    with open(plan_path, 'rb') as f:
        engine_data = f.read()
    engine = trt_runtime.deserialize_cuda_engine(engine_data)
    return engine

if __name__ == "__main__":
    onnx_path = "./path/to/your/model.onnx"
    engine_name = "./path/to/engine.plan"

    # read the input shape from the ONNX graph itself
    model = ModelProto()
    with open(onnx_path, "rb") as f:
        model.ParseFromString(f.read())
    d0 = model.graph.input[0].type.tensor_type.shape.dim[1].dim_value
    d1 = model.graph.input[0].type.tensor_type.shape.dim[2].dim_value
    d2 = model.graph.input[0].type.tensor_type.shape.dim[3].dim_value
    shape = [batch_size, d0, d1, d2]
    print(shape)

    print("trying to build engine")
    engine = build_engine(onnx_path, shape)
    save_engine(engine, engine_name)
    print("finished")
Follow this page for another example and more information.