
Category: Python/Ruby

2021-06-28 17:16:48

import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
import time
from PIL import Image
import cv2, os
import torchvision
import numpy as np
from scipy.special import softmax

### Modify the get_img_np_nchw and postprocess_the_outputs functions below as needed

TRT_LOGGER = trt.Logger()

def get_img_np_nchw(img_path):
    # Load the image as grayscale, resize to 224x224 and normalize to [0, 1]
    img = Image.open(img_path).convert('L')
    img = np.asarray(img, dtype='float32')
    img = cv2.resize(np.array(img), (224, 224), interpolation=cv2.INTER_CUBIC)
    img = img / 255.
    img = img[np.newaxis, np.newaxis]  # add batch and channel dimensions -> NCHW
    return img

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """host_mem refers to pinned CPU memory, device_mem refers to GPU memory."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, int8_mode=False, save_engine=False):
    """
    params max_batch_size:      maximum batch size, fixed in advance so device memory can be allocated
    params onnx_file_path:      path to the ONNX file
    params engine_file_path:    path where the serialized engine file will be saved
    params fp16_mode:           whether to use FP16
    params int8_mode:           whether to use INT8
    params save_engine:         whether to save the engine
    returns:                    ICudaEngine
    """
    # If a serialized engine already exists, deserialize it directly into an ICudaEngine
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, \
                trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize
    else:  # build the ICudaEngine from the ONNX file
        # Create a builder from the logger; the builder creates the computation graph (INetworkDefinition)
        explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        # In TensorRT 7.0, the ONNX parser only supports full-dimensions mode, meaning that the network
        # definition must be created with the explicit-batch flag set. See "Working With Dynamic Shapes".
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(explicit_batch) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser, \
                builder.create_builder_config() as config:  # the ONNX parser is bound to the network and fills it during parsing
            profile = builder.create_optimization_profile()
            profile.set_shape("inputs", (1, 1, 224, 224), (1, 1, 224, 224), (1, 1, 224, 224))
            config.add_optimization_profile(profile)
            config.max_workspace_size = 1 << 30  # pre-allocated workspace size, the maximum GPU memory the ICudaEngine may need at execution time
            builder.max_batch_size = max_batch_size  # maximum batch size usable at execution time
            builder.fp16_mode = fp16_mode
            builder.int8_mode = int8_mode
            if int8_mode:
                # To be updated
                raise NotImplementedError
            # Parse the ONNX file and populate the network
            if not os.path.exists(onnx_file_path):
                quit("ONNX file {} not found!".format(onnx_file_path))
            print('loading onnx file from path {} ...'.format(onnx_file_path))
            # with open(onnx_file_path, 'rb') as model:  # binary network structure and weights
            #     print("Beginning onnx file parsing")
            #     parser.parse(model.read())  # parse the ONNX file
            parser.parse_from_file(onnx_file_path)  # the parser can also parse ONNX directly from a file
            print("Completed parsing of onnx file")
            # Once the network is populated, use the builder to create the ICudaEngine from it
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
            # import pdb; pdb.set_trace()
            print(network.get_layer(network.num_layers - 1).get_output(0).shape)
            # network.mark_output(network.get_layer(network.num_layers - 1).get_output(0))
            engine = builder.build_engine(network, config)  # note: network here is the populated INetworkDefinition
            print("Completed creating Engine")
            if save_engine:  # save the engine so it can be deserialized directly later
                with open(engine_file_path, 'wb') as f:
                    f.write(engine.serialize())  # serialize
            return engine

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from the CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def postprocess_the_outputs(outputs, shape_of_output):
    # Reshape the flat output buffer, apply softmax over the class axis and take the per-pixel argmax
    outputs = outputs.reshape(*shape_of_output)
    out = np.argmax(softmax(outputs, axis=1)[0, ...], axis=0)
    # import pdb; pdb.set_trace()
    return out

# Verify that the TensorRT model works correctly
onnx_model_path = './Net.onnx'
max_batch_size = 1
# These two modes depend on the hardware
fp16_mode = False
int8_mode = False
trt_engine_path = './model_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode)
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode, save_engine=True)
# Create the context for this engine
context = engine.create_execution_context()
# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # inputs/outputs: HostDeviceMem pairs; bindings: device buffer addresses
# Do inference
img_path = './test.png'  # placeholder path to the test image, set as needed
img_np_nchw = get_img_np_nchw(img_path)
inputs[0].host = img_np_nchw.reshape(-1)
shape_of_output = (max_batch_size, 2, 224, 224)
# inputs[1].host = ... for multiple inputs
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)  # numpy data
t2 = time.time()
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
print('TensorRT ok')
print("Inference time with the TensorRT engine: {}".format(t2 - t1))
