分类: Python/Ruby
2021-06-28 17:16:48
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
import time
from PIL import Image
import cv2,os
import torchvision
import numpy as np
from scipy.special import softmax
### get_img_np_nchw h和postprocess_the_output函数根据需要进行修改
TRT_LOGGER = trt.Logger()
def get_img_np_nchw(img_path):
img = Image.open(img_path).convert('L')
img = np.asarray(img, dtype='float32')
img = cv2.resize(np.array(img),(224, 224), interpolation = cv2.INTER_CUBIC)
img = img / 255.
img = img[np.newaxis, np.newaxis]
return image
class HostDeviceMem(object):
def __init__(self, host_mem, device_mem):
"""host_mom指代cpu内存,device_mem指代GPU内存
"""
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
def allocate_buffers(engine):
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in engine:
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",fp16_mode=False, int8_mode=False,save_engine=False):
"""
params max_batch_size: 预先指定大小好分配显存
params onnx_file_path: onnx文件路径
params engine_file_path: 待保存的序列化的引擎文件路径
params fp16_mode: 是否采用FP16
params int8_mode: 是否采用INT8
params save_engine: 是否保存引擎
returns: ICudaEngine
"""
# 如果已经存在序列化之后的引擎,则直接反序列化得到cudaEngine
if os.path.exists(engine_file_path):
print("Reading engine from file: {}".format(engine_file_path))
with open(engine_file_path, 'rb') as f, \
trt.Runtime(TRT_LOGGER) as runtime:
return runtime.deserialize_cuda_engine(f.read()) # 反序列化
else: # 由onnx创建cudaEngine
# 使用logger创建一个builder
# builder创建一个计算图 INetworkDefinition
explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
# In TensorRT 7.0, the ONNX parser only supports full-dimensions mode, meaning that your network definition must be created with the explicitBatch flag set. For more information, see Working With Dynamic Shapes.
with trt.Builder(TRT_LOGGER) as builder, \
builder.create_network(explicit_batch) as network, \
trt.OnnxParser(network, TRT_LOGGER) as parser, \
builder.create_builder_config() as config: # 使用onnx的解析器绑定计算图,后续将通过解析填充计算图
profile = builder.create_optimization_profile()
profile.set_shape("inputs", (1, 1, 224, 224),(1,1,224,224),(1,1,224,224))
config.add_optimization_profile(profile)
config.max_workspace_size = 1<<30 # 预先分配的工作空间大小,即ICudaEngine执行时GPU最大需要的空间
builder.max_batch_size = max_batch_size # 执行时最大可以使用的batchsize
builder.fp16_mode = fp16_mode
builder.int8_mode = int8_mode
if int8_mode:
# To be updated
raise NotImplementedError
# 解析onnx文件,填充计算图
if not os.path.exists(onnx_file_path):
quit("ONNX file {} not found!".format(onnx_file_path))
print('loading onnx file from path {} ...'.format(onnx_file_path))
# with open(onnx_file_path, 'rb') as model: # 二值化的网络结果和参数
# print("Begining onnx file parsing")
# parser.parse(model.read()) # 解析onnx文件
parser.parse_from_file(onnx_file_path) # parser还有一个从文件解析onnx的方法
print("Completed parsing of onnx file")
# 填充计算图完成后,则使用builder从计算图中创建CudaEngine
print("Building an engine from file{}' this may take a while...".format(onnx_file_path))
#################
# import pdb;pdb.set_trace()
print(network.get_layer(network.num_layers-1).get_output(0).shape)
# network.mark_output(network.get_layer(network.num_layers -1).get_output(0))
engine = builder.build_engine(network,config) # 注意,这里的network是INetworkDefinition类型,即填充后的计算图
print("Completed creating Engine")
if save_engine: #保存engine供以后直接反序列化使用
with open(engine_file_path, 'wb') as f:
f.write(engine.serialize()) # 序列化
return engine
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
# Transfer data from CPU to the GPU.
[cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
# Run inference.
context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
# Transfer predictions back from the GPU.
[cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
# Synchronize the stream
stream.synchronize()
# Return only the host outputs.
return [out.host for out in outputs]
def postprocess_the_outputs(outputs, shape_of_output):
outputs = outputs.reshape(*shape_of_output)
out = np.argmax(softmax(outputs,axis=1)[0,...],axis=0)
# import pdb;pdb.set_trace()
return out
# 验证TensorRT模型是否正确
onnx_model_path = './Net.onnx'
max_batch_size = 1
# These two modes are dependent on hardwares
fp16_mode = False
int8_mode = False
trt_engine_path = './model_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode)
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode , save_engine=True)
# Create the context for this engine
context = engine.create_execution_context()
# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine) # input, output: host # bindings
# Do inference
img_np_nchw = get_img_np_nchw(img_path)
inputs[0].host = img_np_nchw.reshape(-1)
shape_of_output = (max_batch_size, 2, 224, 224)
# inputs[1].host = ... for multiple input
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) # numpy data
t2 = time.time()
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
print('TensorRT ok')
print("Inference time with the TensorRT engine: {}".format(t2-t1))