import ctypes
import logging
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
PLUGIN_LIBRARY = "yolov5_trt/build/libmyplugins.so"
ctypes.CDLL(PLUGIN_LIBRARY)
class YoLov5TRT(object):
"""
description: A YOLOv5 class that wraps the TensorRT inference, preprocessing and postprocessing ops.
"""
def __init__(self, engine_file_path="yolov5_trt/build/yolov5.engine", conf_thresh=0.3, iou_thresh=0.5, logger=None):
# Create a CUDA context on this device.
self.ctx = cuda.Device(0).make_context()
stream = cuda.Stream()
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)
self.lock = threading.Lock()
self.conf_thresh = conf_thresh
self.iou_thresh = iou_thresh
# Fall back to a module-level logger so the logging calls below do not fail when logger is None.
self.logger = logger if logger is not None else logging.getLogger(__name__)
# Deserialize the engine from file
with open(engine_file_path, "rb") as f:
engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
host_inputs = []
cuda_inputs = []
host_outputs = []
cuda_outputs = []
bindings = []
for binding in engine:
self.logger.info(f"trt binding {binding}, {engine.get_binding_shape(binding)}")
size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
cuda_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(cuda_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
self.input_w = engine.get_binding_shape(binding)[-1]
self.input_h = engine.get_binding_shape(binding)[-2]
host_inputs.append(host_mem)
cuda_inputs.append(cuda_mem)
else:
host_outputs.append(host_mem)
cuda_outputs.append(cuda_mem)
# Store
self.stream = stream
self.context = context
self.engine = engine
self.host_inputs = host_inputs
self.cuda_inputs = cuda_inputs
self.host_outputs = host_outputs
self.cuda_outputs = cuda_outputs
self.bindings = bindings
self.batch_size = engine.max_batch_size
self.batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w], dtype=np.float32)
def infer(self, image):
# Letterbox, normalize and reorder the input image to NCHW before inference.
image, origin_hwc = self.preprocess_image(image)
self.lock.acquire()
t0_dec = time.time()
# Make self the active context, pushing it on top of the context stack.
self.ctx.push()
np.copyto(self.batch_input_image[0], image)
# Copy input image to host buffer
np.copyto(self.host_inputs[0], self.batch_input_image.ravel())
start = time.time()
# Transfer input data to the GPU.
cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
# Run inference.
self.context.execute_async(batch_size=self.batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
# Transfer predictions back from the GPU.
cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
# Synchronize the stream
self.stream.synchronize()
# Here we use only the first output buffer because batch_size = 1
output = self.host_outputs[0]
# Remove any context from the top of the context stack, deactivating it.
self.ctx.pop()
t1_dec = time.time()
self.logger.debug("trt stage: {:.1f}ms.".format(1000 * (t1_dec - t0_dec)))
results = self.post_process(
output[0: 6001], origin_hwc[0], origin_hwc[1]
)
self.lock.release()
return results # [n, 6] xywh,score,classid
def preprocess_image(self, rgb_image):
# Calculate width, height and paddings
h, w, c = rgb_image.shape
r_w = self.input_w / w
r_h = self.input_h / h
if r_h > r_w:
tw = self.input_w
th = int(r_w * h)
tx1 = tx2 = 0
ty1 = int((self.input_h - th) / 2)
ty2 = self.input_h - th - ty1
else:
tw = int(r_h * w)
th = self.input_h
tx1 = int((self.input_w - tw) / 2)
tx2 = self.input_w - tw - tx1
ty1 = ty2 = 0
# Resize the image along the long side while maintaining the aspect ratio
image = cv2.resize(rgb_image, (tw, th))
# Pad the short side with (128,128,128)
image = cv2.copyMakeBorder(
image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
)
image = image.astype(np.float32)
# Normalize to [0,1]
image /= 255.0
# HWC to CHW format:
image = np.transpose(image, [2, 0, 1])
# CHW to NCHW format
image = np.expand_dims(image, axis=0)
# Convert the image to row-major order, also known as "C order":
image = np.ascontiguousarray(image)
return image, (h,w,c)
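# Worked example of the letterbox math above (illustrative numbers only, assuming
# a 640x640 engine input and a hypothetical 1280x720 frame):
#   r_w = 640 / 1280 = 0.5, r_h = 640 / 720 ≈ 0.889, so r_h > r_w
#   tw = 640, th = int(0.5 * 720) = 360, tx1 = tx2 = 0
#   ty1 = int((640 - 360) / 2) = 140, ty2 = 640 - 360 - 140 = 140
#   i.e. the frame is resized to 640x360 and padded with 140 grey rows above and below.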
def post_process(self, output, origin_h, origin_w):
"""
description: postprocess the prediction
param:
output: A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
origin_h: height of original image
origin_w: width of original image
return:
result_boxes: finally boxes, a boxes numpy, each row is a box [x1, y1, x2, y2]
result_scores: finally scores, a numpy, each element is the score correspoing to box
result_classid: finally classid, a numpy, each element is the classid correspoing to box
"""
# Get the num of boxes detected
num = int(output[0])
# Reshape to a two-dimensional ndarray
pred = np.reshape(output[1:], (-1, 6))[:num, :]
# Do nms
boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=self.conf_thresh, nms_thres=self.iou_thresh)
return boxes
def destroy(self):
# Remove any context from the top of the context stack, deactivating it.
self.ctx.pop()
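The non_max_suppression helper called in post_process is not shown in the listing above. Purely as an illustration of what such a helper does, here is a minimal, class-agnostic greedy NMS sketch over rows of [cx, cy, w, h, conf, cls_id]. It is an assumption rather than the original implementation, and it omits the mapping from the letterboxed network input back to original-image coordinates that the real helper would also need to handle.

```python
import numpy as np

def non_max_suppression_sketch(pred, conf_thres=0.3, nms_thres=0.5):
    """Illustrative greedy NMS over rows of [cx, cy, w, h, conf, cls_id]."""
    # Drop low-confidence detections first.
    pred = pred[pred[:, 4] >= conf_thres]
    if pred.shape[0] == 0:
        return pred
    # Convert center-format boxes to corner format for the IoU computation.
    boxes = pred.copy()
    boxes[:, 0] = pred[:, 0] - pred[:, 2] / 2  # x1
    boxes[:, 1] = pred[:, 1] - pred[:, 3] / 2  # y1
    boxes[:, 2] = pred[:, 0] + pred[:, 2] / 2  # x2
    boxes[:, 3] = pred[:, 1] + pred[:, 3] / 2  # y2
    # Process boxes in descending confidence order.
    boxes = boxes[boxes[:, 4].argsort()[::-1]]
    keep = []
    while boxes.shape[0]:
        best, rest = boxes[0], boxes[1:]
        keep.append(best)
        if rest.shape[0] == 0:
            break
        # IoU between the highest-confidence box and all remaining boxes.
        ix1 = np.maximum(best[0], rest[:, 0])
        iy1 = np.maximum(best[1], rest[:, 1])
        ix2 = np.minimum(best[2], rest[:, 2])
        iy2 = np.minimum(best[3], rest[:, 3])
        inter = np.clip(ix2 - ix1, 0, None) * np.clip(iy2 - iy1, 0, None)
        area_best = (best[2] - best[0]) * (best[3] - best[1])
        area_rest = (rest[:, 2] - rest[:, 0]) * (rest[:, 3] - rest[:, 1])
        iou = inter / (area_best + area_rest - inter + 1e-16)
        # Keep only boxes that do not overlap the chosen one too much.
        boxes = rest[iou <= nms_thres]
    return np.stack(keep)
```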
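A minimal usage sketch of the class, assuming the default engine/plugin paths above, a hypothetical test.jpg on disk and the standard logging module:

```python
import logging

import cv2

logging.basicConfig(level=logging.DEBUG)

# "test.jpg" and the logger name are assumptions for this example.
detector = YoLov5TRT(conf_thresh=0.3, iou_thresh=0.5, logger=logging.getLogger("yolov5_trt"))
try:
    frame = cv2.imread("test.jpg")
    detections = detector.infer(frame)  # one [n, 6] array of detections
    print(f"{len(detections)} objects detected")
finally:
    # Pop the CUDA context created in __init__ so the process can exit cleanly.
    detector.destroy()
```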
Regarding the class pycuda.driver.Context calls used above, the relevant documentation is:
- push(): Make self the active context, pushing it on top of the context stack. CUDA 2.0 and above only.
- pop(): Remove any context from the top of the context stack, deactivating it.
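For reference, the push/pop pattern used by infer() and destroy() boils down to the following standalone sketch (independent of the engine above; pycuda.autoinit is deliberately not imported so the context is created explicitly):

```python
import numpy as np
import pycuda.driver as cuda

cuda.init()
ctx = cuda.Device(0).make_context()  # creates a context and makes it current (pushes it)
ctx.pop()                            # deactivate it so the creating code can move on

# Whenever GPU work is needed (possibly from another thread), activate the context again:
ctx.push()                           # make self the active context, on top of the context stack
dev_buf = cuda.mem_alloc(1024)       # CUDA calls now run against the active context
cuda.memcpy_htod(dev_buf, np.zeros(256, dtype=np.float32))
ctx.pop()                            # remove it from the top of the stack, deactivating it

ctx.detach()                         # drop the reference so the context can be cleaned up at exit
```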