2022年8月16日

pytorch-yolov5-TRT加速推理

import ctypes
import logging
import threading
import time

import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt

# Path (relative to the working directory) of the compiled plugin shared library.
PLUGIN_LIBRARY = "yolov5_trt/build/libmyplugins.so"
# Load the shared library into this process at import time; the returned handle
# is intentionally discarded.
# NOTE(review): presumably this registers the custom TensorRT plugin layers the
# serialized engine depends on, so it must run before deserialization - confirm.
ctypes.CDLL(PLUGIN_LIBRARY)

class YoLov5TRT(object):
    """
    description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.

    Typical use: construct once, call ``infer(image)`` per frame, and call
    ``destroy()`` when finished. ``infer`` serializes concurrent callers with
    an internal lock because all calls share the same host/device buffers.

    NOTE: ``post_process`` calls ``self.non_max_suppression(...)``, which is
    not defined in this listing; it has to be provided elsewhere (for example
    by a subclass or mixin) before ``infer``/``post_process`` can be used.
    """

    # Layout of the flat output buffer as consumed by post_process():
    # one leading element holding the detection count, followed by
    # _MAX_DETECTIONS records of _DET_FIELDS values each (1 + 1000 * 6 = 6001).
    _DET_FIELDS = 6
    _MAX_DETECTIONS = 1000

    def __init__(self, engine_file_path="yolov5_trt/build/yolov5.engine", conf_thresh=0.3, iou_thresh=0.5, logger=None):
        """
        description: Deserialize the engine and allocate host/device buffers.
        param:
            engine_file_path: path of the serialized TensorRT engine file
            conf_thresh:      confidence threshold forwarded to non_max_suppression
            iou_thresh:       IoU threshold forwarded to non_max_suppression
            logger:           logging.Logger-like object; when None a module
                              level logger is used instead
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)
        self.lock = threading.Lock()

        self.conf_thresh = conf_thresh
        self.iou_thresh = iou_thresh
        # The logger is used unconditionally below and in infer(), so the
        # default of None must be replaced by a real logger.
        self.logger = logger if logger is not None else logging.getLogger(__name__)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            shape = engine.get_binding_shape(binding)
            self.logger.info(f"trt bingding {binding}, {shape}")
            size = trt.volume(shape) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dimensions of the input binding are taken as (H, W).
                self.input_w = shape[-1]
                self.input_h = shape[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
        # Staging buffer for one batch. Zero-filled float32: infer() only
        # writes slot 0, so the remaining slots must not hold uninitialized
        # memory, and float32 matches what preprocess_image() produces.
        self.batch_input_image = np.zeros(
            shape=[self.batch_size, 3, self.input_h, self.input_w], dtype=np.float32
        )

    def infer(self, image):
        """
        description: Run preprocessing, TensorRT inference and postprocessing on one image.
        param:
            image: HWC image array accepted by preprocess_image
        return:
            the value returned by post_process (originally annotated as
            "[n, 6] xywh,score,classid" - the exact layout depends on
            non_max_suppression, which is not part of this listing)
        """
        # Preprocessing touches no shared state, so it runs outside the lock.
        image, origin_hwc = self.preprocess_image(image)
        # The lock is held until post_process() is done because `output`
        # below is the shared host buffer that the next call overwrites.
        with self.lock:
            t0_dec = time.time()
            # Make self the active context, pushing it on top of the context stack.
            self.ctx.push()
            try:
                np.copyto(self.batch_input_image[0], image)
                # Copy input image to host buffer
                np.copyto(self.host_inputs[0], self.batch_input_image.ravel())
                # Transfer input data  to the GPU.
                cuda.memcpy_htod_async(self.cuda_inputs[0], self.host_inputs[0], self.stream)
                # Run inference.
                self.context.execute_async(batch_size=self.batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
                # Transfer predictions back from the GPU.
                cuda.memcpy_dtoh_async(self.host_outputs[0], self.cuda_outputs[0], self.stream)
                # Synchronize the stream
                self.stream.synchronize()
                # Here we use the first row of output in that batch_size = 1
                output = self.host_outputs[0]
            finally:
                # Remove any context from the top of the context stack, deactivating it.
                # Done in `finally` so a failed inference cannot leave the
                # context pushed for subsequent calls.
                self.ctx.pop()
            t1_dec = time.time()
            self.logger.debug("trt stage: {:.1f}ms.".format(1000 * (t1_dec - t0_dec)))

            results = self.post_process(
                output[: 1 + self._MAX_DETECTIONS * self._DET_FIELDS],
                origin_hwc[0],
                origin_hwc[1],
            )
        return results  # [n, 6] xywh,score,classid

    def preprocess_image(self, rgb_image):
        """
        description: Letterbox an image to the network input size and convert it to NCHW float32.
        param:
            rgb_image: array of shape (h, w, c); named RGB by the caller's
                       convention - no channel reordering is done here
        return:
            image:     float32 array of shape (1, c, input_h, input_w), values in [0, 1]
            (h, w, c): shape of the original image
        """
        # Calculate width and height and paddings
        h, w, c = rgb_image.shape

        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            # Width is the limiting side: fill it and pad top/bottom.
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            # Height is the limiting side: fill it and pad left/right.
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(rgb_image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, (h, w, c)

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            boxes: whatever self.non_max_suppression returns for the first
                   num_boxes records, called with conf_thres=self.conf_thresh
                   and nms_thres=self.iou_thresh
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two dimensional ndarray and keep only the valid records
        pred = np.reshape(output[1:], (-1, self._DET_FIELDS))[:num, :]
        # Do nms
        boxes = self.non_max_suppression(pred, origin_h, origin_w, conf_thres=self.conf_thresh, nms_thres=self.iou_thresh)

        return boxes

    def destroy(self):
        """
        description: Pop the CUDA context from the context stack.

        NOTE(review): presumably this balances the context created by
        make_context() in __init__ and should be called once when the
        instance is no longer needed - confirm against the caller.
        """
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()

其中:

class pycuda.driver.Context相关文档

  • push() —— Make self the active context, pushing it on top of the context stack. CUDA 2.0 and above only.
  • pop() —— (static) Remove any context from the top of the context stack, deactivating it. CUDA 2.0 and above only.
  • 多个进程之间可以通过上述 push()/pop() 两个方法隔离各自的 CUDA 上下文;但同一进程内部的多个线程仍需要单独加锁处理(如上面代码中的 self.lock)

Share

You may also like...

发表评论

您的电子邮箱地址不会被公开。