cuMemcpyHtoDAsync failed: invalid argument when using TensorRT (Python)

305 Views Asked by At

I am trying to copy an np array to the GPU using TensorRT in Python but I keep getting the error 'cuMemcpyHtoDAsync failed: invalid argument'. The array has the correct format (float32) and size, but the error remains. Does anyone have an idea of what I am doing wrong or how I can fix this error?

import tensorrt as trt
import pycuda.driver as cuda
import numpy as np
import cv2

class _DeviceBindings(list):
    """List of device pointers that also owns the allocations behind them."""

    def __init__(self):
        super().__init__()
        # PyCUDA releases a DeviceAllocation when its Python object is
        # collected, so the objects must live as long as the raw pointers.
        self.allocations = []


_cuda_context = None


def _ensure_cuda_context():
    """Create one CUDA context on first use and pop it at interpreter exit."""
    global _cuda_context
    if _cuda_context is None:
        import atexit

        cuda.init()
        _cuda_context = cuda.Device(0).make_context()
        # Leaving the context on the stack makes PyCUDA complain at shutdown.
        atexit.register(_cuda_context.pop)
    return _cuda_context


def allocate_buffers(engine):
    """Allocate one host/device buffer pair per engine binding.

    Args:
        engine: TensorRT engine; iterated to enumerate its bindings.

    Returns:
        A tuple ``(inputs, outputs, bindings, stream)``:
        ``inputs`` / ``outputs`` are lists of page-locked host arrays,
        ``bindings`` is a list of device pointers (as ``int``) in engine
        binding order, and ``stream`` is a new ``cuda.Stream``.

    Raises:
        ValueError: if a binding has a negative element count, which is
            what a dynamic (-1) dimension produces.

    Note:
        The device memory stays valid only while ``bindings`` is alive,
        because ``bindings`` holds the owning allocation objects.
    """
    # Reuse a single context instead of pushing a new one on every call.
    _ensure_cuda_context()
    stream = cuda.Stream()

    inputs = []
    outputs = []
    bindings = _DeviceBindings()

    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        if size < 0:
            raise ValueError(
                f"binding {binding!r} has a dynamic shape; "
                "cannot size its buffer from the engine alone"
            )
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Keep the allocation object alive; storing only int(device_mem)
        # would let it be freed and leave a dangling pointer.
        bindings.allocations.append(device_mem)
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(host_mem)
        else:
            outputs.append(host_mem)
    return inputs, outputs, bindings, stream

def do_inference(context, bindings, inputs, outputs, stream):
    """Copy inputs to the GPU, run the engine, and copy the results back.

    Args:
        context: TensorRT execution context.
        bindings: Device pointers; the first ``len(inputs)`` entries are
            treated as input buffers and the remainder as output buffers.
        inputs: Host arrays to upload, paired in order with the input
            bindings.
        outputs: Host arrays that receive the results, paired in order with
            the output bindings. They are filled in place.
        stream: CUDA stream on which the copies and execution are queued.

    Returns:
        None. Results are available in ``outputs`` once this returns,
        because the stream is synchronized before returning.
    """
    input_count = len(inputs)

    # Transfer input data to the GPU.
    for device_ptr, host_buf in zip(bindings[:input_count], inputs):
        cuda.memcpy_htod_async(device_ptr, host_buf, stream)

    # Run inference.
    context.execute_async(bindings=bindings, stream_handle=stream.handle)

    # Transfer predictions back from the GPU.
    for host_buf, device_ptr in zip(outputs, bindings[input_count:]):
        cuda.memcpy_dtoh_async(host_buf, device_ptr, stream)

    # Block until every queued operation has finished.
    stream.synchronize()

def detect_objects(image, engine, context, threshold=0.5):
    """Run the engine on one image and return the rows above ``threshold``.

    Args:
        image: Image array as returned by ``cv2.imread`` (height, width,
            channels).
        engine: TensorRT engine used to size the buffers.
        context: Execution context created from ``engine``.
        threshold: Rows whose first column is not greater than this value
            are discarded.

    Returns:
        A 2-D array holding, for every kept row, all columns after the first.

    Raises:
        ValueError: if the preprocessed image does not have as many elements
            as the engine's first input buffer.
    """
    # Preprocess: resize to 640x640, HWC -> CHW, add a batch axis, scale to [0, 1].
    image = cv2.resize(image, (640, 640))
    image = np.transpose(image, (2, 0, 1))
    image = np.expand_dims(image, axis=0)
    tensor = np.ascontiguousarray(image, dtype=np.float32) / 255.0
    print(tensor.shape)
    print(tensor.dtype)

    # Allocate buffers
    inputs, outputs, bindings, stream = allocate_buffers(engine)

    # Fill the page-locked buffer rather than replacing it: the device
    # allocation was sized from that buffer, and the async copy expects
    # pinned host memory.
    if tensor.size != inputs[0].size:
        raise ValueError(
            f"input has {tensor.size} elements but the engine expects "
            f"{inputs[0].size}"
        )
    np.copyto(inputs[0], tensor.ravel())

    # Run inference
    do_inference(context, bindings, inputs, outputs, stream)

    # The host output buffer is flat; rebuild rows from the width of the
    # first output binding so that column indexing works.
    output_name = next(
        name for name in engine if not engine.binding_is_input(name)
    )
    row_width = tuple(engine.get_binding_shape(output_name))[-1]
    detections = outputs[0].reshape(-1, row_width)

    # NOTE(review): column 0 is treated as the score and the remaining
    # columns as the box, as in the original code - confirm against the
    # model's actual output layout.
    detections = detections[detections[:, 0] > threshold]
    boxes = detections[:, 1:]

    return boxes

# Load the engine.
# The logger and runtime are bound to names so they are not temporaries that
# may be collected while the engine is still in use.
trt_logger = trt.Logger(trt.Logger.WARNING)
trt_runtime = trt.Runtime(trt_logger)
with open("Modelle/best.engine", "rb") as engine_file:
    engine = trt_runtime.deserialize_cuda_engine(engine_file.read())
if engine is None:
    raise RuntimeError("could not deserialize engine: Modelle/best.engine")
context = engine.create_execution_context()

# Read the image; cv2.imread returns None instead of raising when it fails.
image = cv2.imread("Test.jpg")
if image is None:
    raise FileNotFoundError("could not read image: Test.jpg")

# Detect objects in the image
boxes = detect_objects(image, engine, context)

print(boxes)

Or am I doing something fundamentally wrong when loading the TensorRT file? Is there another way to index an object on an image?

Thanks

1

There is 1 best solution below

0
On

This seems to be an alignment or size issue.

If you use cuda.mem_alloc_like, cuda.memcpy_htod_async no longer fails.

# Page-locked host buffer for one binding; `size` and `dtype` are the values
# computed per binding in the question's allocate_buffers loop.
host_mem = cuda.pagelocked_empty(size, dtype)
# Device buffer derived from the host array itself.
# NOTE(review): presumably equivalent in byte count to
# cuda.mem_alloc(host_mem.nbytes) - confirm in the PyCUDA docs.
device_mem = cuda.mem_alloc_like(host_mem)