CUDA Execution Provider in ONNX Runtime causes an error when combining TensorRT and ONNX in the same code


I am moving from ONNX with the CUDA provider to TensorRT code in Python.

I get this error while running a model from ONNX (with the CUDA Execution Provider) and a model from TensorRT in the same code.

These are the errors:

2023-11-26 11:46:35.483254243 [W:onnxruntime:, session_state.cc:1162 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-11-26 11:46:35.483279701 [W:onnxruntime:, session_state.cc:1164 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.
[11/26/2023-11:46:36] [TRT] [E] 1: [convolutionRunner.cpp::execute::391] Error Code 1: Cask (Cask convolution execution)
[11/26/2023-11:46:36] [TRT] [E] 1: [checkMacros.cpp::catchCudaError::272] Error Code 1: Cuda Runtime (invalid resource handle)

These errors do not appear when I run the models separately, or when I set providers=['CPUExecutionProvider'] instead of providers=['CUDAExecutionProvider'].
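My guess is that pycuda.autoinit and the CUDA Execution Provider each set up their own CUDA context, so the TensorRT engine may end up executing while the wrong context is current, which would explain the invalid resource handle. Below is a minimal sketch of what I intend to try: managing the pycuda context explicitly instead of using pycuda.autoinit. The push/pop placement is my assumption, not something confirmed by the TensorRT or ONNX Runtime docs.

import pycuda.driver as cuda

cuda.init()
ctx = cuda.Device(0).make_context()  # explicit context instead of pycuda.autoinit; it is current right after creation
ctx.pop()                            # release it so ONNX Runtime can set up its own CUDA state

# ... create the ONNX Runtime InferenceSession with CUDAExecutionProvider here ...

ctx.push()                           # make our context current again before any TensorRT / pycuda call
try:
    # deserialize the engine, allocate buffers, execute_async, memcpy, stream.synchronize()
    pass
finally:
    ctx.pop()                        # hand the GPU back once the TensorRT work is done

ctx.detach()                         # clean up at program exit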

I made this sample code that combines both models and reproduces the error:

import cv2
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
np.bool = np.bool_

import onnx
import onnxruntime

from profiling import GlobalProfTime, ProfTimer, mode_to_str


with GlobalProfTime('profile_tensorrt_10_000images') as t:
    with ProfTimer('TensorRT basic image profiler') as t:

        # TensorRT code
        # Load TensorRT Engine
        TRT_ENGINE_PATH = '/app/models/buffalo_l/det_10g640x640.engine'  # Path to your TensorRT engine file

        # Create a runtime
        runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))

        # Deserialize the engine
        with open(TRT_ENGINE_PATH, 'rb') as f:
            engine_data = f.read()
            engine = runtime.deserialize_cuda_engine(engine_data)

        assert engine is not None

        # Create an execution context
        context = engine.create_execution_context()
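
        # Debug helper (my addition): print each binding's name, shape and dtype
        # to confirm the engine really expects the 1x3x640x640 input prepared below
        for b in engine:
            print(b, engine.get_binding_shape(b), engine.get_binding_dtype(b))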

        # Allocate memory for inputs and outputs
        inputs, outputs, bindings, stream = [], [], [], cuda.Stream()
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if engine.binding_is_input(binding):
                inputs.append({'host': host_mem, 'device': device_mem, 'name':  binding, 'shape': engine.get_binding_shape(binding), 'type': engine.get_binding_dtype(binding)})
            else:
                outputs.append({'host': host_mem, 'device': device_mem, 'name':  binding, 'shape': engine.get_binding_shape(binding), 'type': engine.get_binding_dtype(binding)})


        # Load and preprocess the input image from file (moved out of the binding
        # loop above; it only needs to run once)
        image_path = "/app/models/buffalo_l/image.png"
        image = cv2.imread(image_path)

        # Check that the image loaded successfully
        assert image is not None

        # Preprocess the image: BGR -> RGB, resize to 640x640, scale to [0, 1], NCHW
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (640, 640))
        image = image.astype(np.float32) / 255.0
        input_data = np.expand_dims(image.transpose(2, 0, 1), axis=0)






        # ONNX code
        onnx_model_path = "/app/models/buffalo_l/det_10g.onnx"
        onnx_model = onnx.load(onnx_model_path)  # loaded for completeness; only the Runtime session below is used
        # Create ONNX Runtime session
        ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=['CUDAExecutionProvider'])  # no error with ['CPUExecutionProvider']
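        # Sanity check (my addition): confirm the CUDA Execution Provider is actually active;
        # if CUDA initialisation fails, ONNX Runtime silently falls back to CPU
        print("Available providers:", onnxruntime.get_available_providers())
        print("Session providers:  ", ort_session.get_providers())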








        #TensorRT code
        for _ in range(1):
            with ProfTimer('TensorRT per call') as t:
                # Copy the input data to the GPU
                cuda.memcpy_htod_async(inputs[0]['device'], input_data.ravel(), stream)
                # Run inference (execute_async returns False on failure)
                if not context.execute_async(batch_size=1, bindings=bindings, stream_handle=stream.handle):
                    print("Error: Unable to launch TensorRT inference.")
                # Transfer predictions back from the GPU
                # (memcpy_dtoh_async returns None, so there is no status code to check)
                cuda.memcpy_dtoh_async(outputs[0]['host'], outputs[0]['device'], stream)
                # The result is now in outputs[0]['host']
                result = outputs[0]['host']

                
                # Synchronize the stream
                stream.synchronize()

                # Print the inference results
                print("Inference TensorRT Results:")
                print(result[:20])
        stream.synchronize()
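        # Note (assumption): the host buffer is flat; reshape it with the binding
        # shape recorded during allocation if structured detections are needed, e.g.
        # det_out = outputs[0]['host'].reshape(tuple(outputs[0]['shape']))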









        #ONNX code
        for _ in range(1):
            with ProfTimer('ONNX(CUDA) per call') as t:
                image_path = "/app/models/buffalo_l/image.png"
                image = cv2.imread(image_path)
                # Check that the image loaded successfully
                assert image is not None

                # Preprocess the image the same way as for TensorRT
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                image = cv2.resize(image, (640, 640))
                image = image.astype(np.float32) / 255.0
                input_data = np.expand_dims(image.transpose(2, 0, 1), axis=0)
                # Run inference with the loaded image
                input_name = ort_session.get_inputs()[0].name
                ort_outputs = ort_session.run(None, {input_name: input_data})  # renamed so the TensorRT `outputs` buffers are not overwritten
                # Print the inference results
                print("Inference ONNX Results:")
                print(f"{np.transpose(ort_outputs[0][:20])}")






