Moving from CUDA ONNX to TensorRT code in Python
I got the errors below when running a model through ONNX Runtime (with the CUDA execution provider) and a model through TensorRT in the same script:
2023-11-26 11:46:35.483254243 [W:onnxruntime:, session_state.cc:1162 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2023-11-26 11:46:35.483279701 [W:onnxruntime:, session_state.cc:1164 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.
[11/26/2023-11:46:36] [TRT] [E] 1: [convolutionRunner.cpp::execute::391] Error Code 1: Cask (Cask convolution execution)
[11/26/2023-11:46:36] [TRT] [E] 1: [checkMacros.cpp::catchCudaError::272] Error Code 1: Cuda Runtime (invalid resource handle)
These errors do not appear when I run the models separately, or when I set providers=['CPUExecutionProvider'] instead of providers=['CUDAExecutionProvider'] for the ONNX Runtime session.
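To illustrate, the only change that makes the error go away (apart from running the models separately) is the provider list passed when the ONNX Runtime session is created; nothing else in the script changes:

import onnxruntime

onnx_model_path = "/app/models/buffalo_l/det_10g.onnx"

# Fails together with the TensorRT engine in the same process:
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=['CUDAExecutionProvider'])

# Works, but ONNX inference falls back to the CPU:
ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])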
I made this sample code that combines both models and reproduces the error:
import cv2
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
np.bool = np.bool_  # shim for code that still references the removed np.bool alias
import onnx
import onnxruntime
from profiling import GlobalProfTime, ProfTimer, mode_to_str
with GlobalProfTime('profile_tensorrt_10_000images') as t:
    with ProfTimer('TensorRT basic image profiler') as t:
        # TensorRT code
        # Load TensorRT engine
        TRT_ENGINE_PATH = '/app/models/buffalo_l/det_10g640x640.engine'  # Path to your TensorRT engine file

        # Create a runtime
        runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))

        # Deserialize the engine
        with open(TRT_ENGINE_PATH, 'rb') as f:
            engine_data = f.read()
        engine = runtime.deserialize_cuda_engine(engine_data)
        assert engine is not None

        # Create an execution context
        context = engine.create_execution_context()

        # Allocate memory for inputs and outputs
        inputs, outputs, bindings, stream = [], [], [], cuda.Stream()
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if engine.binding_is_input(binding):
                inputs.append({'host': host_mem, 'device': device_mem, 'name': binding,
                               'shape': engine.get_binding_shape(binding), 'type': engine.get_binding_dtype(binding)})
            else:
                outputs.append({'host': host_mem, 'device': device_mem, 'name': binding,
                                'shape': engine.get_binding_shape(binding), 'type': engine.get_binding_dtype(binding)})

        # Load the input image from file
        image_path = "/app/models/buffalo_l/image.png"
        image = cv2.imread(image_path)
        # Check that the image was loaded successfully
        assert image is not None

        # Preprocess the image
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (640, 640))
        image = image.astype(np.float32) / 255.0
        input_data = np.expand_dims(image.transpose(2, 0, 1), axis=0)

        # ONNX code
        onnx_model_path = "/app/models/buffalo_l/det_10g.onnx"
        onnx_model = onnx.load(onnx_model_path)

        # Create the ONNX Runtime session
        ort_session = onnxruntime.InferenceSession(onnx_model_path, providers=['CUDAExecutionProvider'])  # ['CPUExecutionProvider']

        # TensorRT code
        for _ in range(1):
            with ProfTimer('TensorRT per call') as t:
                # Copy the input data to the GPU
                cuda.memcpy_htod_async(inputs[0]['device'], input_data.ravel(), stream)
                # Run inference
                if context.execute_async(batch_size=1, bindings=bindings, stream_handle=stream.handle) == 0:
                    print("Error: Unable to launch TensorRT inference.")
                # Transfer predictions back from the GPU
                if cuda.memcpy_dtoh_async(outputs[0]['host'], outputs[0]['device'], stream) == 0:
                    print("Error: Unable to copy results from GPU to host.")
                # The result is now in outputs[0]['host']
                result = outputs[0]['host']
                # Synchronize the stream
                stream.synchronize()
                # Print the inference results
                print("Inference TensorRT Results:")
                print(result[:20])
            stream.synchronize()

        # ONNX code
        for _ in range(1):
            with ProfTimer('ONNX(CUDA) per call') as t:
                image_path = "/app/models/buffalo_l/image.png"
                image = cv2.imread(image_path)
                # Check that the image was loaded successfully
                assert image is not None
                # Continue with image processing
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                image = cv2.resize(image, (640, 640))
                image = image.astype(np.float32) / 255.0
                input_data = np.expand_dims(image.transpose(2, 0, 1), axis=0)
                # Run inference with the loaded image
                input_name = ort_session.get_inputs()[0].name
                outputs = ort_session.run(None, {input_name: input_data})
                # Print the inference results
                print("Inference ONNX Results:")
                print(f"{np.transpose(outputs[0][:20])}")