I'm trying to use the TensorRT framework to speed up inference for my deep learning model. I've written a very simple Python script to test TensorRT with PyTorch.
import torch
import argparse
import time
import numpy as np
import torch_tensorrt


# Define a simple PyTorch model
class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.relu1 = torch.nn.ReLU()
        self.conv2 = torch.nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.relu2 = torch.nn.ReLU()
        self.pool = torch.nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = torch.nn.Linear(64 * 16 * 16, 512)
        self.relu3 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(512, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool(x)
        x = x.view(-1, 64 * 16 * 16)
        x = self.fc1(x)
        x = self.relu3(x)
        x = self.fc2(x)
        return x


def compute(use_tensorrt=False):
    force_cpu = False
    useCuda = torch.cuda.is_available() and not force_cpu

    if useCuda:
        print('Using CUDA.')
        dtype = torch.cuda.FloatTensor
        ltype = torch.cuda.LongTensor
        device = torch.device("cuda:0")
    else:
        print('No CUDA available.')
        dtype = torch.FloatTensor
        ltype = torch.LongTensor
        device = torch.device("cpu")

    model = MyModel()
    input_shape = (8192, 3, 32, 32)

    if use_tensorrt:
        model = torch.compile(
            model,
            backend="torch_tensorrt",
            options={
                "truncate_long_and_double": True,
                "precision": dtype,
                "workspace_size": 20 << 30,
            },
            dynamic=False,
        )

    model = model.to(device)
    model.eval()

    num_iterations = 100
    total_time = 0.0

    with torch.no_grad():
        input_data = torch.randn(input_shape).to(device).type(dtype)

        # warmup
        for i in range(100):
            output_data = model(input_data)

        for i in range(num_iterations):
            start_time = time.time()
            output_data = model(input_data)
            end_time = time.time()
            total_time += end_time - start_time

    pytorch_fps = num_iterations / total_time
    print(f"PyTorch FPS: {pytorch_fps:.2f}")


if __name__ == "__main__":
    print("Without TensorRT")
    compute()
    print("With TensorRT")
    compute(use_tensorrt=True)
Unfortunately, when I run this code, I get approximately the same FPS with and without TensorRT, around 14.2, even with a significant warmup. Does anyone know what the issue could be? Is there something I'm missing?
Here is some more information about my setup:

Libraries:
torch 2.0.1
torch_tensorrt 1.4.0

CUDA toolkit:
nvcc: NVIDIA (R) Cuda compiler driver
Cuda compilation tools, release 11.5, V11.5.119
Build cuda_11.5.r11.5/compiler.30672275_0
As torch.compile was first released in PyTorch 2.0, much of it was still experimental and not documented very thoroughly. I have absolutely no idea why your implementation doesn't work, but you get the desired improvement by using torch_tensorrt.compile instead of torch.compile. On my system, with the same library versions, this yields a 3x improvement.
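Roughly, a minimal sketch of using torch_tensorrt.compile with the same settings as your script would look like the following. I'm writing the keyword arguments (inputs, enabled_precisions, truncate_long_and_double, workspace_size) from memory of the torch_tensorrt 1.4 API, so treat this as an approximation and double-check it against the docs rather than as the exact code I ran:

import torch
import torch_tensorrt

model = MyModel().to("cuda").eval()
input_shape = (8192, 3, 32, 32)

# Compile the module directly with torch_tensorrt instead of going
# through torch.compile; the kwargs mirror the options in the question.
trt_model = torch_tensorrt.compile(
    model,
    inputs=[torch_tensorrt.Input(input_shape)],
    enabled_precisions={torch.float32},  # precisions TensorRT may use
    truncate_long_and_double=True,       # demote int64/float64 values
    workspace_size=20 << 30,             # same 20 GiB workspace as above
)

with torch.no_grad():
    output = trt_model(torch.randn(input_shape, device="cuda"))

Note that, as far as I know, enabled_precisions expects torch dtypes such as torch.float32, not tensor types like torch.cuda.FloatTensor.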
If I had to guess, it probably has something to do with torch.compile failing with the options provided and defaulting to some other IR.
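If you want to check that theory, one thing you could try (just an idea, I haven't verified it on your exact versions) is to make TorchDynamo noisier so a backend failure is surfaced instead of being silently swallowed:

import torch
import torch._dynamo as dynamo

# Ask Dynamo to log more detail and to raise on backend errors rather
# than falling back to eager execution.
dynamo.config.verbose = True
dynamo.config.suppress_errors = False

model = torch.compile(MyModel().to("cuda").eval(), backend="torch_tensorrt")

Both flags exist in torch 2.0 as far as I remember, but the logging knobs moved around in later releases, so check torch._dynamo.config on your install.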