Identifying the cause of poor training performance on RTX 4090

Hardware specs-

  • CPU- Ryzen 3970X Threadripper
  • GPU- 2x4090
  • OS- Ubuntu 22.04 (PyTorch Docker Image)
  • RAM- 128GB

nvidia-smi

+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:21:00.0 Off |                  Off |
|  0%   46C    P8              37W / 450W |    269MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        Off | 00000000:49:00.0 Off |                  Off |
|  0%   49C    P8              28W / 450W |     11MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Jul_11_02:20:44_PDT_2023
Cuda compilation tools, release 12.2, V12.2.128
Build cuda_12.2.r12.2/compiler.33053471_0
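
For completeness, a quick sanity check of what the PyTorch build inside the container sees (a minimal sketch; I'm not reproducing its output here):

import torch
# Confirm the torch build, the CUDA version it was compiled against,
# and that both cards are visible to PyTorch.
print(torch.__version__, torch.version.cuda)
print(torch.cuda.device_count(), torch.cuda.get_device_name(0))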

For some reason, training on the GPUs is noticeably slower than on my other machine, which has a 3080Ti and a 13600K (running under WSL 2.0). Even when I restrict the comparison to a single 4090, the 3080Ti machine is much faster than the 4090 machine.
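
To keep the single-GPU comparison fair, the process can be pinned to one card before PyTorch initializes CUDA (a minimal sketch, assuming a single-process run; the environment variable must be set before the first CUDA call):

import os
# Restrict this process to GPU 0 so the run really uses a single 4090.
# This must happen before torch initializes CUDA, hence before importing torch.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
print(torch.cuda.device_count())  # should now print 1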

The training script, for reference (written following a DataCamp tutorial)-

import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.profiler import profile, record_function, ProfilerActivity
import torch

MODEL_NAME = "t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Acquire the training data from Hugging Face
DATA_NAME = "yahoo_answers_qa"
yahoo_answers_qa = load_dataset(DATA_NAME)

yahoo_answers_qa = yahoo_answers_qa["train"].train_test_split(test_size=100, train_size=1000)

# We prefix every question with an instruction prompt
prefix = "Please answer this question: "

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["question"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)
  
   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["answer"], 
                      max_length=512,         
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

# Map the preprocessing function across our dataset
tokenized_dataset = yahoo_answers_qa.map(preprocess_function, batched=True)

nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # Replace the -100 ignore-index in the labels with the pad token id, then decode
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  
   return result

# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 2 # small batch size due to limited resources on the 3080ti machine
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 1

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)

trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

torch.cuda.synchronize()

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True) as prof:
    trainer.train()
print(prof.key_averages().table(row_limit=15, sort_by="self_cuda_time_total"))

I tried using PyTorch's built-in profilers to figure out what could be causing this, but I couldn't make much sense of the output (I tried both torch.profiler and torch.utils.bottleneck).
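
For a timeline view, the same profiler object can also export a Chrome trace, which may be easier to read than the summary table (a minimal sketch using the prof object from the training script above):

# After the profiled trainer.train() run:
prof.export_chrome_trace("trace.json")  # open in chrome://tracing or https://ui.perfetto.dev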

Here are the results of the various runs on both machines-

torch.profiler on 4090

Map: 100%|██████████| 1000/1000 [00:00<00:00, 4441.05 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 3872.25 examples/s]
STAGE:2024-03-21 02:18:26 270203:270203 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
{'loss': 4.1186, 'learning_rate': 0.0, 'epoch': 1.0}                                                                                                                                                                                                                                     
100%|██████████| 500/500 [00:48<00:00, 10.12it/s]
{'eval_loss': 3.9080231189727783, 'eval_rouge1': 0.13428924748788773, 'eval_rouge2': 0.02317430268607329, 'eval_rougeL': 0.11411167280744719, 'eval_rougeLsum': 0.11964790797296734, 'eval_runtime': 6.5172, 'eval_samples_per_second': 15.344, 'eval_steps_per_second': 3.836, 'epoch': 1.0}
{'train_runtime': 56.4093, 'train_samples_per_second': 17.728, 'train_steps_per_second': 8.864, 'train_loss': 4.1186474609375, 'epoch': 1.0}                                                                                                                                             
100%|██████████| 500/500 [00:56<00:00,  8.86it/s]
STAGE:2024-03-21 02:19:30 270203:270203 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-03-21 02:19:33 270203:270203 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               aten::mm        11.72%        4.517s        17.70%        6.821s      39.563us        1.492s        25.09%        1.673s       9.703us           0 b           0 b     151.09 Gb     151.09 Gb        172400  
                                    aten::_foreach_mul_         0.49%     189.232ms         0.61%     235.115ms     117.558us     789.891ms        13.28%     792.785ms     396.392us           0 b           0 b           0 b           0 b          2000  
void at::native::(anonymous namespace)::multi_tensor...         0.00%       0.000us         0.00%       0.000us       0.000us     543.324ms         9.14%     543.324ms     155.235us           0 b           0 b           0 b           0 b          3500  
                                aten::_foreach_addcdiv_         0.28%     109.787ms         0.32%     123.529ms     123.529us     535.961ms         9.01%     536.923ms     536.923us           0 b           0 b           0 b           0 b          1000  
void at::native::(anonymous namespace)::multi_tensor...         0.00%       0.000us         0.00%       0.000us       0.000us     535.961ms         9.01%     535.961ms     267.981us           0 b           0 b           0 b           0 b          2000  
                                              aten::bmm         4.20%        1.619s         5.53%        2.132s      32.012us     418.858ms         7.04%     462.192ms       6.940us           0 b           0 b      10.87 Gb      10.87 Gb         66600  
                                aten::_foreach_addcmul_         0.27%     104.808ms         0.32%     121.843ms     121.843us     403.754ms         6.79%     404.620ms     404.620us           0 b           0 b           0 b           0 b          1000  
void at::native::(anonymous namespace)::multi_tensor...         0.00%       0.000us         0.00%       0.000us       0.000us     403.754ms         6.79%     403.754ms     201.877us           0 b           0 b           0 b           0 b          2000  
                                   aten::_foreach_lerp_         0.09%      34.689ms         0.14%      52.157ms      52.157us     402.333ms         6.76%     402.989ms     402.989us           0 b           0 b           0 b           0 b          1000  
void at::native::(anonymous namespace)::multi_tensor...         0.00%       0.000us         0.00%       0.000us       0.000us     402.333ms         6.76%     402.333ms     201.167us           0 b           0 b           0 b           0 b          2000  
                                       cudaLaunchKernel        17.51%        6.747s        17.51%        6.747s       8.644us     370.963ms         6.24%     370.963ms       0.475us        -936 b        -936 b     -71.47 Mb     -71.47 Mb        780540  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     275.655ms         4.63%     275.655ms       3.952us           0 b           0 b           0 b           0 b         69751  
                                    aten::_foreach_sqrt         0.37%     142.381ms         1.67%     644.885ms     644.885us     273.307ms         4.60%     274.641ms     274.641us           0 b           0 b     112.70 Gb           0 b          1000  
void at::native::(anonymous namespace)::multi_tensor...         0.00%       0.000us         0.00%       0.000us       0.000us     273.307ms         4.60%     273.307ms     136.654us           0 b           0 b           0 b           0 b          2000  
                                    aten::_foreach_add_         0.56%     217.295ms         2.39%     921.531ms     460.765us     272.497ms         4.58%     273.238ms     136.619us           8 b    -171.88 Kb           0 b           0 b          2000  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 38.541s
Self CUDA time total: 5.947s
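
What stands out to me is that self CPU time (38.541s) dwarfs self CUDA time (5.947s), so I assume the run spends most of its time on the CPU side (e.g. the 6.747s in cudaLaunchKernel) rather than doing GPU work. To dig into that, the same averages can be re-sorted by CPU time (a minimal sketch, again using the prof object from above):

print(prof.key_averages().table(row_limit=15, sort_by="self_cpu_time_total"))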

I don't have torch.profiler results for the 3080Ti machine, since the process gets killed due to OOM, but I do have the training output to show that training is indeed faster there-

Map: 100%|██████████| 1000/1000 [00:00<00:00, 6551.93 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 6014.12 examples/s]

STAGE:2024-03-20 16:38:27 1468:1468 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
{'loss': 4.1458, 'grad_norm': 2.1316089630126953, 'learning_rate': 0.0, 'epoch': 1.0}                                                                                                                                                                                                                                                                                             
100%|██████████| 500/500 [00:20<00:00, 23.12it/s]
UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
  warnings.warn(
{'eval_loss': 3.9349136352539062, 'eval_rouge1': 0.153426813681583, 'eval_rouge2': 0.02742618897451911, 'eval_rougeL': 0.1270918468217938, 'eval_rougeLsum': 0.1377312505422213, 'eval_runtime': 3.9379, 'eval_samples_per_second': 25.395, 'eval_steps_per_second': 6.349, 'epoch': 1.0}                                                                                         
{'train_runtime': 25.2432, 'train_samples_per_second': 39.615, 'train_steps_per_second': 19.807, 'train_loss': 4.1458447265625, 'epoch': 1.0}                                                                                                                                                                                                                                     
100%|██████████| 500/500 [00:25<00:00, 19.81it/s]
STAGE:2024-03-20 16:39:03 1468:1468 ActivityProfilerController.cpp:320] Completed Stage: Collection
Killed

I was able to run the bottleneck module on the 3080Ti machine using a very simple matrix-multiplication loop. Showing only the autograd profiler output (CUDA mode) for both machines-

torch.utils.bottleneck on 4090

-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
    cudaDeviceSynchronize        43.12%     443.696ms        43.12%     443.696ms     443.696ms       0.000us         0.00%       0.000us       0.000us             1  
    cudaDeviceSynchronize        39.95%     411.095ms        39.95%     411.095ms     411.095ms       0.000us         0.00%       0.000us       0.000us             1  
               aten::rand         0.01%      63.000us         7.91%      81.361ms      81.361ms      30.000us         0.02%      81.365ms      81.365ms             1  
               aten::rand         0.01%      59.000us         7.90%      81.317ms      81.317ms      25.000us         0.01%      81.320ms      81.320ms             1  
           aten::uniform_         7.90%      81.274ms         7.90%      81.274ms      81.274ms      81.307ms        46.65%      81.307ms      81.307ms             1  
           aten::uniform_         7.89%      81.226ms         7.89%      81.226ms      81.226ms      81.258ms        46.63%      81.258ms      81.258ms             1  
                 aten::to         0.00%      15.000us         0.57%       5.910ms       5.910ms      15.000us         0.01%       5.914ms       5.914ms             1  
           aten::_to_copy         0.00%      45.000us         0.57%       5.895ms       5.895ms      32.000us         0.02%       5.899ms       5.899ms             1  
              aten::copy_         0.00%      31.000us         0.57%       5.829ms       5.829ms       5.842ms         3.35%       5.842ms       5.842ms             1  
                 aten::to         0.00%      18.000us         0.56%       5.780ms       5.780ms      18.000us         0.01%       5.784ms       5.784ms             1  
          cudaMemcpyAsync         0.56%       5.767ms         0.56%       5.767ms       5.767ms       0.000us         0.00%       0.000us       0.000us             1  
           aten::_to_copy         0.00%      49.000us         0.56%       5.762ms       5.762ms      36.000us         0.02%       5.766ms       5.766ms             1  
              aten::copy_         0.00%      31.000us         0.55%       5.688ms       5.688ms       5.700ms         3.27%       5.700ms       5.700ms             1  
          cudaMemcpyAsync         0.55%       5.627ms         0.55%       5.627ms       5.627ms       0.000us         0.00%       0.000us       0.000us             1  
             aten::matmul         0.00%      15.000us         0.01%     123.000us     123.000us      10.000us         0.01%       2.472ms       2.472ms             1  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.029s
Self CUDA time total: 174.273ms

torch.utils.bottleneck on 3080Ti

-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
    cudaDeviceSynchronize        48.05%        1.146s        48.05%        1.146s        1.146s       0.000us         0.00%       0.000us       0.000us             1  
    cudaDeviceSynchronize        47.56%        1.134s        47.56%        1.134s        1.134s       0.000us         0.00%       0.000us       0.000us             1  
               aten::rand         0.00%      32.000us         1.88%      44.793ms      44.793ms       4.000us         0.01%       6.000us       6.000us             1  
           aten::uniform_         1.88%      44.740ms         1.88%      44.740ms      44.740ms       1.000us         0.00%       1.000us       1.000us             1  
               aten::rand         0.00%      40.000us         1.84%      44.000ms      44.000ms       4.000us         0.01%       6.000us       6.000us             1  
           aten::uniform_         1.84%      43.941ms         1.84%      43.941ms      43.941ms       1.000us         0.00%       1.000us       1.000us             1  
                 aten::to         0.00%       7.000us         0.34%       8.002ms       8.002ms       3.000us         0.00%       8.032ms       8.032ms             1  
           aten::_to_copy         0.00%      24.000us         0.34%       7.995ms       7.995ms       3.000us         0.00%       8.029ms       8.029ms             1  
              aten::copy_         0.00%      46.000us         0.33%       7.957ms       7.957ms       8.025ms        13.34%       8.025ms       8.025ms             1  
                 aten::to         0.00%       8.000us         0.33%       7.931ms       7.931ms       3.000us         0.00%      52.097ms      52.097ms             1  
           aten::_to_copy         0.00%      25.000us         0.33%       7.923ms       7.923ms       4.000us         0.01%      52.094ms      52.094ms             1  
              aten::copy_         0.00%      46.000us         0.33%       7.883ms       7.883ms      52.089ms        86.62%      52.089ms      52.089ms             1  
          cudaMemcpyAsync         0.33%       7.780ms         0.33%       7.780ms       7.780ms       0.000us         0.00%       0.000us       0.000us             1  
          cudaMemcpyAsync         0.32%       7.707ms         0.32%       7.707ms       7.707ms       0.000us         0.00%       0.000us       0.000us             1  
    cudaStreamSynchronize         0.01%     131.000us         0.01%     131.000us     131.000us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.385s
Self CUDA time total: 60.137ms

Code

import time

import torch

torch.cuda.synchronize()
start = time.time()  # note: this timing also includes the host-to-device copies below

device = "cuda"
mat1 = torch.rand((4096, 4096)).to(device)
mat2 = torch.rand((4096, 4096)).to(device)

def func():
    mat3 = mat1 @ mat2  # result is intentionally discarded

def loop():
    for i in range(200):
        func()

loop()

# Kernel launches are asynchronous; wait for the GPU to finish before
# taking the end timestamp, otherwise mostly launch overhead is measured.
torch.cuda.synchronize()
end = time.time()
print('time.time()', end - start)
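
Since kernel launches are asynchronous, wall-clock timing can mis-count GPU work even with the synchronize calls, so CUDA events may give a cleaner per-card comparison (a minimal sketch reusing mat1 and mat2 from above):

start_evt = torch.cuda.Event(enable_timing=True)
end_evt = torch.cuda.Event(enable_timing=True)

start_evt.record()
for _ in range(200):
    mat3 = mat1 @ mat2
end_evt.record()
torch.cuda.synchronize()  # wait until both events have actually completed
print('cuda events (ms):', start_evt.elapsed_time(end_evt))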

How should I proceed to figure out the cause of this difference?
