Hardware specs:
- CPU: AMD Ryzen Threadripper 3970X
- GPU: 2x RTX 4090
- OS: Ubuntu 22.04 (PyTorch Docker image)
- RAM: 128GB
nvidia-smi
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07 Driver Version: 535.161.07 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 4090 Off | 00000000:21:00.0 Off | Off |
| 0% 46C P8 37W / 450W | 269MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce RTX 4090 Off | 00000000:49:00.0 Off | Off |
| 0% 49C P8 28W / 450W | 11MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Jul_11_02:20:44_PDT_2023
Cuda compilation tools, release 12.2, V12.2.128
Build cuda_12.2.r12.2/compiler.33053471_0
For some reason, training on this machine's GPUs is noticeably slower than on my other machine, which has a 3080 Ti and a 13600K (running under WSL 2). Even when I restrict the comparison to a single 4090, the 3080 Ti machine is much faster than the 4090 machine.
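For context, the single-4090 comparison can be pinned to one card by hiding the second GPU before PyTorch initializes CUDA. This is only a minimal sketch of what I mean, assuming CUDA_VISIBLE_DEVICES is the mechanism used; otherwise the Trainer would pick up both visible cards by default:
import os
# Hide the second 4090 before torch initializes CUDA so only one device is visible.
# Equivalent to launching with CUDA_VISIBLE_DEVICES=0 on the command line.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
print(torch.cuda.device_count())  # expected to print 1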
The training script, for reference (written by following a DataCamp tutorial):
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.profiler import profile, record_function, ProfilerActivity
import torch
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# Acquire the training data from Hugging Face
DATA_NAME = "yahoo_answers_qa"
yahoo_answers_qa = load_dataset(DATA_NAME)
yahoo_answers_qa = yahoo_answers_qa["train"].train_test_split(test_size=100, train_size=1000)
# We prefix every question with an instruction for the model
prefix = "Please answer this question: "
# Define the preprocessing function
def preprocess_function(examples):
    """Add the prefix to the questions, tokenize the text, and set the labels"""
    # The "inputs" are the tokenized questions:
    inputs = [prefix + doc for doc in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    # The "labels" are the tokenized answers:
    labels = tokenizer(text_target=examples["answer"],
                       max_length=512,
                       truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# Map the preprocessing function across our dataset
tokenized_dataset = yahoo_answers_qa.map(preprocess_function, batched=True)
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # decode preds and labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 2 # small batch size due to limited resources on the 3080ti machine
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 1
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    push_to_hub=False
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
torch.cuda.synchronize()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True) as prof:
    trainer.train()
print(prof.key_averages().table(row_limit=15, sort_by="self_cuda_time_total"))
I tried using PyTorch's built-in profilers to figure out what could be causing this, but I couldn't make much sense of the output (I tried both torch.profiler and torch.utils.bottleneck).
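In case it helps with interpreting the output, here is a variant of the profiling block (a sketch reusing the trainer defined above) that also sorts by CPU time and dumps a timeline viewable in chrome://tracing or Perfetto:
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    trainer.train()
# Sort by CPU time as well, since kernel-launch overhead shows up on the CPU side
print(prof.key_averages().table(row_limit=15, sort_by="self_cpu_time_total"))
# Export a per-kernel timeline that can be opened in chrome://tracing or Perfetto
prof.export_chrome_trace("trace.json")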
Here are the results of the various runs on both machines:
torch.profiler on 4090
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 4441.05 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 3872.25 examples/s]
STAGE:2024-03-21 02:18:26 270203:270203 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
{'loss': 4.1186, 'learning_rate': 0.0, 'epoch': 1.0}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:48<00:00, 10.12it/s]
{'eval_loss': 3.9080231189727783, 'eval_rouge1': 0.13428924748788773, 'eval_rouge2': 0.02317430268607329, 'eval_rougeL': 0.11411167280744719, 'eval_rougeLsum': 0.11964790797296734, 'eval_runtime': 6.5172, 'eval_samples_per_second': 15.344, 'eval_steps_per_second': 3.836, 'epoch': 1.0}
{'train_runtime': 56.4093, 'train_samples_per_second': 17.728, 'train_steps_per_second': 8.864, 'train_loss': 4.1186474609375, 'epoch': 1.0}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:56<00:00, 8.86it/s]
STAGE:2024-03-21 02:19:30 270203:270203 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-03-21 02:19:33 270203:270203 ActivityProfilerController.cpp:322] Completed Stage: Post Processing
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg CPU Mem Self CPU Mem CUDA Mem Self CUDA Mem # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
aten::mm 11.72% 4.517s 17.70% 6.821s 39.563us 1.492s 25.09% 1.673s 9.703us 0 b 0 b 151.09 Gb 151.09 Gb 172400
aten::_foreach_mul_ 0.49% 189.232ms 0.61% 235.115ms 117.558us 789.891ms 13.28% 792.785ms 396.392us 0 b 0 b 0 b 0 b 2000
void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 543.324ms 9.14% 543.324ms 155.235us 0 b 0 b 0 b 0 b 3500
aten::_foreach_addcdiv_ 0.28% 109.787ms 0.32% 123.529ms 123.529us 535.961ms 9.01% 536.923ms 536.923us 0 b 0 b 0 b 0 b 1000
void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 535.961ms 9.01% 535.961ms 267.981us 0 b 0 b 0 b 0 b 2000
aten::bmm 4.20% 1.619s 5.53% 2.132s 32.012us 418.858ms 7.04% 462.192ms 6.940us 0 b 0 b 10.87 Gb 10.87 Gb 66600
aten::_foreach_addcmul_ 0.27% 104.808ms 0.32% 121.843ms 121.843us 403.754ms 6.79% 404.620ms 404.620us 0 b 0 b 0 b 0 b 1000
void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 403.754ms 6.79% 403.754ms 201.877us 0 b 0 b 0 b 0 b 2000
aten::_foreach_lerp_ 0.09% 34.689ms 0.14% 52.157ms 52.157us 402.333ms 6.76% 402.989ms 402.989us 0 b 0 b 0 b 0 b 1000
void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 402.333ms 6.76% 402.333ms 201.167us 0 b 0 b 0 b 0 b 2000
cudaLaunchKernel 17.51% 6.747s 17.51% 6.747s 8.644us 370.963ms 6.24% 370.963ms 0.475us -936 b -936 b -71.47 Mb -71.47 Mb 780540
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 275.655ms 4.63% 275.655ms 3.952us 0 b 0 b 0 b 0 b 69751
aten::_foreach_sqrt 0.37% 142.381ms 1.67% 644.885ms 644.885us 273.307ms 4.60% 274.641ms 274.641us 0 b 0 b 112.70 Gb 0 b 1000
void at::native::(anonymous namespace)::multi_tensor... 0.00% 0.000us 0.00% 0.000us 0.000us 273.307ms 4.60% 273.307ms 136.654us 0 b 0 b 0 b 0 b 2000
aten::_foreach_add_ 0.56% 217.295ms 2.39% 921.531ms 460.765us 272.497ms 4.58% 273.238ms 136.619us 8 b -171.88 Kb 0 b 0 b 2000
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 38.541s
Self CUDA time total: 5.947s
I don't have profiler results for the 3080 Ti machine because the process gets killed (OOM), but I do have the output showing that training is indeed faster there (a bounded-profiling idea I might try is sketched after the log below):
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 6551.93 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 6014.12 examples/s]
STAGE:2024-03-20 16:38:27 1468:1468 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
{'loss': 4.1458, 'grad_norm': 2.1316089630126953, 'learning_rate': 0.0, 'epoch': 1.0}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:20<00:00, 23.12it/s]
UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.
warnings.warn(
{'eval_loss': 3.9349136352539062, 'eval_rouge1': 0.153426813681583, 'eval_rouge2': 0.02742618897451911, 'eval_rougeL': 0.1270918468217938, 'eval_rougeLsum': 0.1377312505422213, 'eval_runtime': 3.9379, 'eval_samples_per_second': 25.395, 'eval_steps_per_second': 6.349, 'epoch': 1.0}
{'train_runtime': 25.2432, 'train_samples_per_second': 39.615, 'train_steps_per_second': 19.807, 'train_loss': 4.1458447265625, 'epoch': 1.0}
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:25<00:00, 19.81it/s]
STAGE:2024-03-20 16:39:03 1468:1468 ActivityProfilerController.cpp:320] Completed Stage: Collection
Killed
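One thing I may try next to get a profile on the 3080 Ti box without the kill is bounding how many steps the profiler actually records and skipping memory profiling. This is only a sketch; ProfStepCallback is a helper I'd add for this, it is not part of the original script:
from transformers import TrainerCallback

class ProfStepCallback(TrainerCallback):
    """Advance the torch profiler schedule once per optimizer step."""
    def __init__(self, prof):
        self.prof = prof
    def on_step_end(self, args, state, control, **kwargs):
        self.prof.step()

# Record only ~20 active steps instead of the whole run, without memory profiling.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             schedule=torch.profiler.schedule(wait=5, warmup=5, active=20, repeat=1),
             profile_memory=False) as prof:
    trainer.add_callback(ProfStepCallback(prof))
    trainer.train()
print(prof.key_averages().table(row_limit=15, sort_by="self_cuda_time_total"))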
I was able to run the bottleneck module on the 3080 Ti machine using a very simple matrix-multiplication loop. Below I'm showing only the autograd profiler output (CUDA mode) for both machines.
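For reference, torch.utils.bottleneck is invoked as a module wrapper around the script; matmul_test.py here is just a placeholder name for the script listed under "Code" below:
python -m torch.utils.bottleneck matmul_test.py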
torch.utils.bottleneck on 4090
------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
cudaDeviceSynchronize 43.12% 443.696ms 43.12% 443.696ms 443.696ms 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceSynchronize 39.95% 411.095ms 39.95% 411.095ms 411.095ms 0.000us 0.00% 0.000us 0.000us 1
aten::rand 0.01% 63.000us 7.91% 81.361ms 81.361ms 30.000us 0.02% 81.365ms 81.365ms 1
aten::rand 0.01% 59.000us 7.90% 81.317ms 81.317ms 25.000us 0.01% 81.320ms 81.320ms 1
aten::uniform_ 7.90% 81.274ms 7.90% 81.274ms 81.274ms 81.307ms 46.65% 81.307ms 81.307ms 1
aten::uniform_ 7.89% 81.226ms 7.89% 81.226ms 81.226ms 81.258ms 46.63% 81.258ms 81.258ms 1
aten::to 0.00% 15.000us 0.57% 5.910ms 5.910ms 15.000us 0.01% 5.914ms 5.914ms 1
aten::_to_copy 0.00% 45.000us 0.57% 5.895ms 5.895ms 32.000us 0.02% 5.899ms 5.899ms 1
aten::copy_ 0.00% 31.000us 0.57% 5.829ms 5.829ms 5.842ms 3.35% 5.842ms 5.842ms 1
aten::to 0.00% 18.000us 0.56% 5.780ms 5.780ms 18.000us 0.01% 5.784ms 5.784ms 1
cudaMemcpyAsync 0.56% 5.767ms 0.56% 5.767ms 5.767ms 0.000us 0.00% 0.000us 0.000us 1
aten::_to_copy 0.00% 49.000us 0.56% 5.762ms 5.762ms 36.000us 0.02% 5.766ms 5.766ms 1
aten::copy_ 0.00% 31.000us 0.55% 5.688ms 5.688ms 5.700ms 3.27% 5.700ms 5.700ms 1
cudaMemcpyAsync 0.55% 5.627ms 0.55% 5.627ms 5.627ms 0.000us 0.00% 0.000us 0.000us 1
aten::matmul 0.00% 15.000us 0.01% 123.000us 123.000us 10.000us 0.01% 2.472ms 2.472ms 1
------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 1.029s
Self CUDA time total: 174.273ms
torch.utils.bottleneck on 3080Ti
------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
cudaDeviceSynchronize 48.05% 1.146s 48.05% 1.146s 1.146s 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceSynchronize 47.56% 1.134s 47.56% 1.134s 1.134s 0.000us 0.00% 0.000us 0.000us 1
aten::rand 0.00% 32.000us 1.88% 44.793ms 44.793ms 4.000us 0.01% 6.000us 6.000us 1
aten::uniform_ 1.88% 44.740ms 1.88% 44.740ms 44.740ms 1.000us 0.00% 1.000us 1.000us 1
aten::rand 0.00% 40.000us 1.84% 44.000ms 44.000ms 4.000us 0.01% 6.000us 6.000us 1
aten::uniform_ 1.84% 43.941ms 1.84% 43.941ms 43.941ms 1.000us 0.00% 1.000us 1.000us 1
aten::to 0.00% 7.000us 0.34% 8.002ms 8.002ms 3.000us 0.00% 8.032ms 8.032ms 1
aten::_to_copy 0.00% 24.000us 0.34% 7.995ms 7.995ms 3.000us 0.00% 8.029ms 8.029ms 1
aten::copy_ 0.00% 46.000us 0.33% 7.957ms 7.957ms 8.025ms 13.34% 8.025ms 8.025ms 1
aten::to 0.00% 8.000us 0.33% 7.931ms 7.931ms 3.000us 0.00% 52.097ms 52.097ms 1
aten::_to_copy 0.00% 25.000us 0.33% 7.923ms 7.923ms 4.000us 0.01% 52.094ms 52.094ms 1
aten::copy_ 0.00% 46.000us 0.33% 7.883ms 7.883ms 52.089ms 86.62% 52.089ms 52.089ms 1
cudaMemcpyAsync 0.33% 7.780ms 0.33% 7.780ms 7.780ms 0.000us 0.00% 0.000us 0.000us 1
cudaMemcpyAsync 0.32% 7.707ms 0.32% 7.707ms 7.707ms 0.000us 0.00% 0.000us 0.000us 1
cudaStreamSynchronize 0.01% 131.000us 0.01% 131.000us 131.000us 0.000us 0.00% 0.000us 0.000us 1
------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.385s
Self CUDA time total: 60.137ms
Code
from torch.profiler import profile, record_function, ProfilerActivity
import torch
import torch.nn as nn
import torch.nn.functional as F
import time

torch.cuda.synchronize()
start = time.time()
device = "cuda"
mat1 = torch.rand((4096, 4096)).to(device)
mat2 = torch.rand((4096, 4096)).to(device)

def func():
    mat3 = mat1 @ mat2

def loop():
    for i in range(200):
        func()

loop()
end = time.time()
print('time.time()', end - start)
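One caveat I'm aware of with this timing: the matmul launches are asynchronous, so reading time.time() without another synchronize mostly measures kernel-launch overhead rather than the GPU work itself. A variant using CUDA events (a sketch reusing mat1/mat2 from the script above):
# Time the 200 matmuls with CUDA events so the GPU work is actually included.
start_evt = torch.cuda.Event(enable_timing=True)
end_evt = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_evt.record()
for _ in range(200):
    mat3 = mat1 @ mat2
end_evt.record()
torch.cuda.synchronize()
print('cuda events (ms):', start_evt.elapsed_time(end_evt))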
How should I proceed to figure out the cause of this difference?