Recently I've been learning PyTorch to train models on multiple GPUs. One of the first things I experimented with was DataParallel (even though its use is discouraged), so I constructed some dummy data and a toy model with the following code:
from torch.utils.data import Dataset, DataLoader
import torch
import time


class My_dataset(Dataset):
    def __init__(self):
        self.x = torch.randn((15000, 768))
        # targets shaped (15000, 1) so they match the (batch, 1) model output and MSELoss does not broadcast
        self.y = torch.ones((15000, 1))

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)


class My_Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(768, 76800)
        self.relu = torch.nn.ReLU()
        self.classifier = torch.nn.Linear(76800, 1)

    def forward(self, x):
        return self.classifier(self.relu(self.linear1(x)))


train_dataset = My_dataset()
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, drop_last=False)

model = My_Model()
model = model.cuda()
# model = torch.nn.parallel.DataParallel(model, device_ids=[0, 1])

loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters())

start = time.time()
for i, batch_datasets in enumerate(train_dataloader):
    if i % 100 == 0:
        print(f"i: {i}")
    x, y = batch_datasets
    x = x.cuda()
    y = y.cuda()
    result = model(x)
    loss = loss_fn(result, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
end = time.time()
print(f"time is {end - start}")
I ran the code and it printed: time is 2.7407000064849854
Next, I uncommented model = torch.nn.parallel.DataParallel(model, device_ids=[0,1]), i.e. I trained the model on two GPUs, and it printed: time is 50.398988246917725
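As an aside, I know CUDA kernels are launched asynchronously, so plain time.time() readings might not be perfectly precise. If that matters here, a more careful timing would synchronize the device before reading the clock; a minimal sketch of what I mean (not something I have re-measured yet):

import time
import torch

def timed(fn):
    # Flush pending CUDA work before starting the clock and again before stopping it,
    # so the measurement covers the GPU computation rather than just the kernel launches.
    torch.cuda.synchronize()
    start = time.time()
    fn()
    torch.cuda.synchronize()
    return time.time() - start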
I find it very confusing that training the model on two GPUs is so much slower than training on one.
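For context, my understanding (adapted from the PyTorch multi-GPU tutorial) is that DataParallel does roughly the following on every forward pass; this is a simplified sketch rather than the real implementation, so I did expect some overhead, just not this much:

import torch.nn as nn

def data_parallel_forward(module, inputs, device_ids, output_device=None):
    # Simplified per-batch behaviour of nn.DataParallel (adapted from the PyTorch multi-GPU tutorial)
    if output_device is None:
        output_device = device_ids[0]
    replicas = nn.parallel.replicate(module, device_ids)        # copy the model onto every GPU
    scattered = nn.parallel.scatter(inputs, device_ids)         # split the batch across the GPUs
    replicas = replicas[:len(scattered)]
    outputs = nn.parallel.parallel_apply(replicas, scattered)   # run each chunk on its own GPU
    return nn.parallel.gather(outputs, output_device)           # collect the results back on one GPU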
Additionally, I checked the GPU memory usage: with a single GPU it was 1513MiB / 24268MiB, and with two GPUs (same batch size as the single-GPU run) it was 1785MiB / 24268MiB and 1331MiB / 24268MiB. I don't know whether this memory usage is normal.
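In case reading nvidia-smi isn't the right way to judge that, I assume the allocator's own numbers can also be queried from PyTorch directly, roughly like this (a sketch; memory_allocated/memory_reserved only report what PyTorch has allocated, not the CUDA context itself):

import torch

for device_id in range(torch.cuda.device_count()):
    allocated = torch.cuda.memory_allocated(device_id) / 1024 ** 2   # tensors currently held, in MiB
    reserved = torch.cuda.memory_reserved(device_id) / 1024 ** 2     # caching-allocator pool, in MiB
    print(f"cuda:{device_id}: allocated {allocated:.0f} MiB, reserved {reserved:.0f} MiB")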