Model Parallel
The purpose of this tutorial is to describe a method for parallelizing training when a model is too large to fit on a single GPU. We use the PyTorch framework. The main idea of the approach shown here is fairly simple: different layers of our neural network can be placed on different GPUs.
First, we import the necessary packages. No packages beyond those used for single-GPU training are needed for this model-parallel approach.
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import sys
Now we build our model and define our forward pass:
class SeqNet(nn.Module):
    def __init__(self, input_size, hidden_size1, output_size):
        super(SeqNet, self).__init__()
        self.lin1 = nn.Linear(input_size, hidden_size1).to('cuda:0')
        self.lin2 = nn.Linear(hidden_size1, output_size).to('cuda:1')

    def forward(self, x):
        x = torch.flatten(x, 1)
        x = self.lin1(x.to('cuda:0'))
        x = F.log_softmax(x, dim=1)
        out = self.lin2(x.to('cuda:1'))
        return out
This is where most of the work to parallelize our model is accomplished. We send each layer of our model to a different GPU by calling .to('cuda:0') and .to('cuda:1') as we define the layers. It is also important that each step of the forward pass happens on the appropriate GPU, which we ensure by sending the x tensor to the correct place with the x.to('cuda:0') and x.to('cuda:1') calls.
We can now define our training function:
#!/usr/bin/env python3
# This example trains a sequential neural network whose layers are
# split across two GPUs (model parallelism)
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import sys
class SeqNet(nn.Module):
    def __init__(self, input_size, hidden_size1, output_size):
        super(SeqNet, self).__init__()
        self.lin1 = nn.Linear(input_size, hidden_size1).to('cuda:0')
        self.lin2 = nn.Linear(hidden_size1, output_size).to('cuda:1')

    def forward(self, x):
        x = torch.flatten(x, 1)
        x = self.lin1(x.to('cuda:0'))
        x = F.log_softmax(x, dim=1)
        out = self.lin2(x.to('cuda:1'))
        return out

def train(model, train_loader, loss_function, optimizer, num_epochs):
    for epoch in range(num_epochs):
        running_loss = 0.0
        model.train()
        for i, (images, labels) in enumerate(train_loader):
            images = torch.div(images, 255.)
            optimizer.zero_grad()
            outputs = model(images)
            loss = loss_function(outputs, labels.to('cuda:1'))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        average_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}")
    print("Training finished.")
where the labels for our loss function calculation must be sent to the device corresponding to the output of our model; in this case it is cuda:1.
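If we prefer not to hard-code the device name inside the training loop, one small variation (an illustration, not part of the original script) is to look up the output device from the last layer's parameters and send the labels there:

# Hypothetical variation: derive the output device instead of hard-coding 'cuda:1'
out_device = next(model.lin2.parameters()).device
loss = loss_function(outputs, labels.to(out_device))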
We can now continue our training as usual:
input_size = 784
hidden_size1 = 200
output_size = 10
num_epochs = 10
batch_size = 100
lr = 0.01
if torch.cuda.device_count() < 2:
    sys.exit("A minimum of 2 GPUs must be available to train this model.")
my_net = SeqNet(input_size, hidden_size1, output_size)
optimizer = torch.optim.Adam(my_net.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()
fmnist_train = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())
fmnist_test = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())
fmnist_train_loader = DataLoader(fmnist_train, batch_size=batch_size, shuffle=True)
fmnist_test_loader = DataLoader(fmnist_test, batch_size=batch_size, shuffle=True)
train(my_net, fmnist_train_loader, loss_function, optimizer, num_epochs)
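The test loader defined above is not exercised by this snippet. A minimal evaluation sketch, assuming the same two-GPU layout (the labels are again moved to cuda:1, where the model's output lives), could look like this:

def evaluate(model, test_loader):
    # Rough evaluation sketch; mirrors the preprocessing used in train()
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in test_loader:
            images = torch.div(images, 255.)
            outputs = model(images)                      # forward pass spans cuda:0 and cuda:1
            predictions = outputs.argmax(dim=1)
            correct += (predictions == labels.to('cuda:1')).sum().item()
            total += labels.size(0)
    print(f"Test accuracy: {correct / total:.4f}")

evaluate(my_net, fmnist_test_loader)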
Download the full script used in this example here