diff --git a/fabric_examples/fablib_api/fabric_all_gpus/fabric_gpu.ipynb b/fabric_examples/fablib_api/fabric_all_gpus/fabric_gpu.ipynb index d4a0a833..d97bac73 100644 --- a/fabric_examples/fablib_api/fabric_all_gpus/fabric_gpu.ipynb +++ b/fabric_examples/fablib_api/fabric_all_gpus/fabric_gpu.ipynb @@ -219,11 +219,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "distro='ubuntu2204'\n", - "version='12.2'\n", + "version='12.6'\n", "architecture='x86_64'\n", "\n", "# install prerequisites\n", @@ -237,6 +239,16 @@ " print(f\"++++ {command}\")\n", " stdout, stderr = node.execute(command)\n", "\n", + "print(\"Installing PyTorch...\")\n", + "commands = [\n", + " 'sudo apt install python3-pip -y',\n", + " 'pip3 install torch',\n", + " 'pip3 install torchvision'\n", + "]\n", + "for command in commands:\n", + " print(f\"++++ {command}\")\n", + " stdout, stderr = node.execute(command)\n", + "\n", "print(f\"Installing CUDA {version}\")\n", "commands = [\n", " f'wget https://developer.download.nvidia.com/compute/cuda/repos/{distro}/{architecture}/cuda-keyring_1.1-1_all.deb',\n", @@ -298,6 +310,13 @@ "print(f\"stdout: {stdout}\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CUDA Hello World Example" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -360,7 +379,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "If you see `Hello World!`, the CUDA program ran successfully. `World!` was computed on the GPU from an array of offsets being summed with the string `Hello `, and the resut was printed to stdout.\n", + "If you see `Hello World!`, the CUDA program ran successfully. `World!` was computed on the GPU from an array of offsets being summed with the string `Hello `, and the result was printed to stdout.\n", "\n", "### Congratulations! You have now successfully run a program on a FABRIC GPU!" 
] @@ -369,18 +388,34 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Cleanup Your Experiment" + "### PyTorch CIFAR10 Classifier Example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's follow the \"Training a Classifier\" tutorial from PyTorch to train an image classifier on the CIFAR10 dataset.\n", + "\n", + "`pytorch_example`\n", + "\n", + "*Source: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html*" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ - "fablib.delete_slice(slice_name)" + "node.upload_file('./pytorch_example.py', 'pytorch_example.py')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, run the Python script to train and test the classifier." ] }, { @@ -388,7 +423,36 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "stdout, stderr = node.execute(\"python3 pytorch_example.py\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you see `Finished Training` followed by the accuracy of the classifier, then the script ran successfully.\n", + "\n", + "### Congratulations! You have now successfully trained a PyTorch classifier on a FABRIC GPU!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup Your Experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fablib.delete_slice(slice_name)" + ] } ], "metadata": { @@ -407,7 +471,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.11.8" } }, "nbformat": 4, diff --git a/fabric_examples/fablib_api/fabric_all_gpus/pytorch_example.py b/fabric_examples/fablib_api/fabric_all_gpus/pytorch_example.py new file mode 100644 index 00000000..b61c43d9 --- /dev/null +++ b/fabric_examples/fablib_api/fabric_all_gpus/pytorch_example.py @@ -0,0 +1,108 @@ +import torch +import torchvision +import torchvision.transforms as transforms +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + +device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') +# Assuming that we are on a CUDA machine, this should print a CUDA device: +print("Device: ",device) + +# --- Load and normalize CIFAR10 --- +transform = transforms.Compose( + [transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + +batch_size = 4 + +trainset = torchvision.datasets.CIFAR10(root='./data', train=True, + download=True, transform=transform) +trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, + shuffle=True, num_workers=2) + +testset = torchvision.datasets.CIFAR10(root='./data', train=False, + download=True, transform=transform) +testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, + shuffle=False, num_workers=2) + +classes = ('plane', 'car', 'bird', 'cat', + 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') + +# --- Define a Convolutional Neural Network --- +class Net(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(3, 6, 5) + self.pool = nn.MaxPool2d(2, 2) + self.conv2 = nn.Conv2d(6, 16, 5) 
+ self.fc1 = nn.Linear(16 * 5 * 5, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = torch.flatten(x, 1) # flatten all dimensions except batch + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +net = Net().to(device) + +# --- Define a Loss function and optimizer --- +criterion = nn.CrossEntropyLoss() +optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) + +# --- Train the network --- +for epoch in range(2): # loop over the dataset multiple times + + running_loss = 0.0 + for i, data in enumerate(trainloader, 0): + # get the inputs; data is a list of [inputs, labels] + inputs, labels = data[0].to(device), data[1].to(device) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # print statistics + running_loss += loss.item() + if i % 2000 == 1999: # print every 2000 mini-batches + print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}') + running_loss = 0.0 + +print('Finished Training') +PATH = './cifar_net.pth' +torch.save(net.state_dict(), PATH) + +# --- Test the network on the test data --- +dataiter = iter(testloader) +images, labels = next(dataiter) + +net = Net() +net.load_state_dict(torch.load(PATH)) +outputs = net(images) +_, predicted = torch.max(outputs, 1) + +correct = 0 +total = 0 +# since we're not training, we don't need to calculate the gradients for our outputs +with torch.no_grad(): + for data in testloader: + images, labels = data + # calculate outputs by running images through the network + outputs = net(images) + # the class with the highest energy is what we choose as prediction + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + +print(f'Accuracy of the 
network on the 10000 test images: {100 * correct // total} %') \ No newline at end of file