Merge pull request #300 from bformby/main
Updating fabric_gpu example notebook
Showing 2 changed files with 182 additions and 10 deletions.
fabric_examples/fablib_api/fabric_all_gpus/pytorch_example.py
108 changes: 108 additions & 0 deletions
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Assuming that we are on a CUDA machine, this should print a CUDA device:
print("Device: ",device)

# --- Load and normalize CIFAR10 ---
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
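# The transform above converts PIL images to tensors in [0, 1] (ToTensor) and
# then rescales each channel to [-1, 1] (Normalize with mean 0.5 and std 0.5).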

batch_size = 4

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
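# CIFAR-10 contains 60,000 32x32 colour images spread across the 10 classes
# above (50,000 training images and 10,000 test images).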

# --- Define a Convolutional Neural Network ---
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
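        # A 32x32 input becomes 28x28 after conv1 (5x5, no padding), 14x14 after
        # pooling, 10x10 after conv2, and 5x5 after the second pooling, which is
        # why the first fully connected layer takes 16 * 5 * 5 input features.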
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net().to(device)

# --- Define a Loss function and optimizer ---
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
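# CrossEntropyLoss expects raw, unnormalized class scores (logits), which is
# why forward() above does not apply a softmax.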

# --- Train the network ---
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')
PATH = './cifar_net.pth'
torch.save(net.state_dict(), PATH)
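# state_dict() saves only the learned parameters (weights and biases); the Net
# class definition itself still has to be available when the checkpoint is reloaded.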

# --- Test the network on the test data ---
dataiter = iter(testloader)
images, labels = next(dataiter)

net = Net()
net.load_state_dict(torch.load(PATH))
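# The checkpoint is reloaded into a fresh model that stays on the CPU, so the
# evaluation below runs on the CPU rather than the GPU used for training.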
outputs = net(images)
_, predicted = torch.max(outputs, 1)

correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in testloader:
        images, labels = data
        # calculate outputs by running images through the network
        outputs = net(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
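# Note: '//' is integer (floor) division, so the accuracy is reported as a
# whole percentage.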