From dc793308040e47b030d5fcb9bd17e9bbb7e42f3a Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Wed, 7 Aug 2024 15:41:23 -0700
Subject: [PATCH 01/23] commit change

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index eccba856fd..c16807aa9d 100644
--- a/setup.py
+++ b/setup.py
@@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str):
     'tqdm>=4.62.3,<5',
     'torchmetrics>=1.4.0.post0,<1.4.1',
     'torch_optimizer>=0.3.0,<0.4',
-    'torchvision>=0.13.1,<0.18.2',
-    'torch>=2.1.2,<2.3.2',
+    'torchvision>=0.14.0,<0.19.1',
+    'torch>=2.2.0,<2.4.1',
     'requests>=2.26.0,<3',
     'numpy>=1.21.5,<2.1.0',
     'psutil>=5.8.0,<7',

From 32e1eed98b4f2593dddaff8245ac3fd70b128ea1 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Thu, 8 Aug 2024 17:22:35 -0700
Subject: [PATCH 02/23] commit change

---
 docker/generate_build_matrix.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index 74d9c7fed4..d8a665322a 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -19,16 +19,16 @@
 import yaml
 
 PRODUCTION_PYTHON_VERSION = '3.11'
-PRODUCTION_PYTORCH_VERSION = '2.3.1'
+PRODUCTION_PYTORCH_VERSION = '2.4.0'
 
 
 def _get_torchvision_version(pytorch_version: str):
+    if pytorch_version == '2.4.0':
+        return '0.19.0'
     if pytorch_version == '2.3.1':
         return '0.18.1'
     if pytorch_version == '2.2.2':
         return '0.17.2'
-    if pytorch_version == '2.1.2':
-        return '0.16.2'
     raise ValueError(f'Invalid pytorch_version: {pytorch_version}')
 
 
@@ -42,12 +42,12 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/
     if not use_cuda:
         return ''
+    if pytorch_version == '2.4.0':
+        return '12.1.4'
     if pytorch_version == '2.3.1':
         return '12.1.1'
     if pytorch_version == '2.2.2':
         return '12.1.1'
-    if pytorch_version == '2.1.2':
-        return '12.1.1'
     raise ValueError(f'Invalid pytorch_version: {pytorch_version}')
 
 
@@ -167,7 +167,7 @@ def _write_table(table_tag: str, table_contents: str):
 
 
 def _main():
-    python_pytorch_versions = [('3.11', '2.3.1'), ('3.11', '2.2.2'), ('3.10', '2.1.2')]
+    python_pytorch_versions = [('3.11', '2.4.0'), ('3.11', '2.3.1'), ('3.11', '2.2.2')]
     cuda_options = [True, False]
     stages = ['pytorch_stage']
     interconnects = ['mellanox', 'EFA']  # mellanox is default, EFA needed for AWS

From 092347499fe4488e5c3c585cee500fdd4ba085b1 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Thu, 8 Aug 2024 17:26:23 -0700
Subject: [PATCH 03/23] commit change

---
 docker/README.md         |  12 ++--
 docker/build_matrix.yaml | 143 ++++++++++++++------------------------
 2 files changed, 58 insertions(+), 97 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index a0514ecb3d..1b68cdec39 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -30,15 +30,15 @@ To install composer, once inside the image, run `pip install mosaicml`.
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                              |
 |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`          |
-| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`  |
-| Ubuntu 20.04   | Base     | 2.3.1             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04`        |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.1.4 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04`          |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.1.4 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04-aws`  |
+| Ubuntu 20.04   | Base     | 2.4.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04`        |
+| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                     |
+| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                 |
+| Ubuntu 20.04   | Base     | 2.3.1             | cpu                 | 3.11             | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04`                                       |
 | Ubuntu 20.04   | Base     | 2.2.2             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04`                                     |
 | Ubuntu 20.04   | Base     | 2.2.2             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws`                                 |
 | Ubuntu 20.04   | Base     | 2.2.2             | cpu                 | 3.11             | `mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04`                                       |
-| Ubuntu 20.04   | Base     | 2.1.2             | 12.1.1 (Infiniband) | 3.10             | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04`                                     |
-| Ubuntu 20.04   | Base     | 2.1.2             | 12.1.1 (EFA)        | 3.10             | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws`                                 |
-| Ubuntu 20.04   | Base     | 2.1.2             | cpu                 | 3.10             | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04`                                       |
 
 **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws`
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index ee74d12309..6f4f14ad76 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -1,79 +1,53 @@
 # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT!
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-3-1-cu121
+  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
+  CUDA_VERSION: 12.1.4
+  IMAGE_NAME: torch-2-4-0-cu121
   MOFED_VERSION: latest-23.10
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-3-1-cu121-aws
+  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
+  CUDA_VERSION: 12.1.4
+  IMAGE_NAME: torch-2-4-0-cu121-aws
   MOFED_VERSION: ''
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-3-1-cpu
+  IMAGE_NAME: torch-2-4-0-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest_cpu
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-2-2-cu121
+  IMAGE_NAME: torch-2-3-1-cu121
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -92,15 +66,15 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-2-2-cu121-aws
+  IMAGE_NAME: torch-2-3-1-cu121-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -119,29 +93,29 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-2-2-cpu
+  IMAGE_NAME: torch-2-3-1-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-1-2-cu121
+  IMAGE_NAME: torch-2-2-2-cu121
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -157,18 +131,18 @@
     brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
     brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
     brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
-  PYTHON_VERSION: '3.10'
+  PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.1.2
+  PYTORCH_VERSION: 2.2.2
   TAGS:
-  - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
+  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.16.2
+  TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-1-2-cu121-aws
+  IMAGE_NAME: torch-2-2-2-cu121-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -184,57 +158,44 @@
     brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
     brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
     brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
-  PYTHON_VERSION: '3.10'
+  PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.1.2
+  PYTORCH_VERSION: 2.2.2
   TAGS:
-  - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws
+  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.16.2
+  TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-1-2-cpu
+  IMAGE_NAME: torch-2-2-2-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
-  PYTHON_VERSION: '3.10'
+  PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.1.2
+  PYTORCH_VERSION: 2.2.2
   TAGS:
-  - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+  - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.16.2
+  TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
+  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
-  CUDA_VERSION: 12.1.1
+  CUDA_VERSION: 12.1.4
   IMAGE_NAME: composer-0-23-5
   MOFED_VERSION: latest-23.10
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
   - mosaicml/composer:0.23.5
   - mosaicml/composer:latest
   TARGET: composer_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
@@ -245,9 +206,9 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
   - mosaicml/composer:0.23.5_cpu
   - mosaicml/composer:latest_cpu
   TARGET: composer_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0

From eb498932f6a1edb5175bb7e730c0557fb9f5a0bc Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Thu, 8 Aug 2024 17:27:10 -0700
Subject: [PATCH 04/23] commit change

---
 docker/generate_build_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index d8a665322a..cb041d39d6 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -43,7 +43,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     if not use_cuda:
         return ''
     if pytorch_version == '2.4.0':
-        return '12.1.4'
+        return '12.4.0'
     if pytorch_version == '2.3.1':
         return '12.1.1'
     if pytorch_version == '2.2.2':

From e0b3c63f03c44fc9e9aed1397ce43c1fc0ab54a8 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Thu, 8 Aug 2024 17:27:14 -0700
Subject: [PATCH 05/23] commit change

---
 docker/README.md         |  4 ++--
 docker/build_matrix.yaml | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index 1b68cdec39..bd4284b4de 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -30,8 +30,8 @@ To install composer, once inside the image, run `pip install mosaicml`.
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                              |
 |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.1.4 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04`          |
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.1.4 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04-aws`  |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.0 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04`          |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.0 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws`  |
 | Ubuntu 20.04   | Base     | 2.4.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04`        |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                     |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                 |
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index 6f4f14ad76..f23ae70bb4 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -1,8 +1,8 @@
 # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT!
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.4
-  IMAGE_NAME: torch-2-4-0-cu121
+  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.0
+  IMAGE_NAME: torch-2-4-0-cu124
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -10,14 +10,14 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
-  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.4
-  IMAGE_NAME: torch-2-4-0-cu121-aws
+  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.0
+  IMAGE_NAME: torch-2-4-0-cu124-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -25,7 +25,7 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
@@ -181,9 +181,9 @@
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
+  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
-  CUDA_VERSION: 12.1.4
+  CUDA_VERSION: 12.4.0
   IMAGE_NAME: composer-0-23-5
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''

From 1ce6cffcfb0e797168e4cdf3e67eb14ad7e154e9 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Fri, 9 Aug 2024 09:52:07 -0700
Subject: [PATCH 06/23] commit change

---
 docker/README.md                |  4 ++--
 docker/build_matrix.yaml        | 20 ++++++++++----------
 docker/generate_build_matrix.py |  4 +++-
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index bd4284b4de..0a6bf5aac0 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -30,8 +30,8 @@ To install composer, once inside the image, run `pip install mosaicml`.
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                              |
 |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.0 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04`          |
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.0 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws`  |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.5.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04`          |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.5.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04-aws`  |
 | Ubuntu 20.04   | Base     | 2.4.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04`        |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                     |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                 |
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index f23ae70bb4..f4c09ceb1a 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -1,8 +1,8 @@
 # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT!
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.4.0
-  IMAGE_NAME: torch-2-4-0-cu124
+  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.5.1
+  IMAGE_NAME: torch-2-4-0-cu125
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -10,14 +10,14 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
-  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.4.0
-  IMAGE_NAME: torch-2-4-0-cu124-aws
+  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.5.1
+  IMAGE_NAME: torch-2-4-0-cu125-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -25,7 +25,7 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
@@ -181,9 +181,9 @@
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
+  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
-  CUDA_VERSION: 12.4.0
+  CUDA_VERSION: 12.5.1
   IMAGE_NAME: composer-0-23-5
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index cb041d39d6..ef587e6bc2 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -35,6 +35,8 @@ def _get_torchvision_version(pytorch_version: str):
 def _get_base_image(cuda_version: str):
     if not cuda_version:
         return 'ubuntu:20.04'
+    if cuda_version == '12.5.1':
+        return f'nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04'
     return f'nvidia/cuda:{cuda_version}-cudnn8-devel-ubuntu20.04'
 
 
@@ -43,7 +45,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     if not use_cuda:
         return ''
     if pytorch_version == '2.4.0':
-        return '12.4.0'
+        return '12.5.1'
     if pytorch_version == '2.3.1':
         return '12.1.1'
     if pytorch_version == '2.2.2':

From 76d7505e2f8b149c15b1966b9f221bc3b44f7079 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Fri, 9 Aug 2024 10:32:09 -0700
Subject: [PATCH 07/23] commit change

---
 docker/Dockerfile | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 970af2f1ef..9e86c08fd3 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -189,9 +189,15 @@ ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION}
 RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \
         CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \
+        if [ "${PYTORCH_VERSION}" != "2.4.0" ]; then \
         pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \
             torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \
             torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \
+        else \
+        pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/${CUDA_VERSION} \
+            torch \
+            torchvision ; \
+        fi ; \
     else \
         pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \
             torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \

From 5669451fdfb5675ad201d9800ed88e0cb23f8a59 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Fri, 9 Aug 2024 10:51:37 -0700
Subject: [PATCH 08/23] commit change

---
 docker/README.md                |  4 ++--
 docker/build_matrix.yaml        | 20 ++++++++++----------
 docker/generate_build_matrix.py |  6 +++---
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index 0a6bf5aac0..09dd2591f5 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -30,8 +30,8 @@ To install composer, once inside the image, run `pip install mosaicml`.
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                              |
 |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.5.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04`          |
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.5.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04-aws`  |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04`          |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws`  |
 | Ubuntu 20.04   | Base     | 2.4.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04`        |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                     |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                 |
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index f4c09ceb1a..2fb084a78b 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -1,8 +1,8 @@
 # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT!
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
-  CUDA_VERSION: 12.5.1
-  IMAGE_NAME: torch-2-4-0-cu125
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.1
+  IMAGE_NAME: torch-2-4-0-cu124
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -10,14 +10,14 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
-  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
-  CUDA_VERSION: 12.5.1
-  IMAGE_NAME: torch-2-4-0-cu125-aws
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.1
+  IMAGE_NAME: torch-2-4-0-cu124-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -25,7 +25,7 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
@@ -181,9 +181,9 @@
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
-  CUDA_VERSION: 12.5.1
+  CUDA_VERSION: 12.4.1
   IMAGE_NAME: composer-0-23-5
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index ef587e6bc2..a1cf5bca3b 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -35,8 +35,8 @@ def _get_torchvision_version(pytorch_version: str):
 def _get_base_image(cuda_version: str):
     if not cuda_version:
         return 'ubuntu:20.04'
-    if cuda_version == '12.5.1':
-        return f'nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04'
+    if cuda_version == '12.4.1':
+        return f'nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04'
     return f'nvidia/cuda:{cuda_version}-cudnn8-devel-ubuntu20.04'
 
 
@@ -45,7 +45,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     if not use_cuda:
         return ''
     if pytorch_version == '2.4.0':
-        return '12.5.1'
+        return '12.4.1'
     if pytorch_version == '2.3.1':
         return '12.1.1'
     if pytorch_version == '2.2.2':

From 4c01abbab132cc1c9e15fa7e4372e9af845fb0ef Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Fri, 9 Aug 2024 15:46:21 -0400
Subject: [PATCH 09/23] fix dockerfile

---
 docker/Dockerfile | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 9e86c08fd3..970af2f1ef 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -189,15 +189,9 @@ ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION}
 RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \
         CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \
-        if [ "${PYTORCH_VERSION}" != "2.4.0" ]; then \
         pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \
             torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \
             torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \
-        else \
-        pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/${CUDA_VERSION} \
-            torch \
-            torchvision ; \
-        fi ; \
     else \
         pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \
             torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \

From 5f30e2d270db38a699dbf2562f8cce9a2fdbbb0f Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Fri, 9 Aug 2024 16:14:38 -0400
Subject: [PATCH 10/23] remove many issues

---
 composer/trainer/_patch_pytorch.py    |  6 ++++++
 composer/utils/checkpoint.py          | 24 +++++++++++++++---------
 tests/trainer/test_fsdp_checkpoint.py |  8 +++-----
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py
index 2c27118090..24ba86a2cb 100644
--- a/composer/trainer/_patch_pytorch.py
+++ b/composer/trainer/_patch_pytorch.py
@@ -133,6 +133,12 @@ def patch_pytorch():
         _MeshEnv.create_child_mesh = create_child_mesh
         DeviceMesh.__getitem__ = device_mesh__getitem__
 
+    elif version.parse(torch.__version__) < version.parse('2.4.1'):
+        # Monkey patch for torch < 2.4.1, i.e., torch == 2.4.0
+
+        # No monkeypatches!
+        pass
+
 
 def build_metadata(
     self,

diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py
index 56b13fcac6..ace6dedad4 100644
--- a/composer/utils/checkpoint.py
+++ b/composer/utils/checkpoint.py
@@ -692,14 +692,19 @@ def load_sharded_checkpoint(
         # Ensure state exists
         state_dict['state'] = state_dict.get('state', {})
 
-        # dist_cp.load breaks unless the specified state_dict supports `load_state_dict`
-        # See: https://github.com/pytorch/pytorch/issues/125096
-        dist_cp.load_state_dict(
-            state_dict=state_dict,
-            storage_reader=storage_reader,
-            planner=state.fsdp_config.load_planner,
-            no_dist=(not dist.is_initialized()),
-        )
+        if version.parse(torch.__version__) >= version.parse('2.4.0'):
+            dist_cp.load(
+                state_dict=state_dict,
+                storage_reader=storage_reader,
+                planner=state.fsdp_config.load_planner,
+            )
+        else:
+            dist_cp.load_state_dict(
+                state_dict=state_dict,
+                storage_reader=storage_reader,
+                planner=state.fsdp_config.load_planner,
+                no_dist=(not dist.is_initialized()),
+            )
 
         log.info(f'Loaded state dict')
         state.load_state_dict(
@@ -1158,7 +1163,8 @@ def _save_checkpoint(
         if expect_file:
             if version.parse(torch.__version__) >= version.parse('2.3.0'):
                 save_planner = state.fsdp_config.save_planner
-                if save_planner is None:
+                if version.parse(torch.__version__) < version.parse('2.4.0') and save_planner is None:
+                    # Dedup is only broken on <2.4
                     from composer.trainer._patch_pytorch import SavePlannerWithDedupFix
 
                     save_planner = SavePlannerWithDedupFix()
diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py
index a59e60172a..8d677e3cc7 100644
--- a/tests/trainer/test_fsdp_checkpoint.py
+++ b/tests/trainer/test_fsdp_checkpoint.py
@@ -315,12 +315,10 @@ def test_fsdp_full_state_dict_load(
     use_tp: bool,
    use_hsdp: bool,
 ):
-    if use_hsdp:
-        pytest.xfail('Known PyTorch issue with HSDP, waiting for pytorch patch')
+    if use_hsdp and version.parse(torch.__version__) < version.parse('2.4.0'):
+        pytest.xfail('HSDP requires torch 2.4.0 or later')
     if use_tp:
         pytest.skip('TP on PyTorch 2.3 has full state dict issues.')
-    if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'):
-        pytest.skip('HSDP and TP require torch 2.3.0 or later')
     if autoresume:
         run_name = 'my-cool-autoresume-run'
     else:
@@ -861,7 +859,7 @@ def test_fsdp_partitioned_state_dict_load(
         run_name = None
 
     if use_remote:
-        save_folder = f's3://{s3_bucket}/{s3_ephemeral_prefix}/checkpoints/{{run_name}}'
+        save_folder = f's3://{s3_bucket}/{s3_ephemeral_prefix}/checkpoints/{{run_name}}'f
     else:
         tmp_paths = dist.all_gather_object(os.path.abspath(tmp_path))
         save_folder = os.path.join(tmp_paths[0], 'checkpoints', '{run_name}')

From fe7def5876e236d72998e8e98916b68f0bb3d9b0 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Fri, 9 Aug 2024 16:16:20 -0400
Subject: [PATCH 11/23] strip magic mock

---
 composer/core/state.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/composer/core/state.py b/composer/core/state.py
index cbd7fc41db..7528da97a9 100644
--- a/composer/core/state.py
+++ b/composer/core/state.py
@@ -1437,7 +1437,8 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True):
                 # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing
                 # errors) before discarding the output. Accordingly, we mock the state dict.
# See: https://github.com/pytorch/pytorch/issues/125177 - optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict + if version.parse(torch.__version__) < version.parse('2.4.0'): + optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict set_optimizer_state_dict( model=self.model, optimizers=optimizer, From 55b642fee40b77d02e5a6ead74830e8c2d9956c6 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 16:19:50 -0400 Subject: [PATCH 12/23] fix gating --- composer/core/state.py | 16 ++++++++++++---- tests/trainer/test_fsdp_checkpoint.py | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/composer/core/state.py b/composer/core/state.py index 7528da97a9..7c43473ace 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -979,7 +979,9 @@ def get_model_state_dict(self) -> dict[str, Any]: Returns: dict[str, Any]: The state dict for the model. """ - if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): + if version.parse(torch.__version__) >= version.parse('2.4.0') or ( + version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() + ): from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict if self.fsdp_state_dict_type not in [None, 'full', 'sharded']: raise NotImplementedError( @@ -1017,7 +1019,9 @@ def get_optim_state_dict(self) -> dict[str, Any]: Returns: dict[str, Any]: The state dict for the optimizer. """ - if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): + if version.parse(torch.__version__) >= version.parse('2.4.0') or ( + version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() + ): from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict if self.fsdp_state_dict_type not in [None, 'full', 'sharded']: raise NotImplementedError( @@ -1327,7 +1331,9 @@ def load_model_state( model_on_rank = state_dict['model'] is not None if model_on_rank: - if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): + if version.parse(torch.__version__) >= version.parse('2.4.0') or ( + version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() + ): from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict try: set_model_state_dict( @@ -1430,7 +1436,9 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True): continue optim_state_dict = serialized_value[type(optimizer).__qualname__] if serialized_value is not None else None - if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): + if version.parse(torch.__version__) >= version.parse('2.4.0') or ( + version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() + ): from torch.distributed.checkpoint.state_dict import StateDictOptions, set_optimizer_state_dict # optim_state_dict is `None` on non-zero ranks when loading FSDP monolith diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 8d677e3cc7..76c3b4c5bb 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -859,7 +859,7 @@ def test_fsdp_partitioned_state_dict_load( run_name = None if use_remote: - save_folder = f's3://{s3_bucket}/{s3_ephemeral_prefix}/checkpoints/{{run_name}}'f + save_folder = f's3://{s3_bucket}/{s3_ephemeral_prefix}/checkpoints/{{run_name}}' else: tmp_paths 
= dist.all_gather_object(os.path.abspath(tmp_path)) save_folder = os.path.join(tmp_paths[0], 'checkpoints', '{run_name}') From 973c1bc107e19a93937588d9489d5f449b5aac35 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 16:24:26 -0400 Subject: [PATCH 13/23] try chuck hack --- docker/Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 80ae8bad2e..251ded7c95 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -189,9 +189,15 @@ ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION} RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ + if [ "${PYTORCH_VERSION}" != "2.4.0" ]; then \ pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ + else \ + pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/${CUDA_VERSION} \ + torch \ + torchvision ; \ + fi ; \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ From fac2593e225486b80e4a4d624ed326a29ec984d3 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 16:44:00 -0400 Subject: [PATCH 14/23] upgrade lint --- .../algorithms/ghost_batchnorm/ghost_batchnorm.py | 4 +++- composer/algorithms/swa/swa.py | 2 +- composer/callbacks/image_visualizer.py | 10 +++++----- tests/trainer/test_fsdp_checkpoint.py | 11 +++++++++-- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py b/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py index 3943a1c345..92aed98808 100644 --- a/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py +++ b/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py @@ -168,7 +168,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore nchunks: int = int(math.ceil(batch_size / self.ghost_batch_size)) has_momentum: bool = hasattr(self.batchnorm, 'momentum') - original_momentum: float = self.batchnorm.momentum + original_momentum: Optional[float] = self.batchnorm.momentum if self.training and has_momentum: # applying the same batchnorm multiple times greatly increases @@ -180,6 +180,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore normalized_chunks = [self.batchnorm(chunk) for chunk in input.chunk(nchunks, 0)] if self.training and has_momentum: + assert original_momentum is not None self._unscale_momentum(original_momentum) return torch.cat(normalized_chunks, dim=0) @@ -192,6 +193,7 @@ def from_batchnorm(module: torch.nn.Module, ghost_batch_size: int) -> '_GhostBat @torch.jit.unused def _scale_momentum(self, nchunks: int): + assert self.batchnorm.momentum is not None self.batchnorm.momentum = float(self.batchnorm.momentum) / nchunks @torch.jit.unused diff --git a/composer/algorithms/swa/swa.py b/composer/algorithms/swa/swa.py index 4177168a13..dd9826d44d 100644 --- a/composer/algorithms/swa/swa.py +++ b/composer/algorithms/swa/swa.py @@ -228,7 +228,7 @@ def _initialize_swa(self, state: State) -> None: state.optimizers[0], swa_lr=self.swa_lr, anneal_epochs=self.anneal_steps, - anneal_strategy=self.anneal_strategy, + anneal_strategy=self.anneal_strategy, # type: ignore ) self.swa_model = AveragedModel(state.model, 
device=torch.device('cpu')) diff --git a/composer/callbacks/image_visualizer.py b/composer/callbacks/image_visualizer.py index e8381a944c..d86a2d97bc 100644 --- a/composer/callbacks/image_visualizer.py +++ b/composer/callbacks/image_visualizer.py @@ -164,18 +164,18 @@ def _make_segmentation_images( # Ensure the targets are in the expected format if infer_target_type(outputs, targets) == 'one_hot': if channels_last: - targets = targets.argmax(dim=-1).data.cpu().numpy() + targets = targets.argmax(dim=-1).data.cpu().numpy() # type: ignore else: - targets = targets.argmax(dim=1).data.cpu().numpy() + targets = targets.argmax(dim=1).data.cpu().numpy() # type: ignore else: - targets = targets.data.cpu().numpy() + targets = targets.data.cpu().numpy() # type: ignore # Convert the outputs to the expected format if channels_last: num_classes = outputs.shape[-1] - outputs = outputs.argmax(dim=-1).cpu().numpy() + outputs = outputs.argmax(dim=-1).cpu().numpy() # type: ignore else: num_classes = outputs.shape[1] - outputs = outputs.argmax(dim=1).cpu().numpy() + outputs = outputs.argmax(dim=1).cpu().numpy() # type: ignore # Adjust targets such that negative values are mapped to one higher than the maximum class targets[targets < 0] = num_classes diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 76c3b4c5bb..5bdf76ce8a 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -1151,7 +1151,10 @@ def set_up_planner( # suffix all keys with `foo_`` state_dict['state']['model'] = {k + '_foo': v for k, v in state_dict['state']['model'].items()} - super().set_up_planner(state_dict, is_coordinator) + super().set_up_planner( + state_dict=state_dict, + is_coordinator=is_coordinator, + ) class RenameLoadPlanner(DefaultLoadPlanner): @@ -1162,7 +1165,11 @@ def set_up_planner( is_coordinator: bool, ) -> None: if 'state' not in state_dict: - super().set_up_planner(state_dict, metadata, is_coordinator) + super().set_up_planner( + state_dict=state_dict, + metadata=metadata, + is_coordinator=is_coordinator, + ) return self.original_state_dict = state_dict From 9d90cc98c2f8f9a4fba7252dab39beaa53cb7c38 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 16:46:53 -0400 Subject: [PATCH 15/23] fix planner --- composer/utils/checkpoint.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index ace6dedad4..648290a320 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -1163,11 +1163,16 @@ def _save_checkpoint( if expect_file: if version.parse(torch.__version__) >= version.parse('2.3.0'): save_planner = state.fsdp_config.save_planner - if version.parse(torch.__version__) < version.parse('2.4.0') and save_planner is None: - # Dedup is only broken on <2.4 - from composer.trainer._patch_pytorch import SavePlannerWithDedupFix + if save_planner is None: + if version.parse(torch.__version__) < version.parse('2.4.0'): + # Dedup is only broken on <2.4 + from composer.trainer._patch_pytorch import SavePlannerWithDedupFix - save_planner = SavePlannerWithDedupFix() + save_planner = SavePlannerWithDedupFix() + else: + from torch.distributed.checkpoint.default_planner import DefaultSavePlanner + + save_planner = DefaultSavePlanner(dedup_save_to_lowest_rank=True) dist_cp.save( state_dict=state_dict, storage_writer=dist_cp.FileSystemWriter(dirname), From 939ecb4e4b2376dd5bf65463fa56d64fdc036cbf Mon Sep 17 00:00:00 2001 From: Mihir 
Patel Date: Fri, 9 Aug 2024 16:57:46 -0400 Subject: [PATCH 16/23] fix inference --- tests/utils/test_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/test_inference.py b/tests/utils/test_inference.py index e7c374377d..69b78ead4c 100644 --- a/tests/utils/test_inference.py +++ b/tests/utils/test_inference.py @@ -196,7 +196,7 @@ def test_huggingface_export_for_inference_onnx(onnx_opset_version, tiny_bert_con ort_session = ort.InferenceSession(save_path, providers=['CPUExecutionProvider']) for key, value in sample_input.items(): - sample_input[key] = cpu_device.tensor_to_device(value).numpy() + sample_input[key] = cpu_device.tensor_to_device(value).numpy() # type: ignore loaded_model_out = ort_session.run(None, sample_input) From 643d4ce7dce2ead0ade4eec2622b326e717b644e Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 17:20:35 -0400 Subject: [PATCH 17/23] swap folder --- docker/Dockerfile | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 251ded7c95..7d5a655087 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -189,15 +189,9 @@ ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION} RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ - if [ "${PYTORCH_VERSION}" != "2.4.0" ]; then \ - pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ + pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch/ \ torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ - else \ - pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/${CUDA_VERSION} \ - torch \ - torchvision ; \ - fi ; \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ From 13ef79029a4428cee25e19bfb79b4acb39915953 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 17:24:17 -0400 Subject: [PATCH 18/23] different link for torchvision --- docker/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7d5a655087..35a8d2887a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -188,9 +188,10 @@ ENV PYTORCH_NIGHTLY_URL=${PYTORCH_NIGHTLY_URL} ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION} RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ - CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ + CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch/ \ - torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ + torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} && \ + pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torchvision/ \ torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ From 1b7ba47892a522f89b7806307db52ff109688cb0 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 17:40:43 -0400 Subject: [PATCH 19/23] 
remove old code --- composer/callbacks/memory_snapshot.py | 8 +- composer/callbacks/oom_observer.py | 8 +- composer/distributed/dist_strategy.py | 333 +++++++-------------- composer/distributed/mosaic_parallelism.py | 6 +- composer/profiler/torch_profiler.py | 68 ++--- composer/profiler/utils.py | 7 +- composer/trainer/_patch_pytorch.py | 158 +--------- composer/utils/dist.py | 2 - 8 files changed, 143 insertions(+), 447 deletions(-) diff --git a/composer/callbacks/memory_snapshot.py b/composer/callbacks/memory_snapshot.py index 767f3abb0f..67805db257 100644 --- a/composer/callbacks/memory_snapshot.py +++ b/composer/callbacks/memory_snapshot.py @@ -94,13 +94,7 @@ def __init__( _, _, self.remote_path_in_bucket = parse_uri(remote_file_name) else: self.remote_path_in_bucket = None - - if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore - # MemorySnapshot is only supported in torch v2.1.0-rc1 or higher - self._enabled = True - else: - self._enabled = False - warnings.warn('Memory snapshot is supported after PyTorch 2.1.0. Skipping memory snapshot callback.') + self._enabled = True def init(self, state: State, logger: Logger) -> None: if not self._enabled: diff --git a/composer/callbacks/oom_observer.py b/composer/callbacks/oom_observer.py index d43685bab7..af75ff4cab 100644 --- a/composer/callbacks/oom_observer.py +++ b/composer/callbacks/oom_observer.py @@ -113,13 +113,7 @@ def __init__( else: self.remote_path_in_bucket = None - if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore - # OOMObserver is only supported in torch v2.1.0 or higher - self._enabled = True - else: - self._enabled = False - warnings.warn('OOMObserver is supported after PyTorch 2.1.0. Disabling OOMObserver callback.') - + self._enabled = True self.filename_config: Optional[SnapshotFileNameConfig] = None def init(self, state: State, logger: Logger) -> None: diff --git a/composer/distributed/dist_strategy.py b/composer/distributed/dist_strategy.py index f7adc79428..0496abd48d 100644 --- a/composer/distributed/dist_strategy.py +++ b/composer/distributed/dist_strategy.py @@ -16,6 +16,8 @@ apply_activation_checkpointing, checkpoint_wrapper, ) +from torch.distributed.fsdp.wrap import CustomPolicy +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import offload_wrapper from torch.distributed.fsdp import FullyShardedDataParallel, ShardingStrategy from torch.distributed.fsdp._common_utils import clean_tensor_name from torch.nn.parallel import DistributedDataParallel @@ -397,177 +399,73 @@ def sync_hook(*args): if hasattr(obj, '_fsdp_wrap') and not bool(obj._fsdp_wrap): continue - # Rather than verifying these changes with older PyTorch versions, we are fixing forward here - if version.parse(torch.__version__) > version.parse('2.1.0'): - # A dictionary of all tied parameter pointers to (module, attr) tuples - tied_pointers = {} - - # Goes through all modules finding which weights have the same pointers - for mod in obj.modules(): - for attr_name, attr in mod.named_parameters(recurse=False): - ptr = id(attr) - mod_attr_list = tied_pointers.get(ptr, []) - mod_attr_list.append((mod, attr_name)) - tied_pointers[ptr] = mod_attr_list - - # Dictionary mapping the source module to a list of (target module, source attr, target attr) tuples - source_mod_to_mod_attr = {} - for mod_attr_list in tied_pointers.values(): - # If there is only one module for this pointer, then there is no weight tying - if len(mod_attr_list) == 1: - continue - - # 
Arbitrarily choose the first module as the source module - first_mod, first_attr = mod_attr_list[0] - source_mod_to_mod_attr[first_mod] = [ - (target_mod, first_attr, dest_attr) for target_mod, dest_attr in mod_attr_list[1:] - ] - - # Clean up no longer needed module references for memory safety - del tied_pointers - - def _param_init_fn(module: torch.nn.Module) -> None: - # If we do not have any parameters or buffers on meta device managed by this module directly, we do not need to call the parameter init function. - # It is assumed that whatever process moved the parameters off of meta device initialized them. - # We expect this to occur if we have tied weights, as the second module will already have the weights initialized. - is_meta = any(param.is_meta for param in module.parameters(recurse=False) - ) or any(buffer.is_meta for buffer in module.buffers(recurse=False)) - if not is_meta: - return - - # Move all parameters and buffers to the current device - module.to_empty(device=f'cuda:{torch.cuda.current_device()}', recurse=False) - - # Redo weight tying, which will have been broken by the above line that moves parameters off of meta device - if module in source_mod_to_mod_attr: - for target_mod, first_attr, dest_attr in source_mod_to_mod_attr[module]: - setattr(target_mod, dest_attr, getattr(module, first_attr)) - - # Run the specified initialization - if hasattr(obj, 'param_init_fn') and isinstance(obj.param_init_fn, Callable): - obj.param_init_fn(module) - elif hasattr(module, 'reset_parameters') and isinstance(module.reset_parameters, Callable): - module.reset_parameters() - else: - raise ValueError( - f'Object `{obj_name}` does not have a ``param_init_fn`` or a ``reset_parameters`` function. ' - 'This leaves parameters without initialization. 
Please add a ``param_init_fn`` or ``reset_parameters`` ' - f'to module `{obj_name}`.', - ) - else: - - def _param_init_fn(module: torch.nn.Module) -> None: - # A dictionary of all tied parameter pointers to module names - tied_pointers = {} - - # Goes through all modules finding which weights have the same pointers - for name, mod in module.named_modules(): - # Since FSDP recursively wraps, at parent modules we can encounter already - # wrapped weights, as a result we should skip any modules with `_fsdp_wrapped_module.` - if '_fsdp_wrapped_module' in name: - continue - for attr in ['weight', 'bias']: - if hasattr(mod, attr): - mod_attr = getattr(mod, attr) - if mod_attr is None: - continue - ptr = id(mod_attr) - ptr_attr = (ptr, attr) - name_list = tied_pointers.get(ptr_attr, []) - name_list.append(name) - tied_pointers[ptr_attr] = name_list - - # Creates a dictionary of module names that should be tied together - tied_mod_names = collections.defaultdict(list) - # Creates a set of modules we should not initialize - should_not_init_params = set() - for ptr_attr_type, mod_names in tied_pointers.items(): - # No modules for this pointer are tied - if len(mod_names) == 1: - continue - _, attr_type = ptr_attr_type - first = next(mod_names.__iter__()) - for elem in mod_names: - should_not_init_params.add('.'.join([elem, attr_type])) - tied_mod_names[(first, attr_type)].append(elem) - # Make sure at least one of the tied parameters is initialized - should_not_init_params.remove('.'.join([first, attr_type])) - - meta_safe_apply( - module, - lambda t: torch.empty_like(t, device=f'cuda:{torch.cuda.current_device()}'), - should_not_init_params, - module_name='', + # A dictionary of all tied parameter pointers to (module, attr) tuples + tied_pointers = {} + + # Goes through all modules finding which weights have the same pointers + for mod in obj.modules(): + for attr_name, attr in mod.named_parameters(recurse=False): + ptr = id(attr) + mod_attr_list = tied_pointers.get(ptr, []) + mod_attr_list.append((mod, attr_name)) + tied_pointers[ptr] = mod_attr_list + + # Dictionary mapping the source module to a list of (target module, source attr, target attr) tuples + source_mod_to_mod_attr = {} + for mod_attr_list in tied_pointers.values(): + # If there is only one module for this pointer, then there is no weight tying + if len(mod_attr_list) == 1: + continue + + # Arbitrarily choose the first module as the source module + first_mod, first_attr = mod_attr_list[0] + source_mod_to_mod_attr[first_mod] = [ + (target_mod, first_attr, dest_attr) for target_mod, dest_attr in mod_attr_list[1:] + ] + + # Clean up no longer needed module references for memory safety + del tied_pointers + + def _param_init_fn(module: torch.nn.Module) -> None: + # If we do not have any parameters or buffers on meta device managed by this module directly, we do not need to call the parameter init function. + # It is assumed that whatever process moved the parameters off of meta device initialized them. + # We expect this to occur if we have tied weights, as the second module will already have the weights initialized. 
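+                # Example (hypothetical, for illustration): with GPT-style tied
+                # embeddings, where `lm_head.weight` and the input embedding share
+                # one tensor, initializing the embedding module also materializes
+                # `lm_head.weight`, so `lm_head` is no longer on meta device and
+                # takes the early return below.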
+ is_meta = any(param.is_meta for param in module.parameters(recurse=False) + ) or any(buffer.is_meta for buffer in module.buffers(recurse=False)) + if not is_meta: + return + + # Move all parameters and buffers to the current device + module.to_empty(device=f'cuda:{torch.cuda.current_device()}', recurse=False) + + # Redo weight tying, which will have been broken by the above line that moves parameters off of meta device + if module in source_mod_to_mod_attr: + for target_mod, first_attr, dest_attr in source_mod_to_mod_attr[module]: + setattr(target_mod, dest_attr, getattr(module, first_attr)) + + # Run the specified initialization + if hasattr(obj, 'param_init_fn') and isinstance(obj.param_init_fn, Callable): + obj.param_init_fn(module) + elif hasattr(module, 'reset_parameters') and isinstance(module.reset_parameters, Callable): + module.reset_parameters() + else: + raise ValueError( + f'Object `{obj_name}` does not have a ``param_init_fn`` or a ``reset_parameters`` function. ' + 'This leaves parameters without initialization. Please add a ``param_init_fn`` or ``reset_parameters`` ' + f'to module `{obj_name}`.', ) - if len(tied_mod_names) > 0: - warnings.warn(( - 'The passed in model appears to have tied weights. In order to ' - 'support effective weight tying, the tied modules need to be ' - 'in the same FSDP module. If the weights are not properly tied ' - 'it can lead to loss spikes. We have tried our best to ensure ' - 'the tied weights are in the same FSDP module.' - )) - - # Redoes weight tying - for name_attr, tied_names in tied_mod_names.items(): - name, attr = name_attr - src_mod = module.get_submodule(name) - # We need to make sure the source and destination - # modules end up in the same FSDP module otherwise - # with sharding weight tying gets violated - src_mod._fsdp_wrap = False # type: ignore - src_params = getattr(src_mod, attr) - for tied_name in tied_names: - dest_mod = module.get_submodule(tied_name) - dest_mod._fsdp_wrap = False # type: ignore - setattr(dest_mod, attr, src_params) - - if hasattr(obj, 'param_init_fn') and isinstance(obj.param_init_fn, Callable): - module.apply(obj.param_init_fn) - elif hasattr(module, 'reset_parameters') and isinstance(module.reset_parameters, Callable): - module.reset_parameters() - else: - raise ValueError( - f'Object `{obj_name}` does not have a ``param_init_fn`` or a ``reset_parameters`` function. ' - 'This leaves parameters without initialization. Please add a ``param_init_fn`` or ``reset_parameters`` ' - f'to module `{obj_name}`.', - ) - - if version.parse(torch.__version__) > version.parse('2.1.0.dev'): - # CustomPolicy is only supported in torch v2.1.0-rc1 or higher - from torch.distributed.fsdp.wrap import CustomPolicy # type: ignore - - def lambda_fn(module: torch.nn.Module) -> Union[bool, dict]: - ret = False - if hasattr(module, '_fsdp_wrap'): - ret = bool(module._fsdp_wrap) - elif hasattr(obj, 'fsdp_wrap_fn') and isinstance(obj.fsdp_wrap_fn, Callable): - ret = obj.fsdp_wrap_fn(module) - if isinstance(ret, dict): - ret = set_custom_fsdp_module_kwargs(ret, process_group_cache) - return ret - - _auto_wrap_policy = CustomPolicy(lambda_fn) - else: - # Choose which modules to FSDP wrap according to the following priority: - # If module has attribute `module._fsdp_wrap = ...`, always respect it - # Otherwise wrap if root object `obj.fsdp_wrap_fn(module)` is true. 
- def __auto_wrap_policy(module: torch.nn.Module, recurse: bool, nonwrapped_numel: int) -> bool: - if recurse: - return True - should_be_wrapped = False - if hasattr(module, '_fsdp_wrap'): - should_be_wrapped = bool(module._fsdp_wrap) - elif hasattr(obj, 'fsdp_wrap_fn') and isinstance(obj.fsdp_wrap_fn, Callable): - should_be_wrapped = obj.fsdp_wrap_fn(module) - - return should_be_wrapped - - def _auto_wrap_policy_new(module: torch.nn.Module, recurse: bool, nonwrapped_numel: int) -> bool: - return __auto_wrap_policy(module, recurse, nonwrapped_numel) + def lambda_fn(module: torch.nn.Module) -> Union[bool, dict]: + ret = False + if hasattr(module, '_fsdp_wrap'): + ret = bool(module._fsdp_wrap) + elif hasattr(obj, 'fsdp_wrap_fn') and isinstance(obj.fsdp_wrap_fn, Callable): + ret = obj.fsdp_wrap_fn(module) + if isinstance(ret, dict): + ret = set_custom_fsdp_module_kwargs(ret, process_group_cache) + return ret - _auto_wrap_policy = _auto_wrap_policy_new + _auto_wrap_policy = CustomPolicy(lambda_fn) fsdp_obj = FullyShardedDataParallel( obj, @@ -640,75 +538,52 @@ def _auto_wrap_policy_new(module: torch.nn.Module, recurse: bool, nonwrapped_num # FP8 TE requires using the TE checkpoint function, FSDP activation checkpointing only works with TE non-reentrant checkpointing if te_checkpoint_wrapper: assert not activation_checkpointing_reentrant, 'TE checkpoint only works with non-reentrant checkpointing' - if version.parse(torch.__version__) > version.parse('2.1.0.dev'): - from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import offload_wrapper - if not activation_checkpointing_reentrant: - if te_checkpoint_wrapper: - try: - import transformer_engine.pytorch as te - except ModuleNotFoundError: - raise ModuleNotFoundError( - 'Please install transformer-engine to use TE checkpoint wrapper', - ) - - # RNG state tracker for checkpointing - CUDA_RNG_STATES_TRACKER = te.distributed.CudaRNGStatesTracker() - CUDA_RNG_STATES_TRACKER.add('fsdp-rng', te_rng_seed) - - def get_cuda_rng_tracker(): - return CUDA_RNG_STATES_TRACKER - - first_wrap_fn = lambda m: checkpoint_wrapper( - m, - context_fn=te.distributed.get_activation_recompute_contexts, - checkpoint_fn=te.distributed.checkpoint, - use_reentrant=False, - get_rng_state_tracker=get_cuda_rng_tracker, + if not activation_checkpointing_reentrant: + if te_checkpoint_wrapper: + try: + import transformer_engine.pytorch as te + except ModuleNotFoundError: + raise ModuleNotFoundError( + 'Please install transformer-engine to use TE checkpoint wrapper', ) - else: - first_wrap_fn = lambda m: checkpoint_wrapper( - m, - checkpoint_impl=CheckpointImpl.NO_REENTRANT, - ) if activation_checkpointing else (lambda module: module) - second_wrap_fn = ( - lambda module: offload_wrapper( - first_wrap_fn(module) - if activation_checkpointing else module, # type: ignore reportGeneralTypeIssues - ) - ) if activation_cpu_offload else first_wrap_fn - else: + + # RNG state tracker for checkpointing + CUDA_RNG_STATES_TRACKER = te.distributed.CudaRNGStatesTracker() + CUDA_RNG_STATES_TRACKER.add('fsdp-rng', te_rng_seed) + + def get_cuda_rng_tracker(): + return CUDA_RNG_STATES_TRACKER first_wrap_fn = lambda m: checkpoint_wrapper( m, - checkpoint_impl=CheckpointImpl.REENTRANT, - ) if activation_checkpointing else (lambda module: module) - second_wrap_fn = ( - lambda module: offload_wrapper( - first_wrap_fn(module) - if activation_checkpointing else module, # type: ignore reportGeneralTypeIssues - ) - ) if activation_cpu_offload else first_wrap_fn - else: - if not 
activation_checkpointing_reentrant: + context_fn=te.distributed.get_activation_recompute_contexts, + checkpoint_fn=te.distributed.checkpoint, + use_reentrant=False, + get_rng_state_tracker=get_cuda_rng_tracker, + ) + else: first_wrap_fn = lambda m: checkpoint_wrapper( m, checkpoint_impl=CheckpointImpl.NO_REENTRANT, ) if activation_checkpointing else (lambda module: module) - second_wrap_fn = ( - lambda module: checkpoint_wrapper( - first_wrap_fn(module), # type: ignore reportGeneralTypeIssues - checkpoint_impl=CheckpointImpl.NO_REENTRANT, - offload_to_cpu=True, - ) - ) if activation_cpu_offload else first_wrap_fn - else: - first_wrap_fn = checkpoint_wrapper if activation_checkpointing else (lambda module: module) - second_wrap_fn = ( - lambda module: checkpoint_wrapper( - first_wrap_fn(module), # type: ignore reportGeneralTypeIssues - offload_to_cpu=True, - ) - ) if activation_cpu_offload else first_wrap_fn + second_wrap_fn = ( + lambda module: offload_wrapper( + first_wrap_fn(module) + if activation_checkpointing else module, # type: ignore reportGeneralTypeIssues + ) + ) if activation_cpu_offload else first_wrap_fn + else: + + first_wrap_fn = lambda m: checkpoint_wrapper( + m, + checkpoint_impl=CheckpointImpl.REENTRANT, + ) if activation_checkpointing else (lambda module: module) + second_wrap_fn = ( + lambda module: offload_wrapper( + first_wrap_fn(module) + if activation_checkpointing else module, # type: ignore reportGeneralTypeIssues + ) + ) if activation_cpu_offload else first_wrap_fn # Choose which modules to activation checkpoint according to the following priority: # If module has attribute `module._activation_checkpointing = ...`, always respect it diff --git a/composer/distributed/mosaic_parallelism.py b/composer/distributed/mosaic_parallelism.py index 66c06d911b..fc261e1edf 100644 --- a/composer/distributed/mosaic_parallelism.py +++ b/composer/distributed/mosaic_parallelism.py @@ -27,12 +27,10 @@ 'NO_SHARD': ShardingStrategy.NO_SHARD, 'SHARD_GRAD_OP': ShardingStrategy.SHARD_GRAD_OP, 'FULL_SHARD': ShardingStrategy.FULL_SHARD, + '_HYBRID_SHARD_ZERO2': ShardingStrategy._HYBRID_SHARD_ZERO2, + 'HYBRID_SHARD': ShardingStrategy.HYBRID_SHARD, } -if version.parse(torch.__version__) >= version.parse('2.1.0'): - SHARDING_MAP['_HYBRID_SHARD_ZERO2'] = ShardingStrategy._HYBRID_SHARD_ZERO2 - SHARDING_MAP['HYBRID_SHARD'] = ShardingStrategy.HYBRID_SHARD - BACKWARD_PREFETCH_MAP = { 'NONE': None, 'BACKWARD_PRE': BackwardPrefetch.BACKWARD_PRE, diff --git a/composer/profiler/torch_profiler.py b/composer/profiler/torch_profiler.py index 883ba2b442..2d76c5bf95 100644 --- a/composer/profiler/torch_profiler.py +++ b/composer/profiler/torch_profiler.py @@ -27,6 +27,7 @@ format_name_with_dist, format_name_with_dist_and_time, ) +from composer.profiler.utils import export_memory_timeline_html if TYPE_CHECKING: from composer.core import State @@ -296,44 +297,39 @@ def handler_fn(prof: torch.profiler.profiler.profile): f'PyTorch memory timeline profiler enabled: {self.memory_filename if self.memory_filename else False}', ) if self.memory_filename is not None: - if version.parse(torch.__version__) > version.parse('2.1.0.dev'): # type: ignore - # memory timeline profiling is only supported in torch v2.1.0-rc1 or higher - memory_trace_file_name = os.path.join( - folder_name, - format_name_with_dist_and_time( - self.memory_filename, - run_name=state.run_name, - timestamp=timestamp, - ), + memory_trace_file_name = os.path.join( + folder_name, + format_name_with_dist_and_time( + self.memory_filename, + 
run_name=state.run_name,
+                        timestamp=timestamp,
+                    ),
+                )
+                log.debug(f'Saving memory trace to {memory_trace_file_name}')
+                memory_trace_file_dirname = os.path.dirname(memory_trace_file_name)
+                if memory_trace_file_dirname:
+                    os.makedirs(memory_trace_file_dirname, exist_ok=True)
+                export_memory_timeline_html(
+                    prof,
+                    memory_trace_file_name,
+                    torch.cuda.current_device(),  # type: ignore
+                )
+                if self.memory_remote_file_name is not None:
+                    memory_trace_remote_file_name = format_name_with_dist_and_time(
+                        self.memory_remote_file_name,
+                        run_name=state.run_name,
+                        timestamp=timestamp,
+                    )
+                    memory_trace_remote_file_name = memory_trace_remote_file_name.lstrip('/')
+                    log.debug(
+                        f'Uploading memory trace to {memory_trace_remote_file_name} from {memory_trace_file_name}',
+                    )
+                    logger.upload_file(
+                        remote_file_name=memory_trace_remote_file_name,
+                        file_path=memory_trace_file_name,
+                        overwrite=self.overwrite,
+                    )
+                    log.debug(f'Uploaded memory trace to {memory_trace_remote_file_name}')
-                log.debug(f'Saving memory trace to {memory_trace_file_name}')
-                memory_trace_file_dirname = os.path.dirname(memory_trace_file_name)
-                if memory_trace_file_dirname:
-                    os.makedirs(memory_trace_file_dirname, exist_ok=True)
-                from composer.profiler.utils import export_memory_timeline_html
-                export_memory_timeline_html(
-                    prof,
-                    memory_trace_file_name,
-                    torch.cuda.current_device(),  # type: ignore
-                )
-                log.debug(f'Uploaded memory trace to {self.memory_remote_file_name}')
-                if self.memory_remote_file_name is not None:
-                    memory_trace_remote_file_name = format_name_with_dist_and_time(
-                        self.memory_remote_file_name,
-                        run_name=state.run_name,
-                        timestamp=timestamp,
-                    )
-                    memory_trace_remote_file_name = memory_trace_remote_file_name.lstrip('/')
-                    log.debug(
-                        f'Uploading memory trace to {memory_trace_remote_file_name} from {memory_trace_file_name}',
-                    )
-                    logger.upload_file(
-                        remote_file_name=memory_trace_remote_file_name,
-                        file_path=memory_trace_file_name,
-                        overwrite=self.overwrite,
-                    )
-            else:
-                log.warning('Memory timeline is supported after PyTorch 2.1.0. Skipping memory trace.')

         if self.num_traces_to_keep >= 0:
             while len(self.saved_traces) > self.num_traces_to_keep:
diff --git a/composer/profiler/utils.py b/composer/profiler/utils.py
index ddd235b711..63a0fa59a1 100644
--- a/composer/profiler/utils.py
+++ b/composer/profiler/utils.py
@@ -10,6 +10,8 @@
 from tempfile import NamedTemporaryFile
 from typing import Any, Optional, Union

+from torch.profiler._memory_profiler import _CATEGORY_TO_COLORS, _CATEGORY_TO_INDEX, MemoryProfileTimeline
+
 import numpy as np
 import torch
 import torch.cuda
@@ -29,11 +31,6 @@ def export_memory_timeline_html(
     return_fig: bool = False,
 ) -> Optional[Union[None, Any]]:
     """Exports a memory timeline to an HTML file. Similar to the PyTorch plotting function, but with adjusted axis tickers and grids."""
-    if version.parse(torch.__version__) <= version.parse('2.1.0.dev'):
-        log.warning('export_memory_timeline_html failed because memory timeline is supported after PyTorch 2.1.0.')
-        return
-
-    from torch.profiler._memory_profiler import _CATEGORY_TO_COLORS, _CATEGORY_TO_INDEX, MemoryProfileTimeline

     # Default to device 0, if unset. Fallback on cpu.
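+    # `prof.use_device` holds the profiled device type as a plain string; when it
+    # names a non-CUDA backend, index 0 of that backend is used as the device.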
if device is None and prof.use_device and prof.use_device != 'cuda': diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index 24ba86a2cb..bd430e6ce9 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -47,29 +47,7 @@ def patch_unshard_for_automicrobatching(auto_microbatch_size_found=False): def patch_pytorch(): """Monkey patches pytorch functions based on pytorch version.""" - if version.parse(torch.__version__) < version.parse('2.1.1'): - # Monkey patch for torch < 2.1.1 ie torch == 2.1.0 - - # Monkey patch sharding method - ChunkShardingSpec.build_metadata = build_metadata - - # Monkey patch partial state dict handling - from torch.distributed.fsdp import _state_dict_utils - - _state_dict_utils._sharded_pre_load_state_dict_hook = (_sharded_pre_load_state_dict_hook) - - # Allow 2D HSDP - from torch.distributed.fsdp import _runtime_utils - _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None - - elif version.parse(torch.__version__) < version.parse('2.1.3'): - # Monkey patch for torch < 2.1.3 ie torch == 2.1.1, 2.1.2 - - # Allow 2D HSDP - from torch.distributed.fsdp import _runtime_utils - _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None - - elif version.parse(torch.__version__) < version.parse('2.2.1'): + if version.parse(torch.__version__) < version.parse('2.2.1'): # Monkey patch for torch < 2.2.1 ie torch == 2.2.0 # Allow 2D HSDP @@ -140,140 +118,6 @@ def patch_pytorch(): pass -def build_metadata( - self, - tensor_sizes: torch.Size, - tensor_properties: sharded_tensor_meta.TensorProperties, -) -> sharded_tensor_meta.ShardedTensorMetadata: - """Adds nightly change for ChunkShardingSpec. - - Change implemented in https://github.com/pytorch/pytorch/pull/108915 - """ - tensor_num_dim = len(tensor_sizes) - - self._verify_dim(self.dim) - if self.dim >= tensor_num_dim or self.dim < -tensor_num_dim: # type: ignore[operator] - raise ValueError(f'Invalid sharding dim: {self.dim}') - - shards_metadata = [] - sharding_dim_size = tensor_sizes[self.dim] # type: ignore[index] - chunks = len(self.placements) - split_size = get_split_size(sharding_dim_size, chunks) - for idx, placement in enumerate(self.placements): - # generate ShardMetadata for each placement device - chunked_dim_size = get_chunked_dim_size(sharding_dim_size, split_size, idx) - shard_size = list(tensor_sizes) - current_offsets = [0] * tensor_num_dim - current_offsets[self.dim] = split_size * idx # type: ignore[index] - shard_size[self.dim] = chunked_dim_size # type: ignore[index] - - shard_metadata = ShardMetadata( - shard_offsets=current_offsets, - shard_sizes=shard_size, - placement=placement, - ) - shards_metadata.append(shard_metadata) - - return sharded_tensor_meta.ShardedTensorMetadata(shards_metadata, tensor_sizes, tensor_properties) - - -@no_type_check -def _sharded_pre_load_state_dict_hook( - module: nn.Module, - fsdp_state, - state_dict: dict[str, Any], - prefix: str, -) -> None: - """Adds nightly change for partial state dict error handling. - - https://github.com/pytorch/pytorch/blob/0511df0ee9edeb5c2613805ccfb49beb323b87f9/torch/distributed/fsdp/_state_dict_utils.py#L607-L615 - - The hook combines the unflattened, sharded parameters (ShardedTensor) to - a new FlatParameter and shards the new FlatParameter to the local chunk. 
- """ - from torch.distributed._tensor import Replicate - from torch.distributed.distributed_c10d import _get_pg_default_device - from torch.distributed.fsdp._common_utils import FSDP_PREFIX, _has_fsdp_params, _is_composable, _module_handle - from torch.distributed.fsdp._runtime_utils import _lazy_init - from torch.distributed.fsdp._state_dict_utils import _enter_unshard_params_ctx, _param_name_infos - - _lazy_init(fsdp_state, module) - if not _is_composable(fsdp_state): - _replace_by_prefix(state_dict, prefix, prefix + f'{FSDP_PREFIX}') - if not _has_fsdp_params(fsdp_state, module): - return - - handle = _module_handle(fsdp_state, module) - if not handle.uses_sharded_strategy: # type: ignore - raise RuntimeError( - 'load_sharded_state_dict can only be called when parameters ' - 'are flattened and sharded.', - ) - - device = fsdp_state.compute_device - for fqn, _, _ in _param_name_infos(module, fsdp_state): - if not _is_composable(fsdp_state): - fqn_from_global_root = f'{prefix}{FSDP_PREFIX}{fqn}' - else: - fqn_from_global_root = f'{prefix}{fqn}' - try: - param = state_dict.pop(fqn_from_global_root) - except KeyError: - log.warning( - f'Did not find param with FQN {fqn_from_global_root}, skipping it. ' # noqa: G004 - 'The weight will not be filled if you expect it to be.', - ) - continue # TODO: Improve unittesting for state_dict finetuning - # cases: https://github.com/pytorch/pytorch/issues/109134 - - if not fsdp_state._state_dict_config.use_dtensor: - # All-gather the param (ShardedTensor) - param, shards = _ext_pre_load_state_dict_transform(param) - - assert len(shards) < 2, ( - 'Expects 0 or 1 shard per rank ' - f'but got {len(shards)} shards on rank {fsdp_state.rank}.' - ) - param_numel = param.size().numel() - dim_0_size = param.size()[0] - chunk_size = (math.ceil(dim_0_size / fsdp_state.world_size) * param_numel // dim_0_size) - if len(shards) == 1: - local_tensor = shards[0].tensor.flatten() - pg_device = _get_pg_default_device(fsdp_state.process_group) - if local_tensor.device.type != pg_device.type: - local_tensor = local_tensor.to(pg_device) - num_padding = chunk_size - local_tensor.numel() - if num_padding > 0: - local_tensor = F.pad(local_tensor, [0, num_padding]) - else: - local_tensor = torch.zeros(chunk_size, dtype=param.dtype, device=device) - tensor = torch.empty( - chunk_size * fsdp_state.world_size, - dtype=local_tensor.dtype, - device=device, - ) - if local_tensor.is_cpu: - # Tensor could be on FSDP GPU compute device, while local_tensor is on CPU. - # Convert to CPU so all_gather can work. 
- tensor_dev = tensor.device - tensor = tensor.cpu() - tensor_list = list(torch.chunk(tensor, torch.distributed.get_world_size(fsdp_state.process_group))) - torch.distributed.all_gather(tensor_list, local_tensor, group=fsdp_state.process_group) - tensor.to(tensor_dev) - else: - torch.distributed.all_gather_into_tensor(tensor, local_tensor, group=fsdp_state.process_group) - tensor = tensor.narrow(0, 0, param_numel).reshape(param.size()) - state_dict[fqn_from_global_root] = tensor - else: - if param.device != fsdp_state._device_mesh.device_type: # type: ignore - param = param.to(fsdp_state._device_mesh.device_type) # type: ignore - - param = param.redistribute(device_mesh=param.device_mesh, placements=[Replicate()]) - state_dict[fqn_from_global_root] = param.to_local() - - _enter_unshard_params_ctx(module, fsdp_state, writeback=True) - - if version.parse(torch.__version__) >= version.parse('2.2.1') and version.parse( torch.__version__,) < version.parse('2.2.3'): diff --git a/composer/utils/dist.py b/composer/utils/dist.py index 2178ce2dd5..5b89b5b531 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -579,8 +579,6 @@ def initialize_dist(device: Union[str, Device], timeout: float = 300.0) -> None: 'PyTorch XLA package not found. In order to use XLA based devices ' 'PyTorch XLA must be installed.', ) - if version.parse(torch_xla.__version__) < version.parse('2.1.0'): - raise RuntimeError(f'PyTorch XLA version must be at least 2.1.0, found {torch_xla.__version__}.') # XLA initialization requires the init_method to be set dist.init_process_group(device_obj.dist_backend, init_method='xla://') elif dist_env_vars_match_defaults: From 6da77c1dca2f9951f6fc8e9fde3e1a6ede46b9d3 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 17:45:10 -0400 Subject: [PATCH 20/23] lint --- composer/callbacks/memory_snapshot.py | 1 - composer/callbacks/oom_observer.py | 1 - composer/distributed/dist_strategy.py | 12 ++++-------- composer/distributed/mosaic_parallelism.py | 1 - composer/profiler/torch_profiler.py | 3 +-- composer/profiler/utils.py | 5 +---- composer/trainer/_patch_pytorch.py | 8 -------- composer/utils/dist.py | 6 +----- 8 files changed, 7 insertions(+), 30 deletions(-) diff --git a/composer/callbacks/memory_snapshot.py b/composer/callbacks/memory_snapshot.py index 67805db257..328d781d81 100644 --- a/composer/callbacks/memory_snapshot.py +++ b/composer/callbacks/memory_snapshot.py @@ -9,7 +9,6 @@ from typing import Optional, Union import torch.cuda -from packaging import version from composer import State from composer.core import Callback, State, Time, TimeUnit diff --git a/composer/callbacks/oom_observer.py b/composer/callbacks/oom_observer.py index af75ff4cab..d85b4ec6ca 100644 --- a/composer/callbacks/oom_observer.py +++ b/composer/callbacks/oom_observer.py @@ -14,7 +14,6 @@ from typing import Optional import torch.cuda -from packaging import version from composer.core import Callback, State from composer.loggers import Logger diff --git a/composer/distributed/dist_strategy.py b/composer/distributed/dist_strategy.py index 0496abd48d..1b09a9fd74 100644 --- a/composer/distributed/dist_strategy.py +++ b/composer/distributed/dist_strategy.py @@ -3,7 +3,6 @@ """Helpers for running distributed data parallel training.""" -import collections import logging import warnings from contextlib import contextmanager, nullcontext @@ -15,18 +14,17 @@ CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper, + offload_wrapper, ) -from torch.distributed.fsdp.wrap import 
CustomPolicy -from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import offload_wrapper from torch.distributed.fsdp import FullyShardedDataParallel, ShardingStrategy from torch.distributed.fsdp._common_utils import clean_tensor_name +from torch.distributed.fsdp.wrap import CustomPolicy from torch.nn.parallel import DistributedDataParallel from torchmetrics import Metric, MetricCollection from composer.core import Precision, State from composer.core.precision import _validate_precision from composer.devices import Device, DeviceGPU -from composer.distributed.meta_safe_apply import meta_safe_apply from composer.distributed.mosaic_parallelism import ( BACKWARD_PREFETCH_MAP, SHARDING_MAP, @@ -431,7 +429,7 @@ def _param_init_fn(module: torch.nn.Module) -> None: # It is assumed that whatever process moved the parameters off of meta device initialized them. # We expect this to occur if we have tied weights, as the second module will already have the weights initialized. is_meta = any(param.is_meta for param in module.parameters(recurse=False) - ) or any(buffer.is_meta for buffer in module.buffers(recurse=False)) + ) or any(buffer.is_meta for buffer in module.buffers(recurse=False)) if not is_meta: return @@ -543,9 +541,7 @@ def lambda_fn(module: torch.nn.Module) -> Union[bool, dict]: try: import transformer_engine.pytorch as te except ModuleNotFoundError: - raise ModuleNotFoundError( - 'Please install transformer-engine to use TE checkpoint wrapper', - ) + raise ModuleNotFoundError('Please install transformer-engine to use TE checkpoint wrapper',) # RNG state tracker for checkpointing CUDA_RNG_STATES_TRACKER = te.distributed.CudaRNGStatesTracker() diff --git a/composer/distributed/mosaic_parallelism.py b/composer/distributed/mosaic_parallelism.py index fc261e1edf..0fa6a0547c 100644 --- a/composer/distributed/mosaic_parallelism.py +++ b/composer/distributed/mosaic_parallelism.py @@ -8,7 +8,6 @@ from typing import Any, Union import torch -from packaging import version from torch import distributed from torch.distributed import ProcessGroup from torch.distributed.fsdp import ( diff --git a/composer/profiler/torch_profiler.py b/composer/profiler/torch_profiler.py index 2d76c5bf95..93e753bbd5 100644 --- a/composer/profiler/torch_profiler.py +++ b/composer/profiler/torch_profiler.py @@ -13,12 +13,12 @@ import torch.cuda import torch.profiler -from packaging import version from torch.profiler.profiler import ProfilerAction as TorchProfilerAction from composer.core.callback import Callback from composer.loggers import Logger from composer.profiler.profiler_action import ProfilerAction +from composer.profiler.utils import export_memory_timeline_html from composer.utils import ( FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, FORMAT_NAME_WITH_DIST_TABLE, @@ -27,7 +27,6 @@ format_name_with_dist, format_name_with_dist_and_time, ) -from composer.profiler.utils import export_memory_timeline_html if TYPE_CHECKING: from composer.core import State diff --git a/composer/profiler/utils.py b/composer/profiler/utils.py index 63a0fa59a1..68f2862549 100644 --- a/composer/profiler/utils.py +++ b/composer/profiler/utils.py @@ -10,12 +10,10 @@ from tempfile import NamedTemporaryFile from typing import Any, Optional, Union -from torch.profiler._memory_profiler import _CATEGORY_TO_COLORS, _CATEGORY_TO_INDEX, MemoryProfileTimeline - import numpy as np import torch import torch.cuda -from packaging import version +from torch.profiler._memory_profiler import _CATEGORY_TO_COLORS, _CATEGORY_TO_INDEX, MemoryProfileTimeline 
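+# NOTE: torch.profiler._memory_profiler is a private PyTorch module; importing it
+# unconditionally at the top level is only safe because support for PyTorch < 2.2
+# was dropped in this series.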
from torch.profiler.profiler import profile as TorchProfile log = logging.getLogger(__name__) @@ -31,7 +29,6 @@ def export_memory_timeline_html( return_fig: bool = False, ) -> Optional[Union[None, Any]]: """Exports a memory timeline to an HTML file. Similar to the PyTorch plotting function, but with adjusted axis tickers and grids.""" - # Default to device 0, if unset. Fallback on cpu. if device is None and prof.use_device and prof.use_device != 'cuda': device = prof.use_device + ':0' diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index bd430e6ce9..881914e2ce 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -11,7 +11,6 @@ """PyTorch, especially PyTorch Distributed, monkeypatches.""" import logging -import math import functools import contextlib from dataclasses import asdict @@ -20,16 +19,9 @@ import torch -import torch.distributed._shard.sharded_tensor.metadata as sharded_tensor_meta -from torch.distributed._shard.sharding_spec import ChunkShardingSpec import torch.nn as nn -import torch.nn.functional as F from packaging import version -from torch.distributed._shard.sharding_spec import ShardMetadata -from torch.distributed._shard.sharding_spec._internals import get_chunked_dim_size, get_split_size from torch.distributed.fsdp import FullyShardedDataParallel, ShardingStrategy -from torch.distributed.fsdp._fsdp_extensions import _ext_pre_load_state_dict_transform -from torch.distributed.utils import _replace_by_prefix from composer.utils import dist diff --git a/composer/utils/dist.py b/composer/utils/dist.py index 5b89b5b531..0515828a10 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -47,12 +47,8 @@ import torch import torch.distributed as dist import torch.utils.data -from packaging import version -from composer.utils.device import get_device, is_hpu_installed, is_xla_installed - -if is_xla_installed(): - import torch_xla +from composer.utils.device import get_device, is_hpu_installed if TYPE_CHECKING: from composer.devices import Device From 71c3de7255f6afef4d9e85ae0dd3c7a04cfa7e5c Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 17:59:48 -0400 Subject: [PATCH 21/23] 4wide --- .github/workflows/docker-configure-build-push.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-configure-build-push.yaml b/.github/workflows/docker-configure-build-push.yaml index 2b6bf4893d..a668e75217 100644 --- a/.github/workflows/docker-configure-build-push.yaml +++ b/.github/workflows/docker-configure-build-push.yaml @@ -36,7 +36,7 @@ on: required: true jobs: configure-build-push: - runs-on: ubuntu-latest + runs-on: mosaic-4wide steps: - name: Maximize Build Space on Worker uses: easimon/maximize-build-space@v4 From 1ce316938c365dbfd0cfadcdd8ee58e05b5e350b Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 18:00:50 -0400 Subject: [PATCH 22/23] bump fa --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 35a8d2887a..c3f4dee907 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -262,7 +262,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ RUN if [ -n "$CUDA_VERSION" ] ; then \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - MAX_JOBS=1 pip${PYTHON_VERSION} install --no-cache-dir --no-build-isolation flash-attn==2.6.2; \ + MAX_JOBS=1 
pip${PYTHON_VERSION} install --no-cache-dir --no-build-isolation flash-attn==2.6.3; \ cd .. ; \ fi From e9224f45f2eb2b44d6df0e49eee8313240fc9813 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 17:47:43 -0400 Subject: [PATCH 23/23] remove 2.1 tests --- .github/workflows/daily.yaml | 28 ---------------------------- .github/workflows/pr-cpu.yaml | 4 ---- composer/trainer/trainer.py | 1 - 3 files changed, 33 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index b64e68d493..ee94e89c2b 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -17,11 +17,6 @@ jobs: strategy: matrix: include: - - name: cpu-3.10-2.1 - container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not doctest - pytest_command: coverage run -m pytest - composer_package_name: mosaicml - name: cpu-3.11-2.2 container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest @@ -42,11 +37,6 @@ jobs: markers: not daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml - - name: daily-cpu-3.10-2.1 - container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not doctest - pytest_command: coverage run -m pytest - composer_package_name: mosaicml - name: daily-cpu-3.11-2.2 container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest @@ -102,12 +92,6 @@ jobs: # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time # on MCLOUD and not eat up all GPUs at once include: - - name: "gpu-3.10-2.1-1-gpu" - container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - pytest_command: "coverage run -m pytest" - composer_package_name: "mosaicml" - gpu_num: 1 - name: "gpu-3.11-2.2-1-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -120,12 +104,6 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 1 - - name: "gpu-3.10-2.1-2-gpu" - container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - pytest_command: "coverage run -m pytest" - composer_package_name: "mosaicml" - gpu_num: 2 - name: "gpu-3.11-2.2-2-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -138,12 +116,6 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 2 - - name: "gpu-3.10-2.1-4-gpu" - container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - pytest_command: "coverage run -m pytest" - composer_package_name: "mosaicml" - gpu_num: 4 - name: "gpu-3.11-2.2-4-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 
23129715db..4d44e69824 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -13,10 +13,6 @@ jobs: strategy: matrix: include: - - name: cpu-3.10-2.1 - container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and not remote and not gpu and not doctest - pytest_command: coverage run -m pytest - name: cpu-3.11-2.2 container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index b2f829ca10..27323718fc 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -41,7 +41,6 @@ import torch.utils.data from packaging import version from torch._dynamo import OptimizedModule -from torch.cuda.amp.grad_scaler import GradScaler from torch.distributed.fsdp import FullyShardedDataParallel from torch.distributed.fsdp._runtime_utils import _post_backward_final_callback from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
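
With the old version-gated fallback removed in PATCH 19, `CustomPolicy` from
`torch.distributed.fsdp.wrap` is the only auto-wrap path left. The sketch below is
illustrative only: it shows how such a policy resolves per-module wrap decisions.
`ToyBlock` is a hypothetical stand-in, and Composer's real `lambda_fn` additionally
consults a root-level `fsdp_wrap_fn` and routes dict returns through
`set_custom_fsdp_module_kwargs`.

# Illustrative sketch, not part of the patch series.
import torch.nn as nn
from torch.distributed.fsdp.wrap import CustomPolicy


class ToyBlock(nn.Module):  # hypothetical module, not Composer code
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(16, 16)
        self._fsdp_wrap = True  # explicit per-module opt-in takes priority


def lambda_fn(module: nn.Module) -> bool:
    # Priority 1: respect a module-level `_fsdp_wrap` attribute when present.
    if hasattr(module, '_fsdp_wrap'):
        return bool(module._fsdp_wrap)
    # Priority 2 (omitted here): consult a root-level `fsdp_wrap_fn`; a dict
    # return value would be normalized via set_custom_fsdp_module_kwargs.
    return False


policy = CustomPolicy(lambda_fn)
assert lambda_fn(ToyBlock()) is True
# The policy is then handed to FSDP, e.g.
# FullyShardedDataParallel(model, auto_wrap_policy=policy).

A dict returned from `lambda_fn` is also accepted by `CustomPolicy` and is treated
as per-module FSDP kwargs, which is what allows the process-group overrides in the
patched code path.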