Bump to PyTorch 2.4 (#3542)

Co-authored-by: Chuck Tang <chuck.tang@databricks.com>
mosaicml · Aug 12, 2024 · a10a798
1 parent 1320825
commit a10a798
Show file tree

Hide file tree

Showing 24 changed files with 276 additions and 639 deletions.
diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml
@@ -17,11 +17,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cpu-3.10-2.1
-          container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-          markers: not daily and (remote or not remote) and not gpu and not doctest
-          pytest_command: coverage run -m pytest
-          composer_package_name: mosaicml
         - name: cpu-3.11-2.2
           container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
           markers: not daily and (remote or not remote) and not gpu and not doctest
@@ -42,11 +37,6 @@ jobs:
           markers: not daily and (remote or not remote) and not gpu and doctest
           pytest_command: coverage run -m pytest tests/test_docs.py
           composer_package_name: mosaicml
-        - name: daily-cpu-3.10-2.1
-          container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-          markers: daily and (remote or not remote) and not gpu and not doctest
-          pytest_command: coverage run -m pytest
-          composer_package_name: mosaicml
         - name: daily-cpu-3.11-2.2
           container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
           markers: daily and (remote or not remote) and not gpu and not doctest
@@ -102,12 +92,6 @@ jobs:
         # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
         # on MCLOUD and not eat up all GPUs at once
         include:
-        - name: "gpu-3.10-2.1-1-gpu"
-          container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-          pytest_command: "coverage run -m pytest"
-          composer_package_name: "mosaicml"
-          gpu_num: 1
         - name: "gpu-3.11-2.2-1-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -120,12 +104,6 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 1
-        - name: "gpu-3.10-2.1-2-gpu"
-          container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-          pytest_command: "coverage run -m pytest"
-          composer_package_name: "mosaicml"
-          gpu_num: 2
         - name: "gpu-3.11-2.2-2-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -138,12 +116,6 @@ jobs:
           pytest_command: "coverage run -m pytest"
           composer_package_name: "mosaicml"
           gpu_num: 2
-        - name: "gpu-3.10-2.1-4-gpu"
-          container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-          markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-          pytest_command: "coverage run -m pytest"
-          composer_package_name: "mosaicml"
-          gpu_num: 4
         - name: "gpu-3.11-2.2-4-gpu"
           container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
           markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"

diff --git a/.github/workflows/docker-configure-build-push.yaml b/.github/workflows/docker-configure-build-push.yaml
@@ -36,7 +36,7 @@ on:
         required: true
 jobs:
   configure-build-push:
-    runs-on: ubuntu-latest
+    runs-on: mosaic-4wide
     steps:
     - name: Maximize Build Space on Worker
       uses: easimon/maximize-build-space@v4

diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml
@@ -13,10 +13,6 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cpu-3.10-2.1
-          container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-          markers: not daily and not remote and not gpu and not doctest
-          pytest_command: coverage run -m pytest
         - name: cpu-3.11-2.2
           container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
           markers: not daily and not remote and not gpu and not doctest

diff --git a/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py b/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py
@@ -168,7 +168,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:  # type: ignore
 
         nchunks: int = int(math.ceil(batch_size / self.ghost_batch_size))
         has_momentum: bool = hasattr(self.batchnorm, 'momentum')
-        original_momentum: float = self.batchnorm.momentum
+        original_momentum: Optional[float] = self.batchnorm.momentum
 
         if self.training and has_momentum:
             # applying the same batchnorm multiple times greatly increases
@@ -180,6 +180,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:  # type: ignore
         normalized_chunks = [self.batchnorm(chunk) for chunk in input.chunk(nchunks, 0)]
 
         if self.training and has_momentum:
+            assert original_momentum is not None
             self._unscale_momentum(original_momentum)
 
         return torch.cat(normalized_chunks, dim=0)
@@ -192,6 +193,7 @@ def from_batchnorm(module: torch.nn.Module, ghost_batch_size: int) -> '_GhostBat
 
     @torch.jit.unused
     def _scale_momentum(self, nchunks: int):
+        assert self.batchnorm.momentum is not None
         self.batchnorm.momentum = float(self.batchnorm.momentum) / nchunks
 
     @torch.jit.unused

diff --git a/composer/algorithms/swa/swa.py b/composer/algorithms/swa/swa.py
@@ -228,7 +228,7 @@ def _initialize_swa(self, state: State) -> None:
                 state.optimizers[0],
                 swa_lr=self.swa_lr,
                 anneal_epochs=self.anneal_steps,
-                anneal_strategy=self.anneal_strategy,
+                anneal_strategy=self.anneal_strategy,  # type: ignore
             )
 
         self.swa_model = AveragedModel(state.model, device=torch.device('cpu'))

diff --git a/composer/callbacks/image_visualizer.py b/composer/callbacks/image_visualizer.py
@@ -164,18 +164,18 @@ def _make_segmentation_images(
     # Ensure the targets are in the expected format
     if infer_target_type(outputs, targets) == 'one_hot':
         if channels_last:
-            targets = targets.argmax(dim=-1).data.cpu().numpy()
+            targets = targets.argmax(dim=-1).data.cpu().numpy()  # type: ignore
         else:
-            targets = targets.argmax(dim=1).data.cpu().numpy()
+            targets = targets.argmax(dim=1).data.cpu().numpy()  # type: ignore
     else:
-        targets = targets.data.cpu().numpy()
+        targets = targets.data.cpu().numpy()  # type: ignore
     # Convert the outputs to the expected format
     if channels_last:
         num_classes = outputs.shape[-1]
-        outputs = outputs.argmax(dim=-1).cpu().numpy()
+        outputs = outputs.argmax(dim=-1).cpu().numpy()  # type: ignore
     else:
         num_classes = outputs.shape[1]
-        outputs = outputs.argmax(dim=1).cpu().numpy()
+        outputs = outputs.argmax(dim=1).cpu().numpy()  # type: ignore
     # Adjust targets such that negative values are mapped to one higher than the maximum class
     targets[targets < 0] = num_classes
 

diff --git a/composer/callbacks/memory_snapshot.py b/composer/callbacks/memory_snapshot.py
@@ -9,7 +9,6 @@
 from typing import Optional, Union
 
 import torch.cuda
-from packaging import version
 
 from composer import State
 from composer.core import Callback, State, Time, TimeUnit
@@ -94,13 +93,7 @@ def __init__(
             _, _, self.remote_path_in_bucket = parse_uri(remote_file_name)
         else:
             self.remote_path_in_bucket = None
-
-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'):  # type: ignore
-            # MemorySnapshot is only supported in torch v2.1.0-rc1 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('Memory snapshot is supported after PyTorch 2.1.0. Skipping memory snapshot callback.')
+        self._enabled = True
 
     def init(self, state: State, logger: Logger) -> None:
         if not self._enabled:

diff --git a/composer/callbacks/oom_observer.py b/composer/callbacks/oom_observer.py
@@ -14,7 +14,6 @@
 from typing import Optional
 
 import torch.cuda
-from packaging import version
 
 from composer.core import Callback, State
 from composer.loggers import Logger
@@ -113,13 +112,7 @@ def __init__(
         else:
             self.remote_path_in_bucket = None
 
-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'):  # type: ignore
-            # OOMObserver is only supported in torch v2.1.0 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('OOMObserver is supported after PyTorch 2.1.0. Disabling OOMObserver callback.')
-
+        self._enabled = True
         self.filename_config: Optional[SnapshotFileNameConfig] = None
 
     def init(self, state: State, logger: Logger) -> None:

diff --git a/composer/core/state.py b/composer/core/state.py
@@ -979,7 +979,9 @@ def get_model_state_dict(self) -> dict[str, Any]:
         Returns:
             dict[str, Any]: The state dict for the model.
         """
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+            version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+        ):
             from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict
             if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
                 raise NotImplementedError(
@@ -1017,7 +1019,9 @@ def get_optim_state_dict(self) -> dict[str, Any]:
         Returns:
             dict[str, Any]: The state dict for the optimizer.
         """
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+            version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+        ):
             from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict
             if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
                 raise NotImplementedError(
@@ -1327,7 +1331,9 @@ def load_model_state(
         model_on_rank = state_dict['model'] is not None
 
         if model_on_rank:
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+                version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+            ):
                 from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict
                 try:
                     set_model_state_dict(
@@ -1430,14 +1436,17 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True):
                 continue
 
             optim_state_dict = serialized_value[type(optimizer).__qualname__] if serialized_value is not None else None
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+                version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+            ):
                 from torch.distributed.checkpoint.state_dict import StateDictOptions, set_optimizer_state_dict
 
                 # optim_state_dict is `None` on non-zero ranks when loading FSDP monolith
                 # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing
                 # errors) before discarding the output. Accordingly, we mock the state dict.
                 # See: https://github.com/pytorch/pytorch/issues/125177
-                optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
+                if version.parse(torch.__version__) < version.parse('2.4.0'):
+                    optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
                 set_optimizer_state_dict(
                     model=self.model,
                     optimizers=optimizer,