Skip to content

Commit

Permalink
Bump to PyTorch 2.4 (#3542)
Browse files Browse the repository at this point in the history
Co-authored-by: Chuck Tang <chuck.tang@databricks.com>
  • Loading branch information
mvpatel2000 and Chuck Tang authored Aug 12, 2024
1 parent 1320825 commit a10a798
Show file tree
Hide file tree
Showing 24 changed files with 276 additions and 639 deletions.
28 changes: 0 additions & 28 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,6 @@ jobs:
strategy:
matrix:
include:
- name: cpu-3.10-2.1
container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: cpu-3.11-2.2
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
Expand All @@ -42,11 +37,6 @@ jobs:
markers: not daily and (remote or not remote) and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
composer_package_name: mosaicml
- name: daily-cpu-3.10-2.1
container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: daily-cpu-3.11-2.2
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
Expand Down Expand Up @@ -102,12 +92,6 @@ jobs:
# Unlike the CPU tests, the daily tests run together with the GPU tests to minimize launch time
# on MCLOUD and to avoid eating up all GPUs at once
include:
- name: "gpu-3.10-2.1-1-gpu"
container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 1
- name: "gpu-3.11-2.2-1-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand All @@ -120,12 +104,6 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 1
- name: "gpu-3.10-2.1-2-gpu"
container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 2
- name: "gpu-3.11-2.2-2-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand All @@ -138,12 +116,6 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 2
- name: "gpu-3.10-2.1-4-gpu"
container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 4
- name: "gpu-3.11-2.2-4-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/docker-configure-build-push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ on:
required: true
jobs:
configure-build-push:
runs-on: ubuntu-latest
runs-on: mosaic-4wide
steps:
- name: Maximize Build Space on Worker
uses: easimon/maximize-build-space@v4
Expand Down
4 changes: 0 additions & 4 deletions .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@ jobs:
strategy:
matrix:
include:
- name: cpu-3.10-2.1
container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
pytest_command: coverage run -m pytest
- name: cpu-3.11-2.2
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
Expand Down
4 changes: 3 additions & 1 deletion composer/algorithms/ghost_batchnorm/ghost_batchnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore

nchunks: int = int(math.ceil(batch_size / self.ghost_batch_size))
has_momentum: bool = hasattr(self.batchnorm, 'momentum')
original_momentum: float = self.batchnorm.momentum
original_momentum: Optional[float] = self.batchnorm.momentum

if self.training and has_momentum:
# applying the same batchnorm multiple times greatly increases
Expand All @@ -180,6 +180,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore
normalized_chunks = [self.batchnorm(chunk) for chunk in input.chunk(nchunks, 0)]

if self.training and has_momentum:
assert original_momentum is not None
self._unscale_momentum(original_momentum)

return torch.cat(normalized_chunks, dim=0)
Expand All @@ -192,6 +193,7 @@ def from_batchnorm(module: torch.nn.Module, ghost_batch_size: int) -> '_GhostBat

@torch.jit.unused
def _scale_momentum(self, nchunks: int):
assert self.batchnorm.momentum is not None
self.batchnorm.momentum = float(self.batchnorm.momentum) / nchunks

@torch.jit.unused
Expand Down
2 changes: 1 addition & 1 deletion composer/algorithms/swa/swa.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def _initialize_swa(self, state: State) -> None:
state.optimizers[0],
swa_lr=self.swa_lr,
anneal_epochs=self.anneal_steps,
anneal_strategy=self.anneal_strategy,
anneal_strategy=self.anneal_strategy, # type: ignore
)

self.swa_model = AveragedModel(state.model, device=torch.device('cpu'))
Expand Down
10 changes: 5 additions & 5 deletions composer/callbacks/image_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,18 +164,18 @@ def _make_segmentation_images(
# Ensure the targets are in the expected format
if infer_target_type(outputs, targets) == 'one_hot':
if channels_last:
targets = targets.argmax(dim=-1).data.cpu().numpy()
targets = targets.argmax(dim=-1).data.cpu().numpy() # type: ignore
else:
targets = targets.argmax(dim=1).data.cpu().numpy()
targets = targets.argmax(dim=1).data.cpu().numpy() # type: ignore
else:
targets = targets.data.cpu().numpy()
targets = targets.data.cpu().numpy() # type: ignore
# Convert the outputs to the expected format
if channels_last:
num_classes = outputs.shape[-1]
outputs = outputs.argmax(dim=-1).cpu().numpy()
outputs = outputs.argmax(dim=-1).cpu().numpy() # type: ignore
else:
num_classes = outputs.shape[1]
outputs = outputs.argmax(dim=1).cpu().numpy()
outputs = outputs.argmax(dim=1).cpu().numpy() # type: ignore
# Adjust targets such that negative values are mapped to one higher than the maximum class
targets[targets < 0] = num_classes

Expand Down
9 changes: 1 addition & 8 deletions composer/callbacks/memory_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from typing import Optional, Union

import torch.cuda
from packaging import version

from composer import State
from composer.core import Callback, State, Time, TimeUnit
Expand Down Expand Up @@ -94,13 +93,7 @@ def __init__(
_, _, self.remote_path_in_bucket = parse_uri(remote_file_name)
else:
self.remote_path_in_bucket = None

if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore
# MemorySnapshot is only supported in torch v2.1.0-rc1 or higher
self._enabled = True
else:
self._enabled = False
warnings.warn('Memory snapshot is supported after PyTorch 2.1.0. Skipping memory snapshot callback.')
self._enabled = True

def init(self, state: State, logger: Logger) -> None:
if not self._enabled:
Expand Down
9 changes: 1 addition & 8 deletions composer/callbacks/oom_observer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from typing import Optional

import torch.cuda
from packaging import version

from composer.core import Callback, State
from composer.loggers import Logger
Expand Down Expand Up @@ -113,13 +112,7 @@ def __init__(
else:
self.remote_path_in_bucket = None

if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore
# OOMObserver is only supported in torch v2.1.0 or higher
self._enabled = True
else:
self._enabled = False
warnings.warn('OOMObserver is supported after PyTorch 2.1.0. Disabling OOMObserver callback.')

self._enabled = True
self.filename_config: Optional[SnapshotFileNameConfig] = None

def init(self, state: State, logger: Logger) -> None:
Expand Down
19 changes: 14 additions & 5 deletions composer/core/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -979,7 +979,9 @@ def get_model_state_dict(self) -> dict[str, Any]:
Returns:
dict[str, Any]: The state dict for the model.
"""
if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
if version.parse(torch.__version__) >= version.parse('2.4.0') or (
version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
):
from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict
if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
raise NotImplementedError(
Expand Down Expand Up @@ -1017,7 +1019,9 @@ def get_optim_state_dict(self) -> dict[str, Any]:
Returns:
dict[str, Any]: The state dict for the optimizer.
"""
if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
if version.parse(torch.__version__) >= version.parse('2.4.0') or (
version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
):
from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict
if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
raise NotImplementedError(
Expand Down Expand Up @@ -1327,7 +1331,9 @@ def load_model_state(
model_on_rank = state_dict['model'] is not None

if model_on_rank:
if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
if version.parse(torch.__version__) >= version.parse('2.4.0') or (
version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
):
from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict
try:
set_model_state_dict(
Expand Down Expand Up @@ -1430,14 +1436,17 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True):
continue

optim_state_dict = serialized_value[type(optimizer).__qualname__] if serialized_value is not None else None
if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
if version.parse(torch.__version__) >= version.parse('2.4.0') or (
version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
):
from torch.distributed.checkpoint.state_dict import StateDictOptions, set_optimizer_state_dict

# optim_state_dict is `None` on non-zero ranks when loading an FSDP monolith
# checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing
# errors) before discarding the output. Accordingly, we mock the state dict.
# See: https://github.com/pytorch/pytorch/issues/125177
optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
if version.parse(torch.__version__) < version.parse('2.4.0'):
optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
set_optimizer_state_dict(
model=self.model,
optimizers=optimizer,
Expand Down
Loading

0 comments on commit a10a798

Please sign in to comment.