Bump to Pytorch 2.4 #3542

Merged · 25 commits · Aug 12, 2024
28 changes: 0 additions & 28 deletions .github/workflows/daily.yaml
@@ -17,11 +17,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: cpu-3.10-2.1
-            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-            markers: not daily and (remote or not remote) and not gpu and not doctest
-            pytest_command: coverage run -m pytest
-            composer_package_name: mosaicml
           - name: cpu-3.11-2.2
             container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
             markers: not daily and (remote or not remote) and not gpu and not doctest
@@ -42,11 +37,6 @@ jobs:
             markers: not daily and (remote or not remote) and not gpu and doctest
             pytest_command: coverage run -m pytest tests/test_docs.py
             composer_package_name: mosaicml
-          - name: daily-cpu-3.10-2.1
-            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-            markers: daily and (remote or not remote) and not gpu and not doctest
-            pytest_command: coverage run -m pytest
-            composer_package_name: mosaicml
           - name: daily-cpu-3.11-2.2
             container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
             markers: daily and (remote or not remote) and not gpu and not doctest
@@ -102,12 +92,6 @@ jobs:
         # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
         # on MCLOUD and not eat up all GPUs at once
         include:
-          - name: "gpu-3.10-2.1-1-gpu"
-            container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-            pytest_command: "coverage run -m pytest"
-            composer_package_name: "mosaicml"
-            gpu_num: 1
           - name: "gpu-3.11-2.2-1-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -120,12 +104,6 @@
             pytest_command: "coverage run -m pytest"
             composer_package_name: "mosaicml"
             gpu_num: 1
-          - name: "gpu-3.10-2.1-2-gpu"
-            container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-            pytest_command: "coverage run -m pytest"
-            composer_package_name: "mosaicml"
-            gpu_num: 2
           - name: "gpu-3.11-2.2-2-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
@@ -138,12 +116,6 @@
             pytest_command: "coverage run -m pytest"
             composer_package_name: "mosaicml"
             gpu_num: 2
-          - name: "gpu-3.10-2.1-4-gpu"
-            container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
-            markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
-            pytest_command: "coverage run -m pytest"
-            composer_package_name: "mosaicml"
-            gpu_num: 4
           - name: "gpu-3.11-2.2-4-gpu"
             container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
             markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
2 changes: 1 addition & 1 deletion .github/workflows/docker-configure-build-push.yaml
@@ -36,7 +36,7 @@ on:
         required: true
 jobs:
   configure-build-push:
-    runs-on: ubuntu-latest
+    runs-on: mosaic-4wide
     steps:
       - name: Maximize Build Space on Worker
         uses: easimon/maximize-build-space@v4
4 changes: 0 additions & 4 deletions .github/workflows/pr-cpu.yaml
@@ -13,10 +13,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: cpu-3.10-2.1
-            container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
-            markers: not daily and not remote and not gpu and not doctest
-            pytest_command: coverage run -m pytest
           - name: cpu-3.11-2.2
             container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
             markers: not daily and not remote and not gpu and not doctest
4 changes: 3 additions & 1 deletion composer/algorithms/ghost_batchnorm/ghost_batchnorm.py
@@ -168,7 +168,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:  # type: ignore

         nchunks: int = int(math.ceil(batch_size / self.ghost_batch_size))
         has_momentum: bool = hasattr(self.batchnorm, 'momentum')
-        original_momentum: float = self.batchnorm.momentum
+        original_momentum: Optional[float] = self.batchnorm.momentum

         if self.training and has_momentum:
             # applying the same batchnorm multiple times greatly increases
@@ -180,6 +180,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:  # type: ignore
         normalized_chunks = [self.batchnorm(chunk) for chunk in input.chunk(nchunks, 0)]

         if self.training and has_momentum:
+            assert original_momentum is not None
             self._unscale_momentum(original_momentum)

         return torch.cat(normalized_chunks, dim=0)
@@ -192,6 +193,7 @@ def from_batchnorm(module: torch.nn.Module, ghost_batch_size: int) -> '_GhostBatchNorm':

     @torch.jit.unused
     def _scale_momentum(self, nchunks: int):
+        assert self.batchnorm.momentum is not None
         self.batchnorm.momentum = float(self.batchnorm.momentum) / nchunks

     @torch.jit.unused
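For context on the two new asserts: `BatchNorm.momentum` is typed `Optional[float]` (a value of `None` selects cumulative averaging), so it has to be narrowed to `float` before dividing by the chunk count. A minimal standalone sketch of the scale/unscale pattern, not Composer's implementation:

```python
import math

import torch

bn = torch.nn.BatchNorm2d(8)               # momentum defaults to 0.1
x = torch.randn(32, 8, 4, 4)
ghost_batch_size = 8
nchunks = int(math.ceil(x.shape[0] / ghost_batch_size))

original_momentum = bn.momentum            # Optional[float]
if bn.training and original_momentum is not None:
    # Each chunk updates the running stats once, so divide the momentum by
    # the number of chunks to keep the effective update rate unchanged.
    bn.momentum = float(original_momentum) / nchunks

out = torch.cat([bn(chunk) for chunk in x.chunk(nchunks, 0)], dim=0)

if bn.training and original_momentum is not None:
    bn.momentum = original_momentum        # restore after the ghost batches
```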
2 changes: 1 addition & 1 deletion composer/algorithms/swa/swa.py
@@ -228,7 +228,7 @@ def _initialize_swa(self, state: State) -> None:
             state.optimizers[0],
             swa_lr=self.swa_lr,
             anneal_epochs=self.anneal_steps,
-            anneal_strategy=self.anneal_strategy,
+            anneal_strategy=self.anneal_strategy,  # type: ignore
         )

         self.swa_model = AveragedModel(state.model, device=torch.device('cpu'))
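The `# type: ignore` is presumably needed because newer torch stubs annotate `SWALR`'s `anneal_strategy` as a `'cos' | 'linear'` literal, while Composer passes it as a plain `str`. A hedged sketch of the `torch.optim.swa_utils` API being wrapped here, with illustrative values rather than Composer's defaults:

```python
import torch
from torch.optim.swa_utils import SWALR, AveragedModel

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

swa_scheduler = SWALR(
    optimizer,
    swa_lr=0.05,
    anneal_epochs=10,
    anneal_strategy='cos',  # stubs may expect a literal, hence the ignore above
)
swa_model = AveragedModel(model, device=torch.device('cpu'))

for _ in range(3):  # toy loop: average weights after each step
    optimizer.zero_grad()
    model(torch.randn(8, 4)).sum().backward()
    optimizer.step()
    swa_model.update_parameters(model)
    swa_scheduler.step()
```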
10 changes: 5 additions & 5 deletions composer/callbacks/image_visualizer.py
@@ -164,18 +164,18 @@ def _make_segmentation_images(
     # Ensure the targets are in the expected format
     if infer_target_type(outputs, targets) == 'one_hot':
         if channels_last:
-            targets = targets.argmax(dim=-1).data.cpu().numpy()
+            targets = targets.argmax(dim=-1).data.cpu().numpy()  # type: ignore
         else:
-            targets = targets.argmax(dim=1).data.cpu().numpy()
+            targets = targets.argmax(dim=1).data.cpu().numpy()  # type: ignore
     else:
-        targets = targets.data.cpu().numpy()
+        targets = targets.data.cpu().numpy()  # type: ignore
     # Convert the outputs to the expected format
     if channels_last:
         num_classes = outputs.shape[-1]
-        outputs = outputs.argmax(dim=-1).cpu().numpy()
+        outputs = outputs.argmax(dim=-1).cpu().numpy()  # type: ignore
     else:
         num_classes = outputs.shape[1]
-        outputs = outputs.argmax(dim=1).cpu().numpy()
+        outputs = outputs.argmax(dim=1).cpu().numpy()  # type: ignore
     # Adjust targets such that negative values are mapped to one higher than the maximum class
     targets[targets < 0] = num_classes

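These `# type: ignore` comments cover the rebinding of `targets`/`outputs` from `Tensor` to `numpy.ndarray` in place. A small illustrative sketch of the one-hot-to-class-index conversion (shapes are invented for the example):

```python
import torch

# Fake one-hot segmentation targets in channels-last layout: (N, H, W, num_classes)
targets = torch.nn.functional.one_hot(torch.randint(0, 3, (2, 4, 4)), num_classes=3)

# argmax over the class dim recovers integer class ids; .cpu().numpy() then
# rebinds the name to an ndarray, which is what the type checker objects to.
targets = targets.argmax(dim=-1).cpu().numpy()
print(targets.shape, targets.dtype)  # (2, 4, 4) int64
```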
9 changes: 1 addition & 8 deletions composer/callbacks/memory_snapshot.py
@@ -9,7 +9,6 @@
 from typing import Optional, Union

 import torch.cuda
-from packaging import version

 from composer import State
 from composer.core import Callback, State, Time, TimeUnit
@@ -94,13 +93,7 @@ def __init__(
             _, _, self.remote_path_in_bucket = parse_uri(remote_file_name)
         else:
             self.remote_path_in_bucket = None
-
-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'):  # type: ignore
-            # MemorySnapshot is only supported in torch v2.1.0-rc1 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('Memory snapshot is supported after PyTorch 2.1.0. Skipping memory snapshot callback.')
+        self._enabled = True

     def init(self, state: State, logger: Logger) -> None:
         if not self._enabled:
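With the minimum supported PyTorch now well past 2.1, the callback is enabled unconditionally instead of being version-gated. For reference, a hedged sketch of the underlying (private, subject-to-change) torch snapshot API the callback builds on; this is not Composer code and requires a CUDA device:

```python
import torch

if torch.cuda.is_available():
    # Start recording allocator events (private API, available since torch 2.1).
    torch.cuda.memory._record_memory_history(max_entries=100_000)

    x = torch.randn(1024, 1024, device='cuda')  # a few allocations to capture
    _ = x @ x

    # Dump a pickled snapshot viewable at https://pytorch.org/memory_viz
    torch.cuda.memory._dump_snapshot('memory_snapshot.pickle')

    # Stop recording
    torch.cuda.memory._record_memory_history(enabled=None)
```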
9 changes: 1 addition & 8 deletions composer/callbacks/oom_observer.py
@@ -14,7 +14,6 @@
 from typing import Optional

 import torch.cuda
-from packaging import version

 from composer.core import Callback, State
 from composer.loggers import Logger
@@ -113,13 +112,7 @@ def __init__(
         else:
             self.remote_path_in_bucket = None

-        if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'):  # type: ignore
-            # OOMObserver is only supported in torch v2.1.0 or higher
-            self._enabled = True
-        else:
-            self._enabled = False
-            warnings.warn('OOMObserver is supported after PyTorch 2.1.0. Disabling OOMObserver callback.')
-
+        self._enabled = True
         self.filename_config: Optional[SnapshotFileNameConfig] = None

     def init(self, state: State, logger: Logger) -> None:
19 changes: 14 additions & 5 deletions composer/core/state.py
@@ -979,7 +979,9 @@ def get_model_state_dict(self) -> dict[str, Any]:
         Returns:
             dict[str, Any]: The state dict for the model.
         """
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+            version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+        ):
             from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict
             if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
                 raise NotImplementedError(
@@ -1017,7 +1019,9 @@ def get_optim_state_dict(self) -> dict[str, Any]:
         Returns:
             dict[str, Any]: The state dict for the optimizer.
         """
-        if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+        if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+            version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+        ):
             from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict
             if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
                 raise NotImplementedError(
@@ -1327,7 +1331,9 @@ def load_model_state(
         model_on_rank = state_dict['model'] is not None

         if model_on_rank:
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+                version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+            ):
                 from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict
                 try:
                     set_model_state_dict(
@@ -1430,14 +1436,17 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True):
                 continue

             optim_state_dict = serialized_value[type(optimizer).__qualname__] if serialized_value is not None else None
-            if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized():
+            if version.parse(torch.__version__) >= version.parse('2.4.0') or (
+                version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized()
+            ):
                 from torch.distributed.checkpoint.state_dict import StateDictOptions, set_optimizer_state_dict

                 # optim_state_dict is `None` on non-zero ranks when loading FSDP monolith
                 # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing
                 # errors) before discarding the output. Accordingly, we mock the state dict.
                 # See: https://github.com/pytorch/pytorch/issues/125177
-                optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
+                if version.parse(torch.__version__) < version.parse('2.4.0'):
+                    optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict
                 set_optimizer_state_dict(
                     model=self.model,
                     optimizers=optimizer,
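The widened check reflects that, from torch 2.4, the `torch.distributed.checkpoint.state_dict` helpers no longer require an initialized process group, and the `MagicMock` workaround for absent optimizer state is only needed below 2.4. A hedged sketch of those helpers on a toy, non-distributed model (assumes torch >= 2.4 when run without initializing distributed):

```python
import torch
from torch.distributed.checkpoint.state_dict import (
    StateDictOptions,
    get_model_state_dict,
    get_optimizer_state_dict,
)

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model(torch.randn(8, 4)).sum().backward()
optimizer.step()  # populate optimizer state

# full_state_dict=True gathers a full (rank-0-style) state dict; cpu_offload
# moves tensors to CPU (a no-op here, but relevant under FSDP).
options = StateDictOptions(full_state_dict=True, cpu_offload=True)

model_sd = get_model_state_dict(model, options=options)
optim_sd = get_optimizer_state_dict(model, optimizer, options=options)
print(sorted(model_sd))  # ['bias', 'weight']
```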