From dc793308040e47b030d5fcb9bd17e9bbb7e42f3a Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Wed, 7 Aug 2024 15:41:23 -0700
Subject: [PATCH 01/23] commit change

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index eccba856fd..c16807aa9d 100644
--- a/setup.py
+++ b/setup.py
@@ -80,8 +80,8 @@ def package_files(prefix: str, directory: str, extension: str):
     'tqdm>=4.62.3,<5',
     'torchmetrics>=1.4.0.post0,<1.4.1',
     'torch_optimizer>=0.3.0,<0.4',
-    'torchvision>=0.13.1,<0.18.2',
-    'torch>=2.1.2,<2.3.2',
+    'torchvision>=0.14.0,<0.19.1',
+    'torch>=2.2.0,<2.4.1',
     'requests>=2.26.0,<3',
     'numpy>=1.21.5,<2.1.0',
     'psutil>=5.8.0,<7',

From 32e1eed98b4f2593dddaff8245ac3fd70b128ea1 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Thu, 8 Aug 2024 17:22:35 -0700
Subject: [PATCH 02/23] commit change

---
 docker/generate_build_matrix.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index 74d9c7fed4..d8a665322a 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -19,16 +19,16 @@
 import yaml
 
 PRODUCTION_PYTHON_VERSION = '3.11'
-PRODUCTION_PYTORCH_VERSION = '2.3.1'
+PRODUCTION_PYTORCH_VERSION = '2.4.0'
 
 
 def _get_torchvision_version(pytorch_version: str):
+    if pytorch_version == '2.4.0':
+        return '0.19.0'
     if pytorch_version == '2.3.1':
         return '0.18.1'
     if pytorch_version == '2.2.2':
         return '0.17.2'
-    if pytorch_version == '2.1.2':
-        return '0.16.2'
     raise ValueError(f'Invalid pytorch_version: {pytorch_version}')
 
 
@@ -42,12 +42,12 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     # From https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/
     if not use_cuda:
         return ''
+    if pytorch_version == '2.4.0':
+        return '12.1.4'
     if pytorch_version == '2.3.1':
         return '12.1.1'
     if pytorch_version == '2.2.2':
         return '12.1.1'
-    if pytorch_version == '2.1.2':
-        return '12.1.1'
     raise ValueError(f'Invalid pytorch_version: {pytorch_version}')
 
 
@@ -167,7 +167,7 @@ def _write_table(table_tag: str, table_contents: str):
 
 
 def _main():
-    python_pytorch_versions = [('3.11', '2.3.1'), ('3.11', '2.2.2'), ('3.10', '2.1.2')]
+    python_pytorch_versions = [('3.11', '2.4.0'), ('3.11', '2.3.1'), ('3.11', '2.2.2')]
     cuda_options = [True, False]
     stages = ['pytorch_stage']
     interconnects = ['mellanox', 'EFA']  # mellanox is default, EFA needed for AWS

From 092347499fe4488e5c3c585cee500fdd4ba085b1 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Thu, 8 Aug 2024 17:26:23 -0700
Subject: [PATCH 03/23] commit change

---
 docker/README.md         |  12 ++--
 docker/build_matrix.yaml | 143 ++++++++++++++------------------------
 2 files changed, 58 insertions(+), 97 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index a0514ecb3d..1b68cdec39 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -30,15 +30,15 @@ To install composer, once inside the image, run `pip install mosaicml`.
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                              |
 |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`          |
-| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`  |
-| Ubuntu 20.04   | Base     | 2.3.1             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04`        |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.1.4 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04`          |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.1.4 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04-aws`  |
+| Ubuntu 20.04   | Base     | 2.4.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04`        |
+| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                     |
+| Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                 |
+| Ubuntu 20.04   | Base     | 2.3.1             | cpu                 | 3.11             | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04`                                       |
 | Ubuntu 20.04   | Base     | 2.2.2             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04`                                     |
 | Ubuntu 20.04   | Base     | 2.2.2             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws`                                 |
 | Ubuntu 20.04   | Base     | 2.2.2             | cpu                 | 3.11             | `mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04`                                       |
-| Ubuntu 20.04   | Base     | 2.1.2             | 12.1.1 (Infiniband) | 3.10             | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04`                                     |
-| Ubuntu 20.04   | Base     | 2.1.2             | 12.1.1 (EFA)        | 3.10             | `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws`                                 |
-| Ubuntu 20.04   | Base     | 2.1.2             | cpu                 | 3.10             | `mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04`                                       |
 
 **Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws`
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index ee74d12309..6f4f14ad76 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -1,79 +1,53 @@
 # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT!
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-3-1-cu121
+  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
+  CUDA_VERSION: 12.1.4
+  IMAGE_NAME: torch-2-4-0-cu121
   MOFED_VERSION: latest-23.10
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-3-1-cu121-aws
+  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
+  CUDA_VERSION: 12.1.4
+  IMAGE_NAME: torch-2-4-0-cu121-aws
   MOFED_VERSION: ''
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-3-1-cpu
+  IMAGE_NAME: torch-2-4-0-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest_cpu
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-2-2-cu121
+  IMAGE_NAME: torch-2-3-1-cu121
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -92,15 +66,15 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-2-2-cu121-aws
+  IMAGE_NAME: torch-2-3-1-cu121-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -119,29 +93,29 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-2-2-cpu
+  IMAGE_NAME: torch-2-3-1-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.2.2
+  PYTORCH_VERSION: 2.3.1
   TAGS:
-  - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.17.2
+  TORCHVISION_VERSION: 0.18.1
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-1-2-cu121
+  IMAGE_NAME: torch-2-2-2-cu121
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -157,18 +131,18 @@
     brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
     brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
     brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
-  PYTHON_VERSION: '3.10'
+  PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.1.2
+  PYTORCH_VERSION: 2.2.2
   TAGS:
-  - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04
+  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.16.2
+  TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
   BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
   CUDA_VERSION: 12.1.1
-  IMAGE_NAME: torch-2-1-2-cu121-aws
+  IMAGE_NAME: torch-2-2-2-cu121-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
     brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
@@ -184,57 +158,44 @@
     brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
     brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
     brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
-  PYTHON_VERSION: '3.10'
+  PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.1.2
+  PYTORCH_VERSION: 2.2.2
   TAGS:
-  - mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04-aws
+  - mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.16.2
+  TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   CUDA_VERSION: ''
-  IMAGE_NAME: torch-2-1-2-cpu
+  IMAGE_NAME: torch-2-2-2-cpu
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
-  PYTHON_VERSION: '3.10'
+  PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.1.2
+  PYTORCH_VERSION: 2.2.2
   TAGS:
-  - mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04
+  - mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04
   TARGET: pytorch_stage
-  TORCHVISION_VERSION: 0.16.2
+  TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04
+  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
-  CUDA_VERSION: 12.1.1
+  CUDA_VERSION: 12.1.4
   IMAGE_NAME: composer-0-23-5
   MOFED_VERSION: latest-23.10
-  NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471
-    brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471
-    brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471
-    brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471
-    brand=tesla,driver>=510,driver<511 brand=unknown,driver>=510,driver<511 brand=nvidia,driver>=510,driver<511
-    brand=nvidiartx,driver>=510,driver<511 brand=geforce,driver>=510,driver<511 brand=geforcertx,driver>=510,driver<511
-    brand=quadro,driver>=510,driver<511 brand=quadrortx,driver>=510,driver<511 brand=titan,driver>=510,driver<511
-    brand=titanrtx,driver>=510,driver<511 brand=tesla,driver>=515,driver<516 brand=unknown,driver>=515,driver<516
-    brand=nvidia,driver>=515,driver<516 brand=nvidiartx,driver>=515,driver<516 brand=geforce,driver>=515,driver<516
-    brand=geforcertx,driver>=515,driver<516 brand=quadro,driver>=515,driver<516 brand=quadrortx,driver>=515,driver<516
-    brand=titan,driver>=515,driver<516 brand=titanrtx,driver>=515,driver<516 brand=tesla,driver>=525,driver<526
-    brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526
-    brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526
-    brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
+  NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
   - mosaicml/composer:0.23.5
   - mosaicml/composer:latest
   TARGET: composer_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: ''
   BASE_IMAGE: ubuntu:20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
@@ -245,9 +206,9 @@
   PYTHON_VERSION: '3.11'
   PYTORCH_NIGHTLY_URL: ''
   PYTORCH_NIGHTLY_VERSION: ''
-  PYTORCH_VERSION: 2.3.1
+  PYTORCH_VERSION: 2.4.0
   TAGS:
   - mosaicml/composer:0.23.5_cpu
   - mosaicml/composer:latest_cpu
   TARGET: composer_stage
-  TORCHVISION_VERSION: 0.18.1
+  TORCHVISION_VERSION: 0.19.0

From eb498932f6a1edb5175bb7e730c0557fb9f5a0bc Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Thu, 8 Aug 2024 17:27:10 -0700
Subject: [PATCH 04/23] commit change

---
 docker/generate_build_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index d8a665322a..cb041d39d6 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -43,7 +43,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     if not use_cuda:
         return ''
     if pytorch_version == '2.4.0':
-        return '12.1.4'
+        return '12.4.0'
     if pytorch_version == '2.3.1':
         return '12.1.1'
     if pytorch_version == '2.2.2':

From e0b3c63f03c44fc9e9aed1397ce43c1fc0ab54a8 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Thu, 8 Aug 2024 17:27:14 -0700
Subject: [PATCH 05/23] commit change

---
 docker/README.md         |  4 ++--
 docker/build_matrix.yaml | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index 1b68cdec39..bd4284b4de 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -30,8 +30,8 @@ To install composer, once inside the image, run `pip install mosaicml`.
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                              |
 |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.1.4 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04`          |
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.1.4 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04-aws`  |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.0 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04`          |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.0 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws`  |
 | Ubuntu 20.04   | Base     | 2.4.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04`        |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                     |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                 |
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index 6f4f14ad76..f23ae70bb4 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -1,8 +1,8 @@
 # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT!
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.4
-  IMAGE_NAME: torch-2-4-0-cu121
+  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.0
+  IMAGE_NAME: torch-2-4-0-cu124
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -10,14 +10,14 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
-  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.1.4
-  IMAGE_NAME: torch-2-4-0-cu121-aws
+  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.0
+  IMAGE_NAME: torch-2-4-0-cu124-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -25,7 +25,7 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu121-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
@@ -181,9 +181,9 @@
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.1.4-cudnn8-devel-ubuntu20.04
+  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
-  CUDA_VERSION: 12.1.4
+  CUDA_VERSION: 12.4.0
   IMAGE_NAME: composer-0-23-5
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''

From 1ce6cffcfb0e797168e4cdf3e67eb14ad7e154e9 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Fri, 9 Aug 2024 09:52:07 -0700
Subject: [PATCH 06/23] commit change

---
 docker/README.md                |  4 ++--
 docker/build_matrix.yaml        | 20 ++++++++++----------
 docker/generate_build_matrix.py |  4 +++-
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index bd4284b4de..0a6bf5aac0 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -30,8 +30,8 @@ To install composer, once inside the image, run `pip install mosaicml`.
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                              |
 |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.0 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04`          |
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.0 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws`  |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.5.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04`          |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.5.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04-aws`  |
 | Ubuntu 20.04   | Base     | 2.4.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04`        |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                     |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                 |
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index f23ae70bb4..f4c09ceb1a 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -1,8 +1,8 @@
 # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT!
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.4.0
-  IMAGE_NAME: torch-2-4-0-cu124
+  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.5.1
+  IMAGE_NAME: torch-2-4-0-cu125
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -10,14 +10,14 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
-  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
-  CUDA_VERSION: 12.4.0
-  IMAGE_NAME: torch-2-4-0-cu124-aws
+  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.5.1
+  IMAGE_NAME: torch-2-4-0-cu125-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -25,7 +25,7 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
@@ -181,9 +181,9 @@
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.4.0-cudnn8-devel-ubuntu20.04
+  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
-  CUDA_VERSION: 12.4.0
+  CUDA_VERSION: 12.5.1
   IMAGE_NAME: composer-0-23-5
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index cb041d39d6..ef587e6bc2 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -35,6 +35,8 @@ def _get_torchvision_version(pytorch_version: str):
 def _get_base_image(cuda_version: str):
     if not cuda_version:
         return 'ubuntu:20.04'
+    if cuda_version == '12.5.1':
+        return f'nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04'
     return f'nvidia/cuda:{cuda_version}-cudnn8-devel-ubuntu20.04'
 
 
@@ -43,7 +45,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     if not use_cuda:
         return ''
     if pytorch_version == '2.4.0':
-        return '12.4.0'
+        return '12.5.1'
     if pytorch_version == '2.3.1':
         return '12.1.1'
     if pytorch_version == '2.2.2':

From 76d7505e2f8b149c15b1966b9f221bc3b44f7079 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Fri, 9 Aug 2024 10:32:09 -0700
Subject: [PATCH 07/23] commit change

---
 docker/Dockerfile | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 970af2f1ef..9e86c08fd3 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -189,9 +189,15 @@ ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION}
 RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \
         CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \
+        if [ "${PYTORCH_VERSION}" != "2.4.0" ]; then \
         pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \
             torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \
             torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \
+        else \
+        pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/${CUDA_VERSION} \
+            torch \
+            torchvision ; \
+        fi ; \
     else \
         pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \
             torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \

From 5669451fdfb5675ad201d9800ed88e0cb23f8a59 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Fri, 9 Aug 2024 10:51:37 -0700
Subject: [PATCH 08/23] commit change

---
 docker/README.md                |  4 ++--
 docker/build_matrix.yaml        | 20 ++++++++++----------
 docker/generate_build_matrix.py |  6 +++---
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index 0a6bf5aac0..09dd2591f5 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -30,8 +30,8 @@ To install composer, once inside the image, run `pip install mosaicml`.
 | Linux Distro   | Flavor   | PyTorch Version   | CUDA Version        | Python Version   | Docker Tags                                                                              |
 |----------------|----------|-------------------|---------------------|------------------|------------------------------------------------------------------------------------------|
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.5.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04`          |
-| Ubuntu 20.04   | Base     | 2.4.0             | 12.5.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04-aws`  |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.1 (Infiniband) | 3.11             | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04`          |
+| Ubuntu 20.04   | Base     | 2.4.0             | 12.4.1 (EFA)        | 3.11             | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws`  |
 | Ubuntu 20.04   | Base     | 2.4.0             | cpu                 | 3.11             | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04`        |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (Infiniband) | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04`                                     |
 | Ubuntu 20.04   | Base     | 2.3.1             | 12.1.1 (EFA)        | 3.11             | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws`                                 |
diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml
index f4c09ceb1a..2fb084a78b 100644
--- a/docker/build_matrix.yaml
+++ b/docker/build_matrix.yaml
@@ -1,8 +1,8 @@
 # This file is automatically generated by generate_build_matrix.py. DO NOT EDIT!
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
-  CUDA_VERSION: 12.5.1
-  IMAGE_NAME: torch-2-4-0-cu125
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.1
+  IMAGE_NAME: torch-2-4-0-cu124
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -10,14 +10,14 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04
+  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
   - mosaicml/pytorch:latest
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
 - AWS_OFI_NCCL_VERSION: v1.9.1-aws
-  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
-  CUDA_VERSION: 12.5.1
-  IMAGE_NAME: torch-2-4-0-cu125-aws
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
+  CUDA_VERSION: 12.4.1
+  IMAGE_NAME: torch-2-4-0-cu124-aws
   MOFED_VERSION: ''
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
   PYTHON_VERSION: '3.11'
@@ -25,7 +25,7 @@
   PYTORCH_NIGHTLY_VERSION: ''
   PYTORCH_VERSION: 2.4.0
   TAGS:
-  - mosaicml/pytorch:2.4.0_cu125-python3.11-ubuntu20.04-aws
+  - mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
   - mosaicml/pytorch:latest-aws
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.19.0
@@ -181,9 +181,9 @@
   TARGET: pytorch_stage
   TORCHVISION_VERSION: 0.17.2
 - AWS_OFI_NCCL_VERSION: ''
-  BASE_IMAGE: nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04
+  BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04
   COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5
-  CUDA_VERSION: 12.5.1
+  CUDA_VERSION: 12.4.1
   IMAGE_NAME: composer-0-23-5
   MOFED_VERSION: latest-23.10
   NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py
index ef587e6bc2..a1cf5bca3b 100644
--- a/docker/generate_build_matrix.py
+++ b/docker/generate_build_matrix.py
@@ -35,8 +35,8 @@ def _get_torchvision_version(pytorch_version: str):
 def _get_base_image(cuda_version: str):
     if not cuda_version:
         return 'ubuntu:20.04'
-    if cuda_version == '12.5.1':
-        return f'nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04'
+    if cuda_version == '12.4.1':
+        return f'nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04'
     return f'nvidia/cuda:{cuda_version}-cudnn8-devel-ubuntu20.04'
 
 
@@ -45,7 +45,7 @@ def _get_cuda_version(pytorch_version: str, use_cuda: bool):
     if not use_cuda:
         return ''
     if pytorch_version == '2.4.0':
-        return '12.5.1'
+        return '12.4.1'
     if pytorch_version == '2.3.1':
         return '12.1.1'
     if pytorch_version == '2.2.2':

From 4c01abbab132cc1c9e15fa7e4372e9af845fb0ef Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Fri, 9 Aug 2024 15:46:21 -0400
Subject: [PATCH 09/23] fix dockerfile

---
 docker/Dockerfile | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 9e86c08fd3..970af2f1ef 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -189,15 +189,9 @@ ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION}
 RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \
         CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \
-        if [ "${PYTORCH_VERSION}" != "2.4.0" ]; then \
         pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \
             torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \
             torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \
-        else \
-        pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/${CUDA_VERSION} \
-            torch \
-            torchvision ; \
-        fi ; \
     else \
         pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \
             torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \

From 5f30e2d270db38a699dbf2562f8cce9a2fdbbb0f Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Fri, 9 Aug 2024 16:14:38 -0400
Subject: [PATCH 10/23] remove many issues

---
 composer/trainer/_patch_pytorch.py    |  6 ++++++
 composer/utils/checkpoint.py          | 24 +++++++++++++++---------
 tests/trainer/test_fsdp_checkpoint.py |  8 +++-----
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py
index 2c27118090..24ba86a2cb 100644
--- a/composer/trainer/_patch_pytorch.py
+++ b/composer/trainer/_patch_pytorch.py
@@ -133,6 +133,12 @@ def patch_pytorch():
         _MeshEnv.create_child_mesh = create_child_mesh
         DeviceMesh.__getitem__ = device_mesh__getitem__
 
+    elif version.parse(torch.__version__) < version.parse('2.4.1'):
+        # Monkey patch for torch < 2.4.1, i.e., torch == 2.4.0
+
+        # No monkeypatches!
+        pass
+
 
 def build_metadata(
     self,

diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py
index 56b13fcac6..ace6dedad4 100644
--- a/composer/utils/checkpoint.py
+++ b/composer/utils/checkpoint.py
@@ -692,14 +692,19 @@ def load_sharded_checkpoint(
         # Ensure state exists
         state_dict['state'] = state_dict.get('state', {})
 
-        # dist_cp.load breaks unless the specified state_dict supports `load_state_dict`
-        # See: https://github.com/pytorch/pytorch/issues/125096
-        dist_cp.load_state_dict(
-            state_dict=state_dict,
-            storage_reader=storage_reader,
-            planner=state.fsdp_config.load_planner,
-            no_dist=(not dist.is_initialized()),
-        )
+        if version.parse(torch.__version__) >= version.parse('2.4.0'):
+            dist_cp.load(
+                state_dict=state_dict,
+                storage_reader=storage_reader,
+                planner=state.fsdp_config.load_planner,
+            )
+        else:
+            dist_cp.load_state_dict(
+                state_dict=state_dict,
+                storage_reader=storage_reader,
+                planner=state.fsdp_config.load_planner,
+                no_dist=(not dist.is_initialized()),
+            )
 
         log.info(f'Loaded state dict')
         state.load_state_dict(
@@ -1158,7 +1163,8 @@ def _save_checkpoint(
         if expect_file:
             if version.parse(torch.__version__) >= version.parse('2.3.0'):
                 save_planner = state.fsdp_config.save_planner
-                if save_planner is None:
+                if version.parse(torch.__version__) < version.parse('2.4.0') and save_planner is None:
+                    # Dedup is only broken on <2.4
                     from composer.trainer._patch_pytorch import SavePlannerWithDedupFix
 
                     save_planner = SavePlannerWithDedupFix()
diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py
index a59e60172a..8d677e3cc7 100644
--- a/tests/trainer/test_fsdp_checkpoint.py
+++ b/tests/trainer/test_fsdp_checkpoint.py
@@ -315,12 +315,10 @@ def test_fsdp_full_state_dict_load(
     use_tp: bool,
    use_hsdp: bool,
 ):
-    if use_hsdp:
-        pytest.xfail('Known PyTorch issue with HSDP, waiting for pytorch patch')
+    if use_hsdp and version.parse(torch.__version__) < version.parse('2.4.0'):
+        pytest.xfail('HSDP requires torch 2.4.0 or later')
     if use_tp:
         pytest.skip('TP on PyTorch 2.3 has full state dict issues.')
-    if (use_tp or use_hsdp) and version.parse(torch.__version__) < version.parse('2.3.0'):
-        pytest.skip('HSDP and TP require torch 2.3.0 or later')
     if autoresume:
         run_name = 'my-cool-autoresume-run'
     else:
@@ -861,7 +859,7 @@ def test_fsdp_partitioned_state_dict_load(
         run_name = None
 
     if use_remote:
-        save_folder = f's3://{s3_bucket}/{s3_ephemeral_prefix}/checkpoints/{{run_name}}'
+        save_folder = f's3://{s3_bucket}/{s3_ephemeral_prefix}/checkpoints/{{run_name}}'f
     else:
         tmp_paths = dist.all_gather_object(os.path.abspath(tmp_path))
         save_folder = os.path.join(tmp_paths[0], 'checkpoints', '{run_name}')

From fe7def5876e236d72998e8e98916b68f0bb3d9b0 Mon Sep 17 00:00:00 2001
From: Mihir Patel
Date: Fri, 9 Aug 2024 16:16:20 -0400
Subject: [PATCH 11/23] strip magic mock

---
 composer/core/state.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/composer/core/state.py b/composer/core/state.py
index cbd7fc41db..7528da97a9 100644
--- a/composer/core/state.py
+++ b/composer/core/state.py
@@ -1437,7 +1437,8 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True):
                 # checkpoint on rank 0 only. However, PyTorch modifies the state_dict (producing
                 # errors) before discarding the output. Accordingly, we mock the state dict.
# See: https://github.com/pytorch/pytorch/issues/125177 - optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict + if version.parse(torch.__version__) < version.parse('2.4.0'): + optim_state_dict = MagicMock() if optim_state_dict is None else optim_state_dict set_optimizer_state_dict( model=self.model, optimizers=optimizer, From 55b642fee40b77d02e5a6ead74830e8c2d9956c6 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 16:19:50 -0400 Subject: [PATCH 12/23] fix gating --- composer/core/state.py | 16 ++++++++++++---- tests/trainer/test_fsdp_checkpoint.py | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/composer/core/state.py b/composer/core/state.py index 7528da97a9..7c43473ace 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -979,7 +979,9 @@ def get_model_state_dict(self) -> dict[str, Any]: Returns: dict[str, Any]: The state dict for the model. """ - if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): + if version.parse(torch.__version__) >= version.parse('2.4.0') or ( + version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() + ): from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict if self.fsdp_state_dict_type not in [None, 'full', 'sharded']: raise NotImplementedError( @@ -1017,7 +1019,9 @@ def get_optim_state_dict(self) -> dict[str, Any]: Returns: dict[str, Any]: The state dict for the optimizer. """ - if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): + if version.parse(torch.__version__) >= version.parse('2.4.0') or ( + version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() + ): from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict if self.fsdp_state_dict_type not in [None, 'full', 'sharded']: raise NotImplementedError( @@ -1327,7 +1331,9 @@ def load_model_state( model_on_rank = state_dict['model'] is not None if model_on_rank: - if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): + if version.parse(torch.__version__) >= version.parse('2.4.0') or ( + version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() + ): from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict try: set_model_state_dict( @@ -1430,7 +1436,9 @@ def load_optim_state(self, state_dict: dict[str, Any], strict: bool = True): continue optim_state_dict = serialized_value[type(optimizer).__qualname__] if serialized_value is not None else None - if version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized(): + if version.parse(torch.__version__) >= version.parse('2.4.0') or ( + version.parse(torch.__version__) >= version.parse('2.3.0') and dist.is_initialized() + ): from torch.distributed.checkpoint.state_dict import StateDictOptions, set_optimizer_state_dict # optim_state_dict is `None` on non-zero ranks when loading FSDP monolith diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 8d677e3cc7..76c3b4c5bb 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -859,7 +859,7 @@ def test_fsdp_partitioned_state_dict_load( run_name = None if use_remote: - save_folder = f's3://{s3_bucket}/{s3_ephemeral_prefix}/checkpoints/{{run_name}}'f + save_folder = f's3://{s3_bucket}/{s3_ephemeral_prefix}/checkpoints/{{run_name}}' else: tmp_paths 
= dist.all_gather_object(os.path.abspath(tmp_path)) save_folder = os.path.join(tmp_paths[0], 'checkpoints', '{run_name}') From 973c1bc107e19a93937588d9489d5f449b5aac35 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 16:24:26 -0400 Subject: [PATCH 13/23] try chuck hack --- docker/Dockerfile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 80ae8bad2e..251ded7c95 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -189,9 +189,15 @@ ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION} RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ + if [ "${PYTORCH_VERSION}" != "2.4.0" ]; then \ pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ + else \ + pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/${CUDA_VERSION} \ + torch \ + torchvision ; \ + fi ; \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ From fac2593e225486b80e4a4d624ed326a29ec984d3 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 16:44:00 -0400 Subject: [PATCH 14/23] upgrade lint --- .../algorithms/ghost_batchnorm/ghost_batchnorm.py | 4 +++- composer/algorithms/swa/swa.py | 2 +- composer/callbacks/image_visualizer.py | 10 +++++----- tests/trainer/test_fsdp_checkpoint.py | 11 +++++++++-- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py b/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py index 3943a1c345..92aed98808 100644 --- a/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py +++ b/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py @@ -168,7 +168,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore nchunks: int = int(math.ceil(batch_size / self.ghost_batch_size)) has_momentum: bool = hasattr(self.batchnorm, 'momentum') - original_momentum: float = self.batchnorm.momentum + original_momentum: Optional[float] = self.batchnorm.momentum if self.training and has_momentum: # applying the same batchnorm multiple times greatly increases @@ -180,6 +180,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: # type: ignore normalized_chunks = [self.batchnorm(chunk) for chunk in input.chunk(nchunks, 0)] if self.training and has_momentum: + assert original_momentum is not None self._unscale_momentum(original_momentum) return torch.cat(normalized_chunks, dim=0) @@ -192,6 +193,7 @@ def from_batchnorm(module: torch.nn.Module, ghost_batch_size: int) -> '_GhostBat @torch.jit.unused def _scale_momentum(self, nchunks: int): + assert self.batchnorm.momentum is not None self.batchnorm.momentum = float(self.batchnorm.momentum) / nchunks @torch.jit.unused diff --git a/composer/algorithms/swa/swa.py b/composer/algorithms/swa/swa.py index 4177168a13..dd9826d44d 100644 --- a/composer/algorithms/swa/swa.py +++ b/composer/algorithms/swa/swa.py @@ -228,7 +228,7 @@ def _initialize_swa(self, state: State) -> None: state.optimizers[0], swa_lr=self.swa_lr, anneal_epochs=self.anneal_steps, - anneal_strategy=self.anneal_strategy, + anneal_strategy=self.anneal_strategy, # type: ignore ) self.swa_model = AveragedModel(state.model, 
device=torch.device('cpu')) diff --git a/composer/callbacks/image_visualizer.py b/composer/callbacks/image_visualizer.py index e8381a944c..d86a2d97bc 100644 --- a/composer/callbacks/image_visualizer.py +++ b/composer/callbacks/image_visualizer.py @@ -164,18 +164,18 @@ def _make_segmentation_images( # Ensure the targets are in the expected format if infer_target_type(outputs, targets) == 'one_hot': if channels_last: - targets = targets.argmax(dim=-1).data.cpu().numpy() + targets = targets.argmax(dim=-1).data.cpu().numpy() # type: ignore else: - targets = targets.argmax(dim=1).data.cpu().numpy() + targets = targets.argmax(dim=1).data.cpu().numpy() # type: ignore else: - targets = targets.data.cpu().numpy() + targets = targets.data.cpu().numpy() # type: ignore # Convert the outputs to the expected format if channels_last: num_classes = outputs.shape[-1] - outputs = outputs.argmax(dim=-1).cpu().numpy() + outputs = outputs.argmax(dim=-1).cpu().numpy() # type: ignore else: num_classes = outputs.shape[1] - outputs = outputs.argmax(dim=1).cpu().numpy() + outputs = outputs.argmax(dim=1).cpu().numpy() # type: ignore # Adjust targets such that negative values are mapped to one higher than the maximum class targets[targets < 0] = num_classes diff --git a/tests/trainer/test_fsdp_checkpoint.py b/tests/trainer/test_fsdp_checkpoint.py index 76c3b4c5bb..5bdf76ce8a 100644 --- a/tests/trainer/test_fsdp_checkpoint.py +++ b/tests/trainer/test_fsdp_checkpoint.py @@ -1151,7 +1151,10 @@ def set_up_planner( # suffix all keys with `foo_`` state_dict['state']['model'] = {k + '_foo': v for k, v in state_dict['state']['model'].items()} - super().set_up_planner(state_dict, is_coordinator) + super().set_up_planner( + state_dict=state_dict, + is_coordinator=is_coordinator, + ) class RenameLoadPlanner(DefaultLoadPlanner): @@ -1162,7 +1165,11 @@ def set_up_planner( is_coordinator: bool, ) -> None: if 'state' not in state_dict: - super().set_up_planner(state_dict, metadata, is_coordinator) + super().set_up_planner( + state_dict=state_dict, + metadata=metadata, + is_coordinator=is_coordinator, + ) return self.original_state_dict = state_dict From 9d90cc98c2f8f9a4fba7252dab39beaa53cb7c38 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 16:46:53 -0400 Subject: [PATCH 15/23] fix planner --- composer/utils/checkpoint.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/composer/utils/checkpoint.py b/composer/utils/checkpoint.py index ace6dedad4..648290a320 100644 --- a/composer/utils/checkpoint.py +++ b/composer/utils/checkpoint.py @@ -1163,11 +1163,16 @@ def _save_checkpoint( if expect_file: if version.parse(torch.__version__) >= version.parse('2.3.0'): save_planner = state.fsdp_config.save_planner - if version.parse(torch.__version__) < version.parse('2.4.0') and save_planner is None: - # Dedup is only broken on <2.4 - from composer.trainer._patch_pytorch import SavePlannerWithDedupFix + if save_planner is None: + if version.parse(torch.__version__) < version.parse('2.4.0'): + # Dedup is only broken on <2.4 + from composer.trainer._patch_pytorch import SavePlannerWithDedupFix - save_planner = SavePlannerWithDedupFix() + save_planner = SavePlannerWithDedupFix() + else: + from torch.distributed.checkpoint.default_planner import DefaultSavePlanner + + save_planner = DefaultSavePlanner(dedup_save_to_lowest_rank=True) dist_cp.save( state_dict=state_dict, storage_writer=dist_cp.FileSystemWriter(dirname), From 939ecb4e4b2376dd5bf65463fa56d64fdc036cbf Mon Sep 17 00:00:00 2001 From: Mihir 
Patel Date: Fri, 9 Aug 2024 16:57:46 -0400 Subject: [PATCH 16/23] fix inference --- tests/utils/test_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/test_inference.py b/tests/utils/test_inference.py index e7c374377d..69b78ead4c 100644 --- a/tests/utils/test_inference.py +++ b/tests/utils/test_inference.py @@ -196,7 +196,7 @@ def test_huggingface_export_for_inference_onnx(onnx_opset_version, tiny_bert_con ort_session = ort.InferenceSession(save_path, providers=['CPUExecutionProvider']) for key, value in sample_input.items(): - sample_input[key] = cpu_device.tensor_to_device(value).numpy() + sample_input[key] = cpu_device.tensor_to_device(value).numpy() # type: ignore loaded_model_out = ort_session.run(None, sample_input) From 643d4ce7dce2ead0ade4eec2622b326e717b644e Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 17:20:35 -0400 Subject: [PATCH 17/23] swap folder --- docker/Dockerfile | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 251ded7c95..7d5a655087 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -189,15 +189,9 @@ ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION} RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ - if [ "${PYTORCH_VERSION}" != "2.4.0" ]; then \ - pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html \ + pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch/ \ torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ - else \ - pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/${CUDA_VERSION} \ - torch \ - torchvision ; \ - fi ; \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ torch==${PYTORCH_VERSION}.${PYTORCH_NIGHTLY_VERSION} \ From 13ef79029a4428cee25e19bfb79b4acb39915953 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 17:24:17 -0400 Subject: [PATCH 18/23] different link for torchvision --- docker/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7d5a655087..35a8d2887a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -188,9 +188,10 @@ ENV PYTORCH_NIGHTLY_URL=${PYTORCH_NIGHTLY_URL} ENV PYTORCH_NIGHTLY_VERSION=${PYTORCH_NIGHTLY_VERSION} RUN if [ -z "$PYTORCH_NIGHTLY_URL" ] ; then \ - CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ + CUDA_VERSION_TAG=$(python${PYTHON_VERSION} -c "print('cu' + ''.join('${CUDA_VERSION}'.split('.')[:2]) if '${CUDA_VERSION}' else 'cpu')") && \ pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torch/ \ - torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} \ + torch==${PYTORCH_VERSION}+${CUDA_VERSION_TAG} && \ + pip${PYTHON_VERSION} install --no-cache-dir --find-links https://download.pytorch.org/whl/torchvision/ \ torchvision==${TORCHVISION_VERSION}+${CUDA_VERSION_TAG} ; \ else \ pip${PYTHON_VERSION} install --no-cache-dir --pre --index-url ${PYTORCH_NIGHTLY_URL} \ From 1b7ba47892a522f89b7806307db52ff109688cb0 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 17:40:43 -0400 Subject: [PATCH 19/23] 
remove old code --- composer/callbacks/memory_snapshot.py | 8 +- composer/callbacks/oom_observer.py | 8 +- composer/distributed/dist_strategy.py | 333 +++++++-------------- composer/distributed/mosaic_parallelism.py | 6 +- composer/profiler/torch_profiler.py | 68 ++--- composer/profiler/utils.py | 7 +- composer/trainer/_patch_pytorch.py | 158 +--------- composer/utils/dist.py | 2 - 8 files changed, 143 insertions(+), 447 deletions(-) diff --git a/composer/callbacks/memory_snapshot.py b/composer/callbacks/memory_snapshot.py index 767f3abb0f..67805db257 100644 --- a/composer/callbacks/memory_snapshot.py +++ b/composer/callbacks/memory_snapshot.py @@ -94,13 +94,7 @@ def __init__( _, _, self.remote_path_in_bucket = parse_uri(remote_file_name) else: self.remote_path_in_bucket = None - - if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore - # MemorySnapshot is only supported in torch v2.1.0-rc1 or higher - self._enabled = True - else: - self._enabled = False - warnings.warn('Memory snapshot is supported after PyTorch 2.1.0. Skipping memory snapshot callback.') + self._enabled = True def init(self, state: State, logger: Logger) -> None: if not self._enabled: diff --git a/composer/callbacks/oom_observer.py b/composer/callbacks/oom_observer.py index d43685bab7..af75ff4cab 100644 --- a/composer/callbacks/oom_observer.py +++ b/composer/callbacks/oom_observer.py @@ -113,13 +113,7 @@ def __init__( else: self.remote_path_in_bucket = None - if version.parse(torch.__version__.split('.dev')[0]) >= version.parse('2.1.0'): # type: ignore - # OOMObserver is only supported in torch v2.1.0 or higher - self._enabled = True - else: - self._enabled = False - warnings.warn('OOMObserver is supported after PyTorch 2.1.0. Disabling OOMObserver callback.') - + self._enabled = True self.filename_config: Optional[SnapshotFileNameConfig] = None def init(self, state: State, logger: Logger) -> None: diff --git a/composer/distributed/dist_strategy.py b/composer/distributed/dist_strategy.py index f7adc79428..0496abd48d 100644 --- a/composer/distributed/dist_strategy.py +++ b/composer/distributed/dist_strategy.py @@ -16,6 +16,8 @@ apply_activation_checkpointing, checkpoint_wrapper, ) +from torch.distributed.fsdp.wrap import CustomPolicy +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import offload_wrapper from torch.distributed.fsdp import FullyShardedDataParallel, ShardingStrategy from torch.distributed.fsdp._common_utils import clean_tensor_name from torch.nn.parallel import DistributedDataParallel @@ -397,177 +399,73 @@ def sync_hook(*args): if hasattr(obj, '_fsdp_wrap') and not bool(obj._fsdp_wrap): continue - # Rather than verifying these changes with older PyTorch versions, we are fixing forward here - if version.parse(torch.__version__) > version.parse('2.1.0'): - # A dictionary of all tied parameter pointers to (module, attr) tuples - tied_pointers = {} - - # Goes through all modules finding which weights have the same pointers - for mod in obj.modules(): - for attr_name, attr in mod.named_parameters(recurse=False): - ptr = id(attr) - mod_attr_list = tied_pointers.get(ptr, []) - mod_attr_list.append((mod, attr_name)) - tied_pointers[ptr] = mod_attr_list - - # Dictionary mapping the source module to a list of (target module, source attr, target attr) tuples - source_mod_to_mod_attr = {} - for mod_attr_list in tied_pointers.values(): - # If there is only one module for this pointer, then there is no weight tying - if len(mod_attr_list) == 1: - continue - - # 
Arbitrarily choose the first module as the source module - first_mod, first_attr = mod_attr_list[0] - source_mod_to_mod_attr[first_mod] = [ - (target_mod, first_attr, dest_attr) for target_mod, dest_attr in mod_attr_list[1:] - ] - - # Clean up no longer needed module references for memory safety - del tied_pointers - - def _param_init_fn(module: torch.nn.Module) -> None: - # If we do not have any parameters or buffers on meta device managed by this module directly, we do not need to call the parameter init function. - # It is assumed that whatever process moved the parameters off of meta device initialized them. - # We expect this to occur if we have tied weights, as the second module will already have the weights initialized. - is_meta = any(param.is_meta for param in module.parameters(recurse=False) - ) or any(buffer.is_meta for buffer in module.buffers(recurse=False)) - if not is_meta: - return - - # Move all parameters and buffers to the current device - module.to_empty(device=f'cuda:{torch.cuda.current_device()}', recurse=False) - - # Redo weight tying, which will have been broken by the above line that moves parameters off of meta device - if module in source_mod_to_mod_attr: - for target_mod, first_attr, dest_attr in source_mod_to_mod_attr[module]: - setattr(target_mod, dest_attr, getattr(module, first_attr)) - - # Run the specified initialization - if hasattr(obj, 'param_init_fn') and isinstance(obj.param_init_fn, Callable): - obj.param_init_fn(module) - elif hasattr(module, 'reset_parameters') and isinstance(module.reset_parameters, Callable): - module.reset_parameters() - else: - raise ValueError( - f'Object `{obj_name}` does not have a ``param_init_fn`` or a ``reset_parameters`` function. ' - 'This leaves parameters without initialization. 
Please add a ``param_init_fn`` or ``reset_parameters`` ' - f'to module `{obj_name}`.', - ) - else: - - def _param_init_fn(module: torch.nn.Module) -> None: - # A dictionary of all tied parameter pointers to module names - tied_pointers = {} - - # Goes through all modules finding which weights have the same pointers - for name, mod in module.named_modules(): - # Since FSDP recursively wraps, at parent modules we can encounter already - # wrapped weights, as a result we should skip any modules with `_fsdp_wrapped_module.` - if '_fsdp_wrapped_module' in name: - continue - for attr in ['weight', 'bias']: - if hasattr(mod, attr): - mod_attr = getattr(mod, attr) - if mod_attr is None: - continue - ptr = id(mod_attr) - ptr_attr = (ptr, attr) - name_list = tied_pointers.get(ptr_attr, []) - name_list.append(name) - tied_pointers[ptr_attr] = name_list - - # Creates a dictionary of module names that should be tied together - tied_mod_names = collections.defaultdict(list) - # Creates a set of modules we should not initialize - should_not_init_params = set() - for ptr_attr_type, mod_names in tied_pointers.items(): - # No modules for this pointer are tied - if len(mod_names) == 1: - continue - _, attr_type = ptr_attr_type - first = next(mod_names.__iter__()) - for elem in mod_names: - should_not_init_params.add('.'.join([elem, attr_type])) - tied_mod_names[(first, attr_type)].append(elem) - # Make sure at least one of the tied parameters is initialized - should_not_init_params.remove('.'.join([first, attr_type])) - - meta_safe_apply( - module, - lambda t: torch.empty_like(t, device=f'cuda:{torch.cuda.current_device()}'), - should_not_init_params, - module_name='', + # A dictionary of all tied parameter pointers to (module, attr) tuples + tied_pointers = {} + + # Goes through all modules finding which weights have the same pointers + for mod in obj.modules(): + for attr_name, attr in mod.named_parameters(recurse=False): + ptr = id(attr) + mod_attr_list = tied_pointers.get(ptr, []) + mod_attr_list.append((mod, attr_name)) + tied_pointers[ptr] = mod_attr_list + + # Dictionary mapping the source module to a list of (target module, source attr, target attr) tuples + source_mod_to_mod_attr = {} + for mod_attr_list in tied_pointers.values(): + # If there is only one module for this pointer, then there is no weight tying + if len(mod_attr_list) == 1: + continue + + # Arbitrarily choose the first module as the source module + first_mod, first_attr = mod_attr_list[0] + source_mod_to_mod_attr[first_mod] = [ + (target_mod, first_attr, dest_attr) for target_mod, dest_attr in mod_attr_list[1:] + ] + + # Clean up no longer needed module references for memory safety + del tied_pointers + + def _param_init_fn(module: torch.nn.Module) -> None: + # If we do not have any parameters or buffers on meta device managed by this module directly, we do not need to call the parameter init function. + # It is assumed that whatever process moved the parameters off of meta device initialized them. + # We expect this to occur if we have tied weights, as the second module will already have the weights initialized. 
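+                # Example (hypothetical, for illustration): with GPT-style tied
+                # embeddings, where `lm_head.weight` and the input embedding share
+                # one tensor, initializing the embedding module also materializes
+                # `lm_head.weight`, so `lm_head` is no longer on meta device and
+                # takes the early return below.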
+ is_meta = any(param.is_meta for param in module.parameters(recurse=False) + ) or any(buffer.is_meta for buffer in module.buffers(recurse=False)) + if not is_meta: + return + + # Move all parameters and buffers to the current device + module.to_empty(device=f'cuda:{torch.cuda.current_device()}', recurse=False) + + # Redo weight tying, which will have been broken by the above line that moves parameters off of meta device + if module in source_mod_to_mod_attr: + for target_mod, first_attr, dest_attr in source_mod_to_mod_attr[module]: + setattr(target_mod, dest_attr, getattr(module, first_attr)) + + # Run the specified initialization + if hasattr(obj, 'param_init_fn') and isinstance(obj.param_init_fn, Callable): + obj.param_init_fn(module) + elif hasattr(module, 'reset_parameters') and isinstance(module.reset_parameters, Callable): + module.reset_parameters() + else: + raise ValueError( + f'Object `{obj_name}` does not have a ``param_init_fn`` or a ``reset_parameters`` function. ' + 'This leaves parameters without initialization. Please add a ``param_init_fn`` or ``reset_parameters`` ' + f'to module `{obj_name}`.', ) - if len(tied_mod_names) > 0: - warnings.warn(( - 'The passed in model appears to have tied weights. In order to ' - 'support effective weight tying, the tied modules need to be ' - 'in the same FSDP module. If the weights are not properly tied ' - 'it can lead to loss spikes. We have tried our best to ensure ' - 'the tied weights are in the same FSDP module.' - )) - - # Redoes weight tying - for name_attr, tied_names in tied_mod_names.items(): - name, attr = name_attr - src_mod = module.get_submodule(name) - # We need to make sure the source and destination - # modules end up in the same FSDP module otherwise - # with sharding weight tying gets violated - src_mod._fsdp_wrap = False # type: ignore - src_params = getattr(src_mod, attr) - for tied_name in tied_names: - dest_mod = module.get_submodule(tied_name) - dest_mod._fsdp_wrap = False # type: ignore - setattr(dest_mod, attr, src_params) - - if hasattr(obj, 'param_init_fn') and isinstance(obj.param_init_fn, Callable): - module.apply(obj.param_init_fn) - elif hasattr(module, 'reset_parameters') and isinstance(module.reset_parameters, Callable): - module.reset_parameters() - else: - raise ValueError( - f'Object `{obj_name}` does not have a ``param_init_fn`` or a ``reset_parameters`` function. ' - 'This leaves parameters without initialization. Please add a ``param_init_fn`` or ``reset_parameters`` ' - f'to module `{obj_name}`.', - ) - - if version.parse(torch.__version__) > version.parse('2.1.0.dev'): - # CustomPolicy is only supported in torch v2.1.0-rc1 or higher - from torch.distributed.fsdp.wrap import CustomPolicy # type: ignore - - def lambda_fn(module: torch.nn.Module) -> Union[bool, dict]: - ret = False - if hasattr(module, '_fsdp_wrap'): - ret = bool(module._fsdp_wrap) - elif hasattr(obj, 'fsdp_wrap_fn') and isinstance(obj.fsdp_wrap_fn, Callable): - ret = obj.fsdp_wrap_fn(module) - if isinstance(ret, dict): - ret = set_custom_fsdp_module_kwargs(ret, process_group_cache) - return ret - - _auto_wrap_policy = CustomPolicy(lambda_fn) - else: - # Choose which modules to FSDP wrap according to the following priority: - # If module has attribute `module._fsdp_wrap = ...`, always respect it - # Otherwise wrap if root object `obj.fsdp_wrap_fn(module)` is true. 
- def __auto_wrap_policy(module: torch.nn.Module, recurse: bool, nonwrapped_numel: int) -> bool: - if recurse: - return True - should_be_wrapped = False - if hasattr(module, '_fsdp_wrap'): - should_be_wrapped = bool(module._fsdp_wrap) - elif hasattr(obj, 'fsdp_wrap_fn') and isinstance(obj.fsdp_wrap_fn, Callable): - should_be_wrapped = obj.fsdp_wrap_fn(module) - - return should_be_wrapped - - def _auto_wrap_policy_new(module: torch.nn.Module, recurse: bool, nonwrapped_numel: int) -> bool: - return __auto_wrap_policy(module, recurse, nonwrapped_numel) + def lambda_fn(module: torch.nn.Module) -> Union[bool, dict]: + ret = False + if hasattr(module, '_fsdp_wrap'): + ret = bool(module._fsdp_wrap) + elif hasattr(obj, 'fsdp_wrap_fn') and isinstance(obj.fsdp_wrap_fn, Callable): + ret = obj.fsdp_wrap_fn(module) + if isinstance(ret, dict): + ret = set_custom_fsdp_module_kwargs(ret, process_group_cache) + return ret - _auto_wrap_policy = _auto_wrap_policy_new + _auto_wrap_policy = CustomPolicy(lambda_fn) fsdp_obj = FullyShardedDataParallel( obj, @@ -640,75 +538,52 @@ def _auto_wrap_policy_new(module: torch.nn.Module, recurse: bool, nonwrapped_num # FP8 TE requires using the TE checkpoint function, FSDP activation checkpointing only works with TE non-reentrant checkpointing if te_checkpoint_wrapper: assert not activation_checkpointing_reentrant, 'TE checkpoint only works with non-reentrant checkpointing' - if version.parse(torch.__version__) > version.parse('2.1.0.dev'): - from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import offload_wrapper - if not activation_checkpointing_reentrant: - if te_checkpoint_wrapper: - try: - import transformer_engine.pytorch as te - except ModuleNotFoundError: - raise ModuleNotFoundError( - 'Please install transformer-engine to use TE checkpoint wrapper', - ) - - # RNG state tracker for checkpointing - CUDA_RNG_STATES_TRACKER = te.distributed.CudaRNGStatesTracker() - CUDA_RNG_STATES_TRACKER.add('fsdp-rng', te_rng_seed) - - def get_cuda_rng_tracker(): - return CUDA_RNG_STATES_TRACKER - - first_wrap_fn = lambda m: checkpoint_wrapper( - m, - context_fn=te.distributed.get_activation_recompute_contexts, - checkpoint_fn=te.distributed.checkpoint, - use_reentrant=False, - get_rng_state_tracker=get_cuda_rng_tracker, + if not activation_checkpointing_reentrant: + if te_checkpoint_wrapper: + try: + import transformer_engine.pytorch as te + except ModuleNotFoundError: + raise ModuleNotFoundError( + 'Please install transformer-engine to use TE checkpoint wrapper', ) - else: - first_wrap_fn = lambda m: checkpoint_wrapper( - m, - checkpoint_impl=CheckpointImpl.NO_REENTRANT, - ) if activation_checkpointing else (lambda module: module) - second_wrap_fn = ( - lambda module: offload_wrapper( - first_wrap_fn(module) - if activation_checkpointing else module, # type: ignore reportGeneralTypeIssues - ) - ) if activation_cpu_offload else first_wrap_fn - else: + + # RNG state tracker for checkpointing + CUDA_RNG_STATES_TRACKER = te.distributed.CudaRNGStatesTracker() + CUDA_RNG_STATES_TRACKER.add('fsdp-rng', te_rng_seed) + + def get_cuda_rng_tracker(): + return CUDA_RNG_STATES_TRACKER first_wrap_fn = lambda m: checkpoint_wrapper( m, - checkpoint_impl=CheckpointImpl.REENTRANT, - ) if activation_checkpointing else (lambda module: module) - second_wrap_fn = ( - lambda module: offload_wrapper( - first_wrap_fn(module) - if activation_checkpointing else module, # type: ignore reportGeneralTypeIssues - ) - ) if activation_cpu_offload else first_wrap_fn - else: - if not 
activation_checkpointing_reentrant: + context_fn=te.distributed.get_activation_recompute_contexts, + checkpoint_fn=te.distributed.checkpoint, + use_reentrant=False, + get_rng_state_tracker=get_cuda_rng_tracker, + ) + else: first_wrap_fn = lambda m: checkpoint_wrapper( m, checkpoint_impl=CheckpointImpl.NO_REENTRANT, ) if activation_checkpointing else (lambda module: module) - second_wrap_fn = ( - lambda module: checkpoint_wrapper( - first_wrap_fn(module), # type: ignore reportGeneralTypeIssues - checkpoint_impl=CheckpointImpl.NO_REENTRANT, - offload_to_cpu=True, - ) - ) if activation_cpu_offload else first_wrap_fn - else: - first_wrap_fn = checkpoint_wrapper if activation_checkpointing else (lambda module: module) - second_wrap_fn = ( - lambda module: checkpoint_wrapper( - first_wrap_fn(module), # type: ignore reportGeneralTypeIssues - offload_to_cpu=True, - ) - ) if activation_cpu_offload else first_wrap_fn + second_wrap_fn = ( + lambda module: offload_wrapper( + first_wrap_fn(module) + if activation_checkpointing else module, # type: ignore reportGeneralTypeIssues + ) + ) if activation_cpu_offload else first_wrap_fn + else: + + first_wrap_fn = lambda m: checkpoint_wrapper( + m, + checkpoint_impl=CheckpointImpl.REENTRANT, + ) if activation_checkpointing else (lambda module: module) + second_wrap_fn = ( + lambda module: offload_wrapper( + first_wrap_fn(module) + if activation_checkpointing else module, # type: ignore reportGeneralTypeIssues + ) + ) if activation_cpu_offload else first_wrap_fn # Choose which modules to activation checkpoint according to the following priority: # If module has attribute `module._activation_checkpointing = ...`, always respect it diff --git a/composer/distributed/mosaic_parallelism.py b/composer/distributed/mosaic_parallelism.py index 66c06d911b..fc261e1edf 100644 --- a/composer/distributed/mosaic_parallelism.py +++ b/composer/distributed/mosaic_parallelism.py @@ -27,12 +27,10 @@ 'NO_SHARD': ShardingStrategy.NO_SHARD, 'SHARD_GRAD_OP': ShardingStrategy.SHARD_GRAD_OP, 'FULL_SHARD': ShardingStrategy.FULL_SHARD, + '_HYBRID_SHARD_ZERO2': ShardingStrategy._HYBRID_SHARD_ZERO2, + 'HYBRID_SHARD': ShardingStrategy.HYBRID_SHARD, } -if version.parse(torch.__version__) >= version.parse('2.1.0'): - SHARDING_MAP['_HYBRID_SHARD_ZERO2'] = ShardingStrategy._HYBRID_SHARD_ZERO2 - SHARDING_MAP['HYBRID_SHARD'] = ShardingStrategy.HYBRID_SHARD - BACKWARD_PREFETCH_MAP = { 'NONE': None, 'BACKWARD_PRE': BackwardPrefetch.BACKWARD_PRE, diff --git a/composer/profiler/torch_profiler.py b/composer/profiler/torch_profiler.py index 883ba2b442..2d76c5bf95 100644 --- a/composer/profiler/torch_profiler.py +++ b/composer/profiler/torch_profiler.py @@ -27,6 +27,7 @@ format_name_with_dist, format_name_with_dist_and_time, ) +from composer.profiler.utils import export_memory_timeline_html if TYPE_CHECKING: from composer.core import State @@ -296,44 +297,39 @@ def handler_fn(prof: torch.profiler.profiler.profile): f'PyTorch memory timeline profiler enabled: {self.memory_filename if self.memory_filename else False}', ) if self.memory_filename is not None: - if version.parse(torch.__version__) > version.parse('2.1.0.dev'): # type: ignore - # memory timeline profiling is only supported in torch v2.1.0-rc1 or higher - memory_trace_file_name = os.path.join( - folder_name, - format_name_with_dist_and_time( - self.memory_filename, - run_name=state.run_name, - timestamp=timestamp, - ), + memory_trace_file_name = os.path.join( + folder_name, + format_name_with_dist_and_time( + self.memory_filename, + 
run_name=state.run_name,
+                        timestamp=timestamp,
+                    ),
+                )
+                log.debug(f'Saving memory trace to {memory_trace_file_name}')
+                memory_trace_file_dirname = os.path.dirname(memory_trace_file_name)
+                if memory_trace_file_dirname:
+                    os.makedirs(memory_trace_file_dirname, exist_ok=True)
+                export_memory_timeline_html(
+                    prof,
+                    memory_trace_file_name,
+                    torch.cuda.current_device(),  # type: ignore
+                )
+                if self.memory_remote_file_name is not None:
+                    memory_trace_remote_file_name = format_name_with_dist_and_time(
+                        self.memory_remote_file_name,
+                        run_name=state.run_name,
+                        timestamp=timestamp,
+                    )
+                    memory_trace_remote_file_name = memory_trace_remote_file_name.lstrip('/')
+                    log.debug(
+                        f'Uploading memory trace to {memory_trace_remote_file_name} from {memory_trace_file_name}',
+                    )
+                    logger.upload_file(
+                        remote_file_name=memory_trace_remote_file_name,
+                        file_path=memory_trace_file_name,
+                        overwrite=self.overwrite,
+                    )
+                    log.debug(f'Uploaded memory trace to {memory_trace_remote_file_name}')
-                log.debug(f'Saving memory trace to {memory_trace_file_name}')
-                memory_trace_file_dirname = os.path.dirname(memory_trace_file_name)
-                if memory_trace_file_dirname:
-                    os.makedirs(memory_trace_file_dirname, exist_ok=True)
-                from composer.profiler.utils import export_memory_timeline_html
-                export_memory_timeline_html(
-                    prof,
-                    memory_trace_file_name,
-                    torch.cuda.current_device(),  # type: ignore
-                )
-                log.debug(f'Uploaded memory trace to {self.memory_remote_file_name}')
-                if self.memory_remote_file_name is not None:
-                    memory_trace_remote_file_name = format_name_with_dist_and_time(
-                        self.memory_remote_file_name,
-                        run_name=state.run_name,
-                        timestamp=timestamp,
-                    )
-                    memory_trace_remote_file_name = memory_trace_remote_file_name.lstrip('/')
-                    log.debug(
-                        f'Uploading memory trace to {memory_trace_remote_file_name} from {memory_trace_file_name}',
-                    )
-                    logger.upload_file(
-                        remote_file_name=memory_trace_remote_file_name,
-                        file_path=memory_trace_file_name,
-                        overwrite=self.overwrite,
-                    )
-            else:
-                log.warning('Memory timeline is supported after PyTorch 2.1.0. Skipping memory trace.')

         if self.num_traces_to_keep >= 0:
             while len(self.saved_traces) > self.num_traces_to_keep:
diff --git a/composer/profiler/utils.py b/composer/profiler/utils.py
index ddd235b711..63a0fa59a1 100644
--- a/composer/profiler/utils.py
+++ b/composer/profiler/utils.py
@@ -10,6 +10,8 @@
 from tempfile import NamedTemporaryFile
 from typing import Any, Optional, Union

+from torch.profiler._memory_profiler import _CATEGORY_TO_COLORS, _CATEGORY_TO_INDEX, MemoryProfileTimeline
+
 import numpy as np
 import torch
 import torch.cuda
@@ -29,11 +31,6 @@ def export_memory_timeline_html(
     return_fig: bool = False,
 ) -> Optional[Union[None, Any]]:
     """Exports a memory timeline to an HTML file. Similar to the PyTorch plotting function, but with adjusted axis tickers and grids."""
-    if version.parse(torch.__version__) <= version.parse('2.1.0.dev'):
-        log.warning('export_memory_timeline_html failed because memory timeline is supported after PyTorch 2.1.0.')
-        return
-
-    from torch.profiler._memory_profiler import _CATEGORY_TO_COLORS, _CATEGORY_TO_INDEX, MemoryProfileTimeline

     # Default to device 0, if unset. Fallback on cpu.
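+    # `prof.use_device` holds the profiled device type as a plain string; when it
+    # names a non-CUDA backend, index 0 of that backend is used as the device.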
if device is None and prof.use_device and prof.use_device != 'cuda': diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index 24ba86a2cb..bd430e6ce9 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -47,29 +47,7 @@ def patch_unshard_for_automicrobatching(auto_microbatch_size_found=False): def patch_pytorch(): """Monkey patches pytorch functions based on pytorch version.""" - if version.parse(torch.__version__) < version.parse('2.1.1'): - # Monkey patch for torch < 2.1.1 ie torch == 2.1.0 - - # Monkey patch sharding method - ChunkShardingSpec.build_metadata = build_metadata - - # Monkey patch partial state dict handling - from torch.distributed.fsdp import _state_dict_utils - - _state_dict_utils._sharded_pre_load_state_dict_hook = (_sharded_pre_load_state_dict_hook) - - # Allow 2D HSDP - from torch.distributed.fsdp import _runtime_utils - _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None - - elif version.parse(torch.__version__) < version.parse('2.1.3'): - # Monkey patch for torch < 2.1.3 ie torch == 2.1.1, 2.1.2 - - # Allow 2D HSDP - from torch.distributed.fsdp import _runtime_utils - _runtime_utils._validate_and_get_hybrid_shard_state = lambda *args, **kwargs: None - - elif version.parse(torch.__version__) < version.parse('2.2.1'): + if version.parse(torch.__version__) < version.parse('2.2.1'): # Monkey patch for torch < 2.2.1 ie torch == 2.2.0 # Allow 2D HSDP @@ -140,140 +118,6 @@ def patch_pytorch(): pass -def build_metadata( - self, - tensor_sizes: torch.Size, - tensor_properties: sharded_tensor_meta.TensorProperties, -) -> sharded_tensor_meta.ShardedTensorMetadata: - """Adds nightly change for ChunkShardingSpec. - - Change implemented in https://github.com/pytorch/pytorch/pull/108915 - """ - tensor_num_dim = len(tensor_sizes) - - self._verify_dim(self.dim) - if self.dim >= tensor_num_dim or self.dim < -tensor_num_dim: # type: ignore[operator] - raise ValueError(f'Invalid sharding dim: {self.dim}') - - shards_metadata = [] - sharding_dim_size = tensor_sizes[self.dim] # type: ignore[index] - chunks = len(self.placements) - split_size = get_split_size(sharding_dim_size, chunks) - for idx, placement in enumerate(self.placements): - # generate ShardMetadata for each placement device - chunked_dim_size = get_chunked_dim_size(sharding_dim_size, split_size, idx) - shard_size = list(tensor_sizes) - current_offsets = [0] * tensor_num_dim - current_offsets[self.dim] = split_size * idx # type: ignore[index] - shard_size[self.dim] = chunked_dim_size # type: ignore[index] - - shard_metadata = ShardMetadata( - shard_offsets=current_offsets, - shard_sizes=shard_size, - placement=placement, - ) - shards_metadata.append(shard_metadata) - - return sharded_tensor_meta.ShardedTensorMetadata(shards_metadata, tensor_sizes, tensor_properties) - - -@no_type_check -def _sharded_pre_load_state_dict_hook( - module: nn.Module, - fsdp_state, - state_dict: dict[str, Any], - prefix: str, -) -> None: - """Adds nightly change for partial state dict error handling. - - https://github.com/pytorch/pytorch/blob/0511df0ee9edeb5c2613805ccfb49beb323b87f9/torch/distributed/fsdp/_state_dict_utils.py#L607-L615 - - The hook combines the unflattened, sharded parameters (ShardedTensor) to - a new FlatParameter and shards the new FlatParameter to the local chunk. 
- """ - from torch.distributed._tensor import Replicate - from torch.distributed.distributed_c10d import _get_pg_default_device - from torch.distributed.fsdp._common_utils import FSDP_PREFIX, _has_fsdp_params, _is_composable, _module_handle - from torch.distributed.fsdp._runtime_utils import _lazy_init - from torch.distributed.fsdp._state_dict_utils import _enter_unshard_params_ctx, _param_name_infos - - _lazy_init(fsdp_state, module) - if not _is_composable(fsdp_state): - _replace_by_prefix(state_dict, prefix, prefix + f'{FSDP_PREFIX}') - if not _has_fsdp_params(fsdp_state, module): - return - - handle = _module_handle(fsdp_state, module) - if not handle.uses_sharded_strategy: # type: ignore - raise RuntimeError( - 'load_sharded_state_dict can only be called when parameters ' - 'are flattened and sharded.', - ) - - device = fsdp_state.compute_device - for fqn, _, _ in _param_name_infos(module, fsdp_state): - if not _is_composable(fsdp_state): - fqn_from_global_root = f'{prefix}{FSDP_PREFIX}{fqn}' - else: - fqn_from_global_root = f'{prefix}{fqn}' - try: - param = state_dict.pop(fqn_from_global_root) - except KeyError: - log.warning( - f'Did not find param with FQN {fqn_from_global_root}, skipping it. ' # noqa: G004 - 'The weight will not be filled if you expect it to be.', - ) - continue # TODO: Improve unittesting for state_dict finetuning - # cases: https://github.com/pytorch/pytorch/issues/109134 - - if not fsdp_state._state_dict_config.use_dtensor: - # All-gather the param (ShardedTensor) - param, shards = _ext_pre_load_state_dict_transform(param) - - assert len(shards) < 2, ( - 'Expects 0 or 1 shard per rank ' - f'but got {len(shards)} shards on rank {fsdp_state.rank}.' - ) - param_numel = param.size().numel() - dim_0_size = param.size()[0] - chunk_size = (math.ceil(dim_0_size / fsdp_state.world_size) * param_numel // dim_0_size) - if len(shards) == 1: - local_tensor = shards[0].tensor.flatten() - pg_device = _get_pg_default_device(fsdp_state.process_group) - if local_tensor.device.type != pg_device.type: - local_tensor = local_tensor.to(pg_device) - num_padding = chunk_size - local_tensor.numel() - if num_padding > 0: - local_tensor = F.pad(local_tensor, [0, num_padding]) - else: - local_tensor = torch.zeros(chunk_size, dtype=param.dtype, device=device) - tensor = torch.empty( - chunk_size * fsdp_state.world_size, - dtype=local_tensor.dtype, - device=device, - ) - if local_tensor.is_cpu: - # Tensor could be on FSDP GPU compute device, while local_tensor is on CPU. - # Convert to CPU so all_gather can work. 
- tensor_dev = tensor.device - tensor = tensor.cpu() - tensor_list = list(torch.chunk(tensor, torch.distributed.get_world_size(fsdp_state.process_group))) - torch.distributed.all_gather(tensor_list, local_tensor, group=fsdp_state.process_group) - tensor.to(tensor_dev) - else: - torch.distributed.all_gather_into_tensor(tensor, local_tensor, group=fsdp_state.process_group) - tensor = tensor.narrow(0, 0, param_numel).reshape(param.size()) - state_dict[fqn_from_global_root] = tensor - else: - if param.device != fsdp_state._device_mesh.device_type: # type: ignore - param = param.to(fsdp_state._device_mesh.device_type) # type: ignore - - param = param.redistribute(device_mesh=param.device_mesh, placements=[Replicate()]) - state_dict[fqn_from_global_root] = param.to_local() - - _enter_unshard_params_ctx(module, fsdp_state, writeback=True) - - if version.parse(torch.__version__) >= version.parse('2.2.1') and version.parse( torch.__version__,) < version.parse('2.2.3'): diff --git a/composer/utils/dist.py b/composer/utils/dist.py index 2178ce2dd5..5b89b5b531 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -579,8 +579,6 @@ def initialize_dist(device: Union[str, Device], timeout: float = 300.0) -> None: 'PyTorch XLA package not found. In order to use XLA based devices ' 'PyTorch XLA must be installed.', ) - if version.parse(torch_xla.__version__) < version.parse('2.1.0'): - raise RuntimeError(f'PyTorch XLA version must be at least 2.1.0, found {torch_xla.__version__}.') # XLA initialization requires the init_method to be set dist.init_process_group(device_obj.dist_backend, init_method='xla://') elif dist_env_vars_match_defaults: From 6da77c1dca2f9951f6fc8e9fde3e1a6ede46b9d3 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 17:45:10 -0400 Subject: [PATCH 20/23] lint --- composer/callbacks/memory_snapshot.py | 1 - composer/callbacks/oom_observer.py | 1 - composer/distributed/dist_strategy.py | 12 ++++-------- composer/distributed/mosaic_parallelism.py | 1 - composer/profiler/torch_profiler.py | 3 +-- composer/profiler/utils.py | 5 +---- composer/trainer/_patch_pytorch.py | 8 -------- composer/utils/dist.py | 6 +----- 8 files changed, 7 insertions(+), 30 deletions(-) diff --git a/composer/callbacks/memory_snapshot.py b/composer/callbacks/memory_snapshot.py index 67805db257..328d781d81 100644 --- a/composer/callbacks/memory_snapshot.py +++ b/composer/callbacks/memory_snapshot.py @@ -9,7 +9,6 @@ from typing import Optional, Union import torch.cuda -from packaging import version from composer import State from composer.core import Callback, State, Time, TimeUnit diff --git a/composer/callbacks/oom_observer.py b/composer/callbacks/oom_observer.py index af75ff4cab..d85b4ec6ca 100644 --- a/composer/callbacks/oom_observer.py +++ b/composer/callbacks/oom_observer.py @@ -14,7 +14,6 @@ from typing import Optional import torch.cuda -from packaging import version from composer.core import Callback, State from composer.loggers import Logger diff --git a/composer/distributed/dist_strategy.py b/composer/distributed/dist_strategy.py index 0496abd48d..1b09a9fd74 100644 --- a/composer/distributed/dist_strategy.py +++ b/composer/distributed/dist_strategy.py @@ -3,7 +3,6 @@ """Helpers for running distributed data parallel training.""" -import collections import logging import warnings from contextlib import contextmanager, nullcontext @@ -15,18 +14,17 @@ CheckpointImpl, apply_activation_checkpointing, checkpoint_wrapper, + offload_wrapper, ) -from torch.distributed.fsdp.wrap import 
CustomPolicy -from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import offload_wrapper from torch.distributed.fsdp import FullyShardedDataParallel, ShardingStrategy from torch.distributed.fsdp._common_utils import clean_tensor_name +from torch.distributed.fsdp.wrap import CustomPolicy from torch.nn.parallel import DistributedDataParallel from torchmetrics import Metric, MetricCollection from composer.core import Precision, State from composer.core.precision import _validate_precision from composer.devices import Device, DeviceGPU -from composer.distributed.meta_safe_apply import meta_safe_apply from composer.distributed.mosaic_parallelism import ( BACKWARD_PREFETCH_MAP, SHARDING_MAP, @@ -431,7 +429,7 @@ def _param_init_fn(module: torch.nn.Module) -> None: # It is assumed that whatever process moved the parameters off of meta device initialized them. # We expect this to occur if we have tied weights, as the second module will already have the weights initialized. is_meta = any(param.is_meta for param in module.parameters(recurse=False) - ) or any(buffer.is_meta for buffer in module.buffers(recurse=False)) + ) or any(buffer.is_meta for buffer in module.buffers(recurse=False)) if not is_meta: return @@ -543,9 +541,7 @@ def lambda_fn(module: torch.nn.Module) -> Union[bool, dict]: try: import transformer_engine.pytorch as te except ModuleNotFoundError: - raise ModuleNotFoundError( - 'Please install transformer-engine to use TE checkpoint wrapper', - ) + raise ModuleNotFoundError('Please install transformer-engine to use TE checkpoint wrapper',) # RNG state tracker for checkpointing CUDA_RNG_STATES_TRACKER = te.distributed.CudaRNGStatesTracker() diff --git a/composer/distributed/mosaic_parallelism.py b/composer/distributed/mosaic_parallelism.py index fc261e1edf..0fa6a0547c 100644 --- a/composer/distributed/mosaic_parallelism.py +++ b/composer/distributed/mosaic_parallelism.py @@ -8,7 +8,6 @@ from typing import Any, Union import torch -from packaging import version from torch import distributed from torch.distributed import ProcessGroup from torch.distributed.fsdp import ( diff --git a/composer/profiler/torch_profiler.py b/composer/profiler/torch_profiler.py index 2d76c5bf95..93e753bbd5 100644 --- a/composer/profiler/torch_profiler.py +++ b/composer/profiler/torch_profiler.py @@ -13,12 +13,12 @@ import torch.cuda import torch.profiler -from packaging import version from torch.profiler.profiler import ProfilerAction as TorchProfilerAction from composer.core.callback import Callback from composer.loggers import Logger from composer.profiler.profiler_action import ProfilerAction +from composer.profiler.utils import export_memory_timeline_html from composer.utils import ( FORMAT_NAME_WITH_DIST_AND_TIME_TABLE, FORMAT_NAME_WITH_DIST_TABLE, @@ -27,7 +27,6 @@ format_name_with_dist, format_name_with_dist_and_time, ) -from composer.profiler.utils import export_memory_timeline_html if TYPE_CHECKING: from composer.core import State diff --git a/composer/profiler/utils.py b/composer/profiler/utils.py index 63a0fa59a1..68f2862549 100644 --- a/composer/profiler/utils.py +++ b/composer/profiler/utils.py @@ -10,12 +10,10 @@ from tempfile import NamedTemporaryFile from typing import Any, Optional, Union -from torch.profiler._memory_profiler import _CATEGORY_TO_COLORS, _CATEGORY_TO_INDEX, MemoryProfileTimeline - import numpy as np import torch import torch.cuda -from packaging import version +from torch.profiler._memory_profiler import _CATEGORY_TO_COLORS, _CATEGORY_TO_INDEX, MemoryProfileTimeline 
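+# NOTE: torch.profiler._memory_profiler is a private PyTorch module; importing it
+# unconditionally at the top level is only safe because support for PyTorch < 2.2
+# was dropped in this series.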
from torch.profiler.profiler import profile as TorchProfile log = logging.getLogger(__name__) @@ -31,7 +29,6 @@ def export_memory_timeline_html( return_fig: bool = False, ) -> Optional[Union[None, Any]]: """Exports a memory timeline to an HTML file. Similar to the PyTorch plotting function, but with adjusted axis tickers and grids.""" - # Default to device 0, if unset. Fallback on cpu. if device is None and prof.use_device and prof.use_device != 'cuda': device = prof.use_device + ':0' diff --git a/composer/trainer/_patch_pytorch.py b/composer/trainer/_patch_pytorch.py index bd430e6ce9..881914e2ce 100644 --- a/composer/trainer/_patch_pytorch.py +++ b/composer/trainer/_patch_pytorch.py @@ -11,7 +11,6 @@ """PyTorch, especially PyTorch Distributed, monkeypatches.""" import logging -import math import functools import contextlib from dataclasses import asdict @@ -20,16 +19,9 @@ import torch -import torch.distributed._shard.sharded_tensor.metadata as sharded_tensor_meta -from torch.distributed._shard.sharding_spec import ChunkShardingSpec import torch.nn as nn -import torch.nn.functional as F from packaging import version -from torch.distributed._shard.sharding_spec import ShardMetadata -from torch.distributed._shard.sharding_spec._internals import get_chunked_dim_size, get_split_size from torch.distributed.fsdp import FullyShardedDataParallel, ShardingStrategy -from torch.distributed.fsdp._fsdp_extensions import _ext_pre_load_state_dict_transform -from torch.distributed.utils import _replace_by_prefix from composer.utils import dist diff --git a/composer/utils/dist.py b/composer/utils/dist.py index 5b89b5b531..0515828a10 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -47,12 +47,8 @@ import torch import torch.distributed as dist import torch.utils.data -from packaging import version -from composer.utils.device import get_device, is_hpu_installed, is_xla_installed - -if is_xla_installed(): - import torch_xla +from composer.utils.device import get_device, is_hpu_installed if TYPE_CHECKING: from composer.devices import Device From 71c3de7255f6afef4d9e85ae0dd3c7a04cfa7e5c Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 17:59:48 -0400 Subject: [PATCH 21/23] 4wide --- .github/workflows/docker-configure-build-push.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker-configure-build-push.yaml b/.github/workflows/docker-configure-build-push.yaml index 2b6bf4893d..a668e75217 100644 --- a/.github/workflows/docker-configure-build-push.yaml +++ b/.github/workflows/docker-configure-build-push.yaml @@ -36,7 +36,7 @@ on: required: true jobs: configure-build-push: - runs-on: ubuntu-latest + runs-on: mosaic-4wide steps: - name: Maximize Build Space on Worker uses: easimon/maximize-build-space@v4 From 1ce316938c365dbfd0cfadcdd8ee58e05b5e350b Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 9 Aug 2024 18:00:50 -0400 Subject: [PATCH 22/23] bump fa --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 35a8d2887a..c3f4dee907 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -262,7 +262,7 @@ RUN if [ -n "$MOFED_VERSION" ] ; then \ RUN if [ -n "$CUDA_VERSION" ] ; then \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir ninja==1.11.1 && \ pip${PYTHON_VERSION} install --upgrade --no-cache-dir --force-reinstall packaging==22.0 && \ - MAX_JOBS=1 pip${PYTHON_VERSION} install --no-cache-dir --no-build-isolation flash-attn==2.6.2; \ + MAX_JOBS=1 
pip${PYTHON_VERSION} install --no-cache-dir --no-build-isolation flash-attn==2.6.3; \ cd .. ; \ fi From e9224f45f2eb2b44d6df0e49eee8313240fc9813 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Mon, 12 Aug 2024 17:47:43 -0400 Subject: [PATCH 23/23] remove 2.1 tests --- .github/workflows/daily.yaml | 28 ---------------------------- .github/workflows/pr-cpu.yaml | 4 ---- composer/trainer/trainer.py | 1 - 3 files changed, 33 deletions(-) diff --git a/.github/workflows/daily.yaml b/.github/workflows/daily.yaml index b64e68d493..ee94e89c2b 100644 --- a/.github/workflows/daily.yaml +++ b/.github/workflows/daily.yaml @@ -17,11 +17,6 @@ jobs: strategy: matrix: include: - - name: cpu-3.10-2.1 - container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and (remote or not remote) and not gpu and not doctest - pytest_command: coverage run -m pytest - composer_package_name: mosaicml - name: cpu-3.11-2.2 container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 markers: not daily and (remote or not remote) and not gpu and not doctest @@ -42,11 +37,6 @@ jobs: markers: not daily and (remote or not remote) and not gpu and doctest pytest_command: coverage run -m pytest tests/test_docs.py composer_package_name: mosaicml - - name: daily-cpu-3.10-2.1 - container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: daily and (remote or not remote) and not gpu and not doctest - pytest_command: coverage run -m pytest - composer_package_name: mosaicml - name: daily-cpu-3.11-2.2 container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 markers: daily and (remote or not remote) and not gpu and not doctest @@ -102,12 +92,6 @@ jobs: # Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time # on MCLOUD and not eat up all GPUs at once include: - - name: "gpu-3.10-2.1-1-gpu" - container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - pytest_command: "coverage run -m pytest" - composer_package_name: "mosaicml" - gpu_num: 1 - name: "gpu-3.11-2.2-1-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -120,12 +104,6 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 1 - - name: "gpu-3.10-2.1-2-gpu" - container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - pytest_command: "coverage run -m pytest" - composer_package_name: "mosaicml" - gpu_num: 2 - name: "gpu-3.11-2.2-2-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" @@ -138,12 +116,6 @@ jobs: pytest_command: "coverage run -m pytest" composer_package_name: "mosaicml" gpu_num: 2 - - name: "gpu-3.10-2.1-4-gpu" - container: mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04 - markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" - pytest_command: "coverage run -m pytest" - composer_package_name: "mosaicml" - gpu_num: 4 - name: "gpu-3.11-2.2-4-gpu" container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04 markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)" diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 
23129715db..4d44e69824 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -13,10 +13,6 @@ jobs: strategy: matrix: include: - - name: cpu-3.10-2.1 - container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 - markers: not daily and not remote and not gpu and not doctest - pytest_command: coverage run -m pytest - name: cpu-3.11-2.2 container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04 markers: not daily and not remote and not gpu and not doctest diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index b2f829ca10..27323718fc 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -41,7 +41,6 @@ import torch.utils.data from packaging import version from torch._dynamo import OptimizedModule -from torch.cuda.amp.grad_scaler import GradScaler from torch.distributed.fsdp import FullyShardedDataParallel from torch.distributed.fsdp._runtime_utils import _post_backward_final_callback from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
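
With the old version-gated fallback removed in PATCH 19, `CustomPolicy` from
`torch.distributed.fsdp.wrap` is the only auto-wrap path left. The sketch below is
illustrative only: it shows how such a policy resolves per-module wrap decisions.
`ToyBlock` is a hypothetical stand-in, and Composer's real `lambda_fn` additionally
consults a root-level `fsdp_wrap_fn` and routes dict returns through
`set_custom_fsdp_module_kwargs`.

# Illustrative sketch, not part of the patch series.
import torch.nn as nn
from torch.distributed.fsdp.wrap import CustomPolicy


class ToyBlock(nn.Module):  # hypothetical module, not Composer code
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(16, 16)
        self._fsdp_wrap = True  # explicit per-module opt-in takes priority


def lambda_fn(module: nn.Module) -> bool:
    # Priority 1: respect a module-level `_fsdp_wrap` attribute when present.
    if hasattr(module, '_fsdp_wrap'):
        return bool(module._fsdp_wrap)
    # Priority 2 (omitted here): consult a root-level `fsdp_wrap_fn`; a dict
    # return value would be normalized via set_custom_fsdp_module_kwargs.
    return False


policy = CustomPolicy(lambda_fn)
assert lambda_fn(ToyBlock()) is True
# The policy is then handed to FSDP, e.g.
# FullyShardedDataParallel(model, auto_wrap_policy=policy).

A dict returned from `lambda_fn` is also accepted by `CustomPolicy` and is treated
as per-module FSDP kwargs, which is what allows the process-group overrides in the
patched code path.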