From 6b96832fa30f8e7058971f5d1d842eed079b4c44 Mon Sep 17 00:00:00 2001 From: Yunzhou Liu Date: Thu, 26 Sep 2024 11:19:07 -0700 Subject: [PATCH 01/10] Initial commit of Llama 3.1 405B ref --- large_language_model/nemo/Dockerfile | 20 ++ large_language_model/nemo/README.md | 111 ++++++++ large_language_model/nemo/config.sh | 50 ++++ large_language_model/nemo/pretrain_llama31.py | 259 ++++++++++++++++++ large_language_model/nemo/run_llama31.sh | 82 ++++++ 5 files changed, 522 insertions(+) create mode 100644 large_language_model/nemo/Dockerfile create mode 100644 large_language_model/nemo/README.md create mode 100644 large_language_model/nemo/config.sh create mode 100644 large_language_model/nemo/pretrain_llama31.py create mode 100644 large_language_model/nemo/run_llama31.sh diff --git a/large_language_model/nemo/Dockerfile b/large_language_model/nemo/Dockerfile new file mode 100644 index 000000000..944ef5973 --- /dev/null +++ b/large_language_model/nemo/Dockerfile @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG NEMO_BASE_IMAGE=nvcr.io/nvidia/nemo:dev +FROM ${NEMO_BASE_IMAGE} AS nemo-base-image + +# setup workspace +WORKDIR /workspace/llama31 +COPY . . diff --git a/large_language_model/nemo/README.md b/large_language_model/nemo/README.md new file mode 100644 index 000000000..9c26fa087 --- /dev/null +++ b/large_language_model/nemo/README.md @@ -0,0 +1,111 @@ +# 1. Problem + +Large Language Model pretraining - Llama 3.1 405B + +# 2. Directions + +### Steps to configure machine + +To use this repository, please install a supported version of PyTorch with GPU support (python 3.10, pytorch 2.4, cuda 12.5, and nccl 2.22.3 and above) and NVIDIA APEX. **Slurm-based clusters are required to run the reference**. + +We recommend using the latest NeMo FW container. The latest tested compatible version is `nvcr.io/nvidia/nemo:dev`). + +#### Container Setup + +All of the following codes are assumed to be run within a container. A [Dockerfile](./Dockerfile) is available for building containers on top of `nvcr.io/nvidia/nemo:dev`. + +To build the container: + +```bash +docker build -t <image_name> -f Dockerfile . +``` + +To launch the container: + +```bash +docker run -it --rm \ +--network=host --ipc=host \ +-v ~/.ssh:/root/.ssh \ +<image_name> bash +``` + +Note: it's recommended to map your `.ssh` folder to inside the container, so that it's easier for the code to set up remote cluster access. + +### Steps to download and verify data + +The current codebase is still using GPT3's train/val datasets and SentencePieceModel tokenizer. Please refer to [GPT3 instructions](https://github.com/mlcommons/training/tree/master/large_language_model/megatron-lm#preprocessed-data-download) to download preprocessed datasets and SPM checkpoints. + +### Steps to run and time + +To train Llama 3.1 405B, we need to fill out all fields in [config.sh](./config.sh).
This file contains all configurations for Slurm cluster access and job submission, directory mappings, containers, and the model. + +Once `config.sh` is properly filled in, run the following commands **inside the container**: + +```bash +source config.sh +bash run_llama31.sh +``` + +# 3. Dataset/Environment +### Publication/Attribution + +We use the c4/en/3.0.1 dataset from [HuggingFace/AllenAI](https://huggingface.co/datasets/allenai/c4). + +### Data preprocessing + +To be filled. For now, please refer to [GPT3 data preprocessing instructions](https://github.com/mlcommons/training/tree/master/large_language_model/megatron-lm#dataset-preprocessing). + +### Training and test data separation + +To be determined. For now, we are using the default split from the C4 dataset. + +### Training data order + +To be determined. + +### Test data order + +To be determined. + +# 4. Model +### Publication/Attribution + +The model largely follows the Llama 3.1 405B [paper](https://arxiv.org/abs/2407.21783). The main difference is that the model parameters are *to be determined from experiments*. + +### Model details + +| Config | Value | +| :-- | :-- | +| Embedding | RoPE + parameter adjustments | +| # Layers | 126 | +| Attention Type | GQA | +| # Attn Heads | 128 | +| Key/Value Heads | 8 | +| Model Dimension | 16,384 | +| Hidden Dimension | 53248 | +| Activation | SwiGLU | +| Normalization | RMSNorm | +| Tokenizer | TokTokenizer | +| Vocab size | 128,000 | +| Context Length | 8192 | + +### Optimizer + +Adam + +# 5. Quality +### Quality metric + +Log Perplexity + +### Quality target + +To be determined. + +### Evaluation frequency + +To be determined. + +### Evaluation thoroughness + +To be determined. \ No newline at end of file diff --git a/large_language_model/nemo/config.sh b/large_language_model/nemo/config.sh new file mode 100644 index 000000000..530b977ad --- /dev/null +++ b/large_language_model/nemo/config.sh @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +# SSH: username that connects to the remote cluster +export USER="" +# SSH: remote cluster URL +export HOST="" +# Slurm: account for job submission +export ACCOUNT="" +# Slurm: partition for job submission +export PARTITION="" +# Slurm: job time limit +export TIME="" +# Slurm: --nodes arguments +export NNODES=0 +# Slurm: --gpus_per_node and --ntasks_per_node argument +export GPUS_PER_NODE=0 + +# Folder mapping: +# Output directory that holds logs +export JOB_DIR="" +# Image path, either local cache file or remote URL +export IMAGE="" +# Dataset: C4 dataset location that contains the dataset after preprocessing +export PREPROCESSED_DATA="" +# Dataset: Trained SentencePieceModel checkpoint path +export SPM_CKPT="" + +# Optional +# Numpy index working directory +export TMP_NPY_INDEX="" + +# Fixed +# Model: size, to choose from 8b, 70b, 405b +export SIZE="" +# Dataloader: Global batch size +export GBS=0 +# Dataloader: Micro batch size +export MBS=0 \ No newline at end of file diff --git a/large_language_model/nemo/pretrain_llama31.py b/large_language_model/nemo/pretrain_llama31.py new file mode 100644 index 000000000..66c88e8ea --- /dev/null +++ b/large_language_model/nemo/pretrain_llama31.py @@ -0,0 +1,259 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from typing import Optional +from nemo.collections import llm +from nemo.collections.common.tokenizers import SentencePieceTokenizer +import nemo_run as run + +def slurm_executor( + user: str, + host: str, + remote_job_dir: str, + account: str, + partition: str, + nodes: int, + devices: int, + time: str = "01:00:00", + custom_mounts: Optional[list[str]] = None, + custom_env_vars: Optional[dict[str, str]] = None, + container_image: str = "nvcr.io/nvidia/nemo:dev", + dependencies: list[str] = [], + retries: int = 0, +) -> run.SlurmExecutor: + if not (user and host and remote_job_dir and account and partition and nodes and devices): + raise RuntimeError( + "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this function." 
+ ) + + mounts = [] + if custom_mounts: + mounts.extend(custom_mounts) + + env_vars = { + "TRANSFORMERS_OFFLINE": "1", + "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", + "NCCL_NVLS_ENABLE": "0", + "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", + "NVTE_ASYNC_AMAX_REDUCTION": "1", + "NVTE_FUSED_ATTN": "0", + } + if custom_env_vars: + env_vars |= custom_env_vars + + executor = run.SlurmExecutor( + account=account, + partition=partition, + tunnel=run.SSHTunnel( + user=user, + host=host, + job_dir=remote_job_dir, + ), + nodes=nodes, + ntasks_per_node=devices, + gpus_per_node=devices, + mem="0", + exclusive=True, + gres="gpu:8", + packager=run.GitArchivePackager(), + dependencies=dependencies, + ) + + executor.launcher = None + executor.container_image = container_image + executor.container_mounts = mounts + executor.env_vars = env_vars + executor.retries = retries + executor.time = time + + return executor + +def get_pretrain( + size: str, + nnodes: int, + ngpus_per_node: int, +) -> run.Partial: + + if size == "8b": + exp_name = "llama3-8b" + pretrain_fn = llm.llama3_8b.pretrain_recipe + elif size == "70b": + exp_name = "llama3-70b" + pretrain_fn = llm.llama3_70b.pretrain_recipe + elif size == "405b": + exp_name = "llama31-405b" + pretrain_fn = llm.llama31_405b.pretrain_recipe + + pretrain = pretrain_fn( + dir="/outputs", + name=exp_name, + num_nodes=nnodes, + num_gpus_per_node=ngpus_per_node + ) + + return exp_name, pretrain + +def get_data( + gbs: int = 288, + mbs: int = 4, + seq_length: int = 8192, +) -> run.Config: + tokenizer = run.Config(SentencePieceTokenizer, model_path="/workspace/llm/tokenizer.model") + data_paths = { + "train": [ + 0.5, + "/preproc_data/c4_en_6_c4_spm_text_document", + 0.5, + "/preproc_data/c4_en_7_c4_spm_text_document", + ], + "validation": [ + "/preproc_data/c4_en_validation_subset_c4_spm_text_document" + ], + "test": [ + "/preproc_data/c4_en_validation_subset_c4_spm_text_document" + ], + } + + return run.Config( + llm.PreTrainingDataModule, + tokenizer=tokenizer, + paths=data_paths, + num_workers=2, # TODO: make it configurable + seq_length=seq_length, + global_batch_size=gbs, + micro_batch_size=mbs, + index_mapping_dir="/npy_index", + seed=1234, # TODO: make seed configurable here + + # The following options are not set in e2e_example but are present in pretrain_llama3 + # reset_position_ids=False, + # reset_attention_mask=False, + # eod_mask_loss=False, + # rampup_batch_size=None, + ) + +def get_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Llama3.1 Pretraining") + parser.add_argument("--tag", type=str, help="Optional experiment tag", required=False, default="") + + # Slurm and executor related + slurm_group = parser.add_argument_group("Slurm executor arguments") + slurm_group.add_argument('--user', type=str, required=True, help="Remote cluster SSH user name") + slurm_group.add_argument("--host", type=str, required=True, help="Remote cluster host address") + slurm_group.add_argument("--job_dir", type=str, required=True, help="Remote job directory") + + slurm_group.add_argument("--account", type=str, required=True, help="Account to be used for Slurm job submission") + slurm_group.add_argument("--partition", type=str, required=True, help="Partition to be used for Slurm job submission") + slurm_group.add_argument("--nodes", type=int, required=True, help="Number of nodes to be used") + slurm_group.add_argument("--gpus_per_node", type=int, required=True, help="Number of GPUs per node") + slurm_group.add_argument("--time", type=str, 
required=True, help="Time limit for the job") + slurm_group.add_argument("--dependencies", nargs="*", help="list of dependencies for the job, dependency type as 'afterok'") # not useful for now + + slurm_group.add_argument( + "--mounts", + type=str, + required=True, + help=( + "Custom mount paths, formatted as a string of :[,:], " + + "and should contain " + + "one path for /output, " + + "NeMo mounted on /opt/NeMo, " + + "dataset path: /workspace/llm/tokenizer.model, /preproc_data, /npy_index" + )) + slurm_group.add_argument("--envvars", type=str, help="Environment variables to be added", default=None) + slurm_group.add_argument("--image", type=str, required=True, help="Container image path, either remote or local") + + model_group = parser.add_argument_group("Model arguments") + model_group.add_argument( + "--size", + type=str, + default="8b", + help="Choose the model to be trained", + choices=[ + "8b", # Llama 3 8B config for debugging + "70b", # Llama 3 70B config for debugging + "405b", # Llama 3.1 405B config + ]) + + data_group = parser.add_argument_group("Dataset arguments") + + data_group.add_argument("--gbs", type=int, default=288, help="Global batch size, should be divisible by PP") + data_group.add_argument("--mbs", type=int, default=1, help="Micro batch size") + + experiment_group = parser.add_argument_group("Experiment management arguments") + experiment_group.add_argument("--dryrun", action="store_true", help="Whether we are launching dryrun or actual runs") + experiment_group.add_argument("--seed", type=int, default=1234, help="random seed") + + # TODO: add a checkpoint loading path here + return parser + + +if __name__ == "__main__": + args = get_parser().parse_args() + if args.tag and not args.tag.startswith("-"): + args.tag = "-" + args.tag + + executor = slurm_executor( + user=args.user, + host=args.host, + remote_job_dir=args.job_dir, + account=args.account, + partition=args.partition, + nodes=args.nodes, + devices=args.gpus_per_node, + time = args.time, + custom_mounts=list(args.mounts.split(",")), + custom_env_vars=({envvar.split("=")[0]: envvar.split("=")[1] for envvar in args.envvars.split(",")} if args.envvars is not None else None), + container_image=args.image, + dependencies=args.dependencies, + ) + + exp_name, pretrain = get_pretrain( + size=args.size, + nnodes=args.nodes, + ngpus_per_node=args.gpus_per_node, + ) + + assert args.gbs % pretrain.trainer.strategy.pipeline_model_parallel_size == 0, "GBS should be divisible by PP" + seq_length = pretrain.model.config.seq_length + + data = get_data( + gbs=args.gbs, + mbs=args.mbs, + seq_length=seq_length, + ) + + pretrain.data = data + + # Override config for MLPerf + pretrain.trainer.num_sanity_val_steps = 0 + + # insert plugins and callbacks here + # pretrain.trainer.callbacks.append(...) + + with run.Experiment(f"{exp_name}{args.tag}") as exp: + for i in range(1): + exp.add( + pretrain, + executor=executor, + name=exp_name, + plugins=[] + ) + + if args.dryrun: + exp.dryrun() + else: + exp.run(sequential=True, detach=True) + diff --git a/large_language_model/nemo/run_llama31.sh b/large_language_model/nemo/run_llama31.sh new file mode 100644 index 000000000..af85b2d57 --- /dev/null +++ b/large_language_model/nemo/run_llama31.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +git config --global --add safe.directory /workspace/llama31 + +# Vars without defaults +# Slurm settings +: "${USER:?USER not set}" +: "${HOST:?HOST not set}" +: "${ACCOUNT:?ACCOUNT not set}" +: "${PARTITION:?PARTITION not set}" + +# Job settings +: "${JOB_DIR:?JOB_DIR not set}" +: "${IMAGE:?IMAGE not set}" + +# Dataset settings +: "${PREPROCESSED_DATA:?PREPROCESSED_DATA not set}" +: "${SPM_CKPT:?SPM_CKPT not set}" + + +# Vars with defaults +# Slurm settings +: "${TIME:="00:30:00"}" +: "${NNODES:=72}" +: "${GPUS_PER_NODE:=8}" +: "${DEPENDENCIES:=""}" + +# Job settings +: "${NEMO_DIR:=""}" # Provide customized NeMo path here +: "${TMP_NPY_INDEX:=""}" # Provide temporary NNumpy Index saving directory + +# Model settings +: "${SIZE:="405b"}" +: "${GBS:=288}" +: "${MBS:=1}" + + +# Run + +MOUNTS="${JOB_DIR}:/output,${PREPROCESSED_DATA}:/preproc_data,${SPM_CKPT}:/workspace/llm/tokenizer.model" + +if [ ! $NEMO_DIR = "" ]; then + MOUNTS="${MOUNTS},${NEMO_DIR}:/opt/NeMo" +fi + +if [ ! $TMP_NPY_INDEX = "" ]; then + MOUNTS="${MOUNTS},${TMP_NPY_INDEX}:/npy_index" +fi + +if [ ! $DEPENDENCIES = "" ]; then + DEPENDENCIES="--dependencies ${DEPENDENCIES}" +fi + +set -x + +python3 pretrain_llama31.py \ +--user $USER --host $HOST \ +--job_dir $JOB_DIR \ +--account $ACCOUNT --partition $PARTITION \ +--nodes $NNODES --gpus_per_node $GPUS_PER_NODE \ +$DEPENDENCIES \ +--time $TIME \ +--mounts $MOUNTS \ +--image $IMAGE \ +--size $SIZE \ +--gbs $GBS --mbs $MBS From 29b82f90bfd15071cef25baf7c844cc4ea240593 Mon Sep 17 00:00:00 2001 From: Yunzhou Liu Date: Thu, 26 Sep 2024 16:50:38 -0700 Subject: [PATCH 02/10] removes comments --- large_language_model/nemo/pretrain_llama31.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/large_language_model/nemo/pretrain_llama31.py b/large_language_model/nemo/pretrain_llama31.py index 66c88e8ea..d7acc3101 100644 --- a/large_language_model/nemo/pretrain_llama31.py +++ b/large_language_model/nemo/pretrain_llama31.py @@ -136,12 +136,6 @@ def get_data( micro_batch_size=mbs, index_mapping_dir="/npy_index", seed=1234, # TODO: make seed configurable here - - # The following options are not set in e2e_example but are present in pretrain_llama3 - # reset_position_ids=False, - # reset_attention_mask=False, - # eod_mask_loss=False, - # rampup_batch_size=None, ) def get_parser() -> argparse.ArgumentParser: From 18b3bc9924c335b87743f4a2bfa42cca9103978a Mon Sep 17 00:00:00 2001 From: Yunzhou Liu Date: Thu, 17 Oct 2024 21:56:22 -0700 Subject: [PATCH 03/10] adds checkpoint loading and full C4 dataset loading --- large_language_model/nemo/Dockerfile | 4 + large_language_model/nemo/README.md | 60 ++++++- large_language_model/nemo/config.sh | 26 ++- large_language_model/nemo/pretrain_llama31.py | 161 +++++++++++++----- large_language_model/nemo/run_llama31.sh | 45 ++++- .../nemo/utils/consolidate_data.sh | 29 ++++ .../nemo/utils/nemo_convert.py | 10 ++ large_language_model/nemo/utils/preprocess.sh | 34 ++++ 8 files changed, 313 insertions(+), 56 deletions(-) create mode 100644 large_language_model/nemo/utils/consolidate_data.sh create mode 100644 
large_language_model/nemo/utils/nemo_convert.py create mode 100644 large_language_model/nemo/utils/preprocess.sh diff --git a/large_language_model/nemo/Dockerfile b/large_language_model/nemo/Dockerfile index 944ef5973..4a573e595 100644 --- a/large_language_model/nemo/Dockerfile +++ b/large_language_model/nemo/Dockerfile @@ -15,6 +15,10 @@ ARG NEMO_BASE_IMAGE=nvcr.io/nvidia/nemo:dev FROM ${NEMO_BASE_IMAGE} AS nemo-base-image +RUN pip uninstall transformers -y +RUN pip install transformers blobfile +RUN pip install prettytable + # setup workspace WORKDIR /workspace/llama31 COPY . . diff --git a/large_language_model/nemo/README.md b/large_language_model/nemo/README.md index 9c26fa087..c0af03a7d 100644 --- a/large_language_model/nemo/README.md +++ b/large_language_model/nemo/README.md @@ -53,7 +53,32 @@ We use the c4/en/3.0.1 dataset from [HuggingFace/AllenAI](https://huggingface.co ### Data preprocessing -To be filled. For now, please refer to [GPT3 data preprocessing instructions](https://github.com/mlcommons/training/tree/master/large_language_model/megatron-lm#dataset-preprocessing). +Run the following commands to merge all 1024 training files into 8 `json.gz` files and all 8 validation files into a single `json.gz` file. Each of the `json.gz` files will be preprocessed into a pair of megatron dataset files (`.bin` and `.idx`). + +```bash +export C4_PATH="" +export MERGED_C4_PATH="" + +bash consolidate_data.sh +``` + +After preparing the data folder, refer to the Model preprocessing part to prepare the models (and its associated tokenizers). After the model is preprocessed and converted, there should be a `nemo_tokenizer` folder under the preprocessed NeMo checkpoint, and this folder will be used to preprocess the dataset. + +We have provided a [script](./utils/preprocess.sh) to perform preprocessing. To run the preprocessing script, we need to use the following commands: + +```bash +# fill in the built container path here +export CONT_IMAGE_URL="" +# pass in the folder path that contains tokenizer.json here +# please refer to checkpoint conversion for more details +export NEMO_MODEL_PATH="" +# pass in the merged file path here +export MERGED_C4_PATH="" +# this path is used for storing the preprocessed .bin and .idx files +export PREPROCESSED_PATH="" + +sbatch preprocess.sh +``` ### Training and test data separation @@ -89,6 +114,39 @@ The model largely follows the Llama 3.1 405B [paper](https://arxiv.org/abs/2407. | Vocab size | 128,000 | | Context Length | 8192 | + +### Checkpoint download and conversion + +To experiment with a given checkpoint, we have added a `--ckpt` argument that loads the pretrained checkpoint from a **NeMo checkpoint path**, which requires some checkpoint format conversion if the original checkpoint is in LlamaStack or HuggingFace format. + +#### Converting LlamaStack to HuggingFace + +To convert models form LlamaStack to HuggingFace, we follow the [HF conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). 
An example command is: + +```bash +TRANSFORMER_PATH="" +INPUT_DIR="" +OUTPUT_DIR="" +MODEL_SIZE="405B" # For MP16 checkpoint, MODEL_SIZE should be 405B-MP16 +python3 $TRANSFORMER_PATH/models/llama/convert_llama_weights_to_hf.py \ + --input_dir $INPUT_DIR --output_dir $OUTPUT_DIR \ + --model_size $MODEL_SIZE --llama_version 3.1 +``` + +#### Converting HuggingFace to NeMo + +To convert HuggingFace checkpoints to NeMo formats, we have provided a [conversion script](./utils/nemo_convert.py). Example command to launch this conversion: + +```bash +INPUT_DIR="" +OUTPUT_DIR="" +python3 /workspace/llama31/utils/nemo_convert.py --source $INPUT_DIR --destination $OUTPUT_DIR +``` + +#### Loading the NeMo checkpoint + +After the checkpoint is converted, we can now load them by setting the `MODEL_CKPT` environment variable to the folder that contains the NeMo checkpoint. Setting the `MODEL_CKPT=""` will not load any checkpoints. + ### Optimizer Adam diff --git a/large_language_model/nemo/config.sh b/large_language_model/nemo/config.sh index 530b977ad..776418c32 100644 --- a/large_language_model/nemo/config.sh +++ b/large_language_model/nemo/config.sh @@ -34,17 +34,29 @@ export JOB_DIR="" export IMAGE="" # Dataset: C4 dataset location that contains the dataset after preprocessing export PREPROCESSED_DATA="" -# Dataset: Trained SentencePieceModel checkpoint path -export SPM_CKPT="" - -# Optional -# Numpy index working directory +# Dataset: Numpy index working directory export TMP_NPY_INDEX="" +# Dataset: Tokenizer path +export TOKENIZER="" + +# Model: checkpoint and tokenizer path +export MODEL_CKPT="" +# Model: Whether we want to restore from checkpoint +export USE_CKPT=0 + -# Fixed +# Training Configs: # Model: size, to choose from 8b, 70b, 405b export SIZE="" # Dataloader: Global batch size export GBS=0 # Dataloader: Micro batch size -export MBS=0 \ No newline at end of file +export MBS=0 +# Dataloader: Evaluate every N batches, optional +export EVAL_EVERY="" +# Dataloader: Evaluate using N batches, optional +export EVAL_BATCHES="" +# Dataloader: Max run N batches, optional +export MAX_STEPS="" +# Experiment manager: Number of experiments to launch +export NEXP=1 \ No newline at end of file diff --git a/large_language_model/nemo/pretrain_llama31.py b/large_language_model/nemo/pretrain_llama31.py index d7acc3101..4c8cb6bc2 100644 --- a/large_language_model/nemo/pretrain_llama31.py +++ b/large_language_model/nemo/pretrain_llama31.py @@ -15,8 +15,17 @@ import argparse from typing import Optional from nemo.collections import llm -from nemo.collections.common.tokenizers import SentencePieceTokenizer +from nemo.collections.common.tokenizers import AutoTokenizer +from nemo import lightning as nl +from megatron.core.distributed import DistributedDataParallelConfig +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed +from nemo.collections.llm.gpt.data.mock import MockDataModule +from nemo.collections.llm.recipes.log.default import default_log, default_resume +from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing import nemo_run as run +from nemo.utils.exp_manager import TimingCallback +import torch +import os def slurm_executor( user: str, @@ -84,45 +93,87 @@ def get_pretrain( size: str, nnodes: int, ngpus_per_node: int, + data_module: run.Config, + ckpt: Optional[str]=None, + eval_every: Optional[int]=None, + eval_batches: Optional[int]=None, + max_steps: Optional[int]=None, ) -> run.Partial: + exp_name = size + if size == "8b": - exp_name = 
"llama3-8b" - pretrain_fn = llm.llama3_8b.pretrain_recipe + pretrain = llm.llama3_8b.pretrain_recipe( + dir="/outputs", + name=exp_name, + num_nodes=nnodes, + num_gpus_per_node=ngpus_per_node + ) + + llama31_config = run.Config(llm.gpt.model.llama.Llama31Config8B) + llama31_config.seq_length = 8192 + pretrain.model.config = llama31_config + pretrain.optim = distributed_fused_adam_with_cosine_annealing(max_lr=3e-4) elif size == "70b": - exp_name = "llama3-70b" - pretrain_fn = llm.llama3_70b.pretrain_recipe + pretrain = llm.llama3_70b.pretrain_recipe( + dir="/outputs", + name=exp_name, + num_nodes=nnodes, + num_gpus_per_node=ngpus_per_node + ) + + llama31_config = run.Config(llm.gpt.model.llama.Llama31Config70B) + llama31_config.seq_length = 8192 + pretrain.model.config = llama31_config + pretrain.optim = distributed_fused_adam_with_cosine_annealing(max_lr=1.5e-4) elif size == "405b": - exp_name = "llama31-405b" - pretrain_fn = llm.llama31_405b.pretrain_recipe - - pretrain = pretrain_fn( - dir="/outputs", - name=exp_name, - num_nodes=nnodes, - num_gpus_per_node=ngpus_per_node - ) + pretrain = llm.llama31_405b.pretrain_recipe( + dir="/outputs", + name=exp_name, + num_nodes=nnodes, + num_gpus_per_node=ngpus_per_node + ) + + pretrain.trainer.strategy.ddp = run.Config( + DistributedDataParallelConfig, + check_for_nan_in_grad=True, + grad_reduce_in_fp32=True, + overlap_grad_reduce=True, + overlap_param_gather=True, + ) + + pretrain.optim = distributed_fused_adam_with_cosine_annealing(max_lr=8e-5) + + # sets up everything else + pretrain.data = data_module + pretrain.trainer.val_check_interval = eval_every + pretrain.trainer.limit_val_batches = eval_batches + pretrain.trainer.limit_test_batches = eval_batches + if max_steps is not None: + pretrain.trainer.max_steps = max_steps + + if ckpt is not None: + pretrain.resume = run.Config(nl.AutoResume, restore_config=run.Config(nl.RestoreConfig, path=ckpt)) return exp_name, pretrain def get_data( gbs: int = 288, mbs: int = 4, - seq_length: int = 8192, + seq_length: Optional[int] = 8192, + tokenizer_path: Optional[str] = "", + seed: Optional[int] = 1234, ) -> run.Config: - tokenizer = run.Config(SentencePieceTokenizer, model_path="/workspace/llm/tokenizer.model") + tokenizer = run.Config(AutoTokenizer, pretrained_model_name=tokenizer_path) + + train_datasets = sum([["12.5", f"/preproc_data/c4-train.en_{idx}_text_document"] for idx in range(8)], []) data_paths = { - "train": [ - 0.5, - "/preproc_data/c4_en_6_c4_spm_text_document", - 0.5, - "/preproc_data/c4_en_7_c4_spm_text_document", - ], + "train": train_datasets, "validation": [ - "/preproc_data/c4_en_validation_subset_c4_spm_text_document" + "/preproc_data/c4-validation.en_text_document" ], "test": [ - "/preproc_data/c4_en_validation_subset_c4_spm_text_document" + "/preproc_data/c4-validation.en_text_document" ], } @@ -135,7 +186,16 @@ def get_data( global_batch_size=gbs, micro_batch_size=mbs, index_mapping_dir="/npy_index", - seed=1234, # TODO: make seed configurable here + seed=seed, + + # Option to reset the position IDs in the dataset at an interval. + reset_position_ids=False, + # Option to reset the attention mask from the dataset. + reset_attention_mask=False, + # Option to enable the EOD mask loss. + eod_mask_loss=False, + # Rampup batch size, should be in format of [start_global_batch_size, batch_size_increment, ramup_samples]. 
+ rampup_batch_size=None, ) def get_parser() -> argparse.ArgumentParser: @@ -180,15 +240,23 @@ def get_parser() -> argparse.ArgumentParser: "70b", # Llama 3 70B config for debugging "405b", # Llama 3.1 405B config ]) + + model_group.add_argument("--ckpt_path", type=str, default=None) + model_group.add_argument("--use_ckpt", action="store_true") data_group = parser.add_argument_group("Dataset arguments") data_group.add_argument("--gbs", type=int, default=288, help="Global batch size, should be divisible by PP") data_group.add_argument("--mbs", type=int, default=1, help="Micro batch size") + data_group.add_argument("--eval_every", type=int, default=10) + data_group.add_argument("--eval_batches", type=int, default=None) + data_group.add_argument('--max_steps', type=int, default=None) + data_group.add_argument("--tokenizer_path", type=str, help="Tokenizer path that's used to tokenize the dataset") experiment_group = parser.add_argument_group("Experiment management arguments") experiment_group.add_argument("--dryrun", action="store_true", help="Whether we are launching dryrun or actual runs") experiment_group.add_argument("--seed", type=int, default=1234, help="random seed") + experiment_group.add_argument("--num_exps", type=int, default=1) # TODO: add a checkpoint loading path here return parser @@ -214,22 +282,28 @@ def get_parser() -> argparse.ArgumentParser: dependencies=args.dependencies, ) - exp_name, pretrain = get_pretrain( - size=args.size, - nnodes=args.nodes, - ngpus_per_node=args.gpus_per_node, - ) - - assert args.gbs % pretrain.trainer.strategy.pipeline_model_parallel_size == 0, "GBS should be divisible by PP" - seq_length = pretrain.model.config.seq_length + seq_length = 8192 data = get_data( gbs=args.gbs, mbs=args.mbs, seq_length=seq_length, + tokenizer_path=args.tokenizer_path, + seed=args.seed, ) - pretrain.data = data + exp_prefix, pretrain = get_pretrain( + size=args.size, + nnodes=args.nodes, + ngpus_per_node=args.gpus_per_node, + data_module=data, + ckpt=args.ckpt_path if args.use_ckpt else None, + eval_every=args.eval_every, + eval_batches=args.eval_batches, + max_steps=args.max_steps, + ) + + assert args.gbs % pretrain.trainer.strategy.pipeline_model_parallel_size == 0, "GBS should be divisible by PP" # Override config for MLPerf pretrain.trainer.num_sanity_val_steps = 0 @@ -237,17 +311,18 @@ def get_parser() -> argparse.ArgumentParser: # insert plugins and callbacks here # pretrain.trainer.callbacks.append(...) 
- with run.Experiment(f"{exp_name}{args.tag}") as exp: - for i in range(1): + exp_prefix = f"{exp_prefix}{args.tag}" + + for i in range(args.num_exps): + exp_name = f"{exp_prefix}_{i}" + with run.Experiment(exp_name) as exp: exp.add( - pretrain, - executor=executor, + pretrain, executor=executor, name=exp_name, plugins=[] ) - if args.dryrun: - exp.dryrun() - else: - exp.run(sequential=True, detach=True) - + if args.dryrun: + exp.dryrun() + else: + exp.run(sequential=True, detach=True) diff --git a/large_language_model/nemo/run_llama31.sh b/large_language_model/nemo/run_llama31.sh index af85b2d57..659b003fb 100644 --- a/large_language_model/nemo/run_llama31.sh +++ b/large_language_model/nemo/run_llama31.sh @@ -31,8 +31,11 @@ git config --global --add safe.directory /workspace/llama31 # Dataset settings : "${PREPROCESSED_DATA:?PREPROCESSED_DATA not set}" -: "${SPM_CKPT:?SPM_CKPT not set}" +: "${TOKENIZER:?TOKENIZER not set}" +# Model settings +: "${MODEL_CKPT:?MODEL_CKPT not set}" +: "${USE_CKPT:?USE_CKPT not set}" # Vars with defaults # Slurm settings @@ -50,10 +53,26 @@ git config --global --add safe.directory /workspace/llama31 : "${GBS:=288}" : "${MBS:=1}" +# Dataloader settings +: "${EVAL_EVERY:=""}" +: "${EVAL_BATCHES:=""}" +: "${MAX_STEPS:=""}" + +# Experiment settings +: "${SEED:=$RANDOM}" +: "${NEXP:=1}" # Run -MOUNTS="${JOB_DIR}:/output,${PREPROCESSED_DATA}:/preproc_data,${SPM_CKPT}:/workspace/llm/tokenizer.model" +MOUNTS="${JOB_DIR}:/output,${PREPROCESSED_DATA}:/preproc_data,${MODEL_CKPT}:/checkpoint,${TOKENIZER}:/tokenizer" + +CKPT_OPTION="" + +CMD_SUFFIX="" + +if [ $USE_CKPT -gt 0 ]; then + CMD_SUFFIX="${CMD_SUFFIX} --use_ckpt" +fi if [ ! $NEMO_DIR = "" ]; then MOUNTS="${MOUNTS},${NEMO_DIR}:/opt/NeMo" @@ -64,7 +83,19 @@ if [ ! $TMP_NPY_INDEX = "" ]; then fi if [ ! $DEPENDENCIES = "" ]; then - DEPENDENCIES="--dependencies ${DEPENDENCIES}" + CMD_SUFFIX="${CMD_SUFFIX} --dependencies ${DEPENDENCIES}" +fi + +if [ ! $EVAL_EVERY = "" ]; then + CMD_SUFFIX="${CMD_SUFFIX} --eval_every ${EVAL_EVERY}" +fi + +if [ ! $EVAL_BATCHES = "" ]; then + CMD_SUFFIX="${CMD_SUFFIX} --eval_batches ${EVAL_BATCHES}" +fi + +if [ ! 
$MAX_STEPS = "" ]; then + CMD_SUFFIX="${CMD_SUFFIX} --max_steps ${MAX_STEPS}" fi set -x @@ -74,9 +105,13 @@ python3 pretrain_llama31.py \ --job_dir $JOB_DIR \ --account $ACCOUNT --partition $PARTITION \ --nodes $NNODES --gpus_per_node $GPUS_PER_NODE \ -$DEPENDENCIES \ --time $TIME \ --mounts $MOUNTS \ --image $IMAGE \ --size $SIZE \ ---gbs $GBS --mbs $MBS +--gbs $GBS --mbs $MBS \ +--seed $SEED \ +--num_exps $NEXP \ +--ckpt_path /checkpoint \ +--tokenizer_path /tokenizer \ +$CMD_SUFFIX diff --git a/large_language_model/nemo/utils/consolidate_data.sh b/large_language_model/nemo/utils/consolidate_data.sh new file mode 100644 index 000000000..a929a52a2 --- /dev/null +++ b/large_language_model/nemo/utils/consolidate_data.sh @@ -0,0 +1,29 @@ +set -e + +: "${C4_PATH:?C4_PATH not set}" +: "${MERGED_C4_PATH:?MERGED_C4_PATH not set}" + +# create softlinks to store each shard before merging +mkdir -p softlinks +for shard in {0..7}; do + start=$((shard * 128)) + end=$((shard * 128 + 127)) + mkdir -p softlinks/en_$shard + for ind in $(seq -f "%05g" $start $end); do + ln -s ${C4_PATH}/c4-train.${ind}-of-01024.json.gz softlinks/en_${shard}/c4-train.${ind}-of-01024.json.gz + done +done + +mkdir -p softlinks/en_validation +start=0 +end=7 +for ind in $(seq -f "%05g" $start $end); do + ln -s ${C4_PATH}/c4-validation.${ind}-of-00008.json.gz softlinks/en_validation/c4-validation.${ind}-of-00008.json.gz +done + +# merge +for shard in {0..7}; do + cat softlinks/en_${shard}/*gz > ${MERGED_C4_PATH}/c4-train.en_${shard}.json.gz +done + +cat softlinks/en_validation/*gz > ${MERGED_C4_PATH}/c4-validation.en.json.gz \ No newline at end of file diff --git a/large_language_model/nemo/utils/nemo_convert.py b/large_language_model/nemo/utils/nemo_convert.py new file mode 100644 index 000000000..78d67327d --- /dev/null +++ b/large_language_model/nemo/utils/nemo_convert.py @@ -0,0 +1,10 @@ +if __name__ == "__main__": + import argparse + from nemo.collections.llm.gpt.model.llama import HFLlamaImporter + parser = argparse.ArgumentParser() + parser.add_argument("--source", default="/source", type=str) + parser.add_argument("--destination", default="/destination", type=str) + args = parser.parse_args() + + importer = HFLlamaImporter(args.source) + importer.apply(args.destination) diff --git a/large_language_model/nemo/utils/preprocess.sh b/large_language_model/nemo/utils/preprocess.sh new file mode 100644 index 000000000..ed6100f33 --- /dev/null +++ b/large_language_model/nemo/utils/preprocess.sh @@ -0,0 +1,34 @@ +#!/bin/bash +#SBATCH -N 9 +#SBATCH --gpus-per-node 1 +#SBATCH -t 04:00:00 +#SBATCH --mem=0 + +set -e + +: "${CONT_IMAGE_URL:?CONT_IMAGE_URL not set}" +: "${NEMO_MODEL_PATH:?NEMO_MODEL_PATH not set}" +: "${MERGED_C4_PATH:?MERGED_C4_PATH not set}" +: "${PREPROCESSED_PATH:?PREPROCESSED_PATH not set}" + +container_maps="${NEMO_MODEL_PATH}/nemo_tokenizer:/llama3.1-tokenizer,${MERGED_C4_PATH}:/dataset,${PREPROCESSED_PATH}:/outputs" + +for index in {0..7}; do + srun --nodes=1 --ntasks-per-node=1 \ + --container-image=$CONT_IMAGE_URL --container-mounts $container_maps --no-container-entrypoint \ + python3 /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ + --input "/dataset/c4-train.en_${index}.json.gz" \ + --output-prefix "/outputs/c4-train.en_${index}" \ + --tokenizer-library huggingface --tokenizer-type /llama3.1-tokenizer \ + --dataset-impl mmap --workers 128 & +done + +srun --nodes=1 --ntasks-per-node=1 \ + --container-image=$CONT_IMAGE_URL --container-mounts $container_maps 
--no-container-entrypoint \ + --output preprocess_outputs/dataset_preprocess_validation.out \ + python3 /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ + --input "/dataset/c4-validation.en.json.gz" \ + --output-prefix "/outputs/c4-validation.en" \ + --tokenizer-library huggingface --tokenizer-type /llama3.1-tokenizer \ + --dataset-impl mmap --workers 128 & +wait From 56b400a06fa989cafd66d408babed92b15da46cf Mon Sep 17 00:00:00 2001 From: Yunzhou Liu Date: Thu, 12 Dec 2024 23:52:59 -0800 Subject: [PATCH 04/10] updates checkpointings and instructions --- large_language_model/nemo/Dockerfile | 2 +- large_language_model/nemo/README.md | 56 +++---- large_language_model/nemo/callbacks.py | 38 +++++ large_language_model/nemo/config.sh | 30 +++- large_language_model/nemo/pretrain_llama31.py | 144 +++++++++++++++--- large_language_model/nemo/run_llama31.sh | 29 +++- .../nemo/utils/nemo_convert.py | 10 -- large_language_model/nemo/utils/preprocess.sh | 8 +- 8 files changed, 238 insertions(+), 79 deletions(-) create mode 100644 large_language_model/nemo/callbacks.py delete mode 100644 large_language_model/nemo/utils/nemo_convert.py diff --git a/large_language_model/nemo/Dockerfile b/large_language_model/nemo/Dockerfile index 4a573e595..2b3ba7097 100644 --- a/large_language_model/nemo/Dockerfile +++ b/large_language_model/nemo/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG NEMO_BASE_IMAGE=nvcr.io/nvidia/nemo:dev +ARG NEMO_BASE_IMAGE=nvcr.io/nvidia/nemo:24.12-rc0 FROM ${NEMO_BASE_IMAGE} AS nemo-base-image RUN pip uninstall transformers -y diff --git a/large_language_model/nemo/README.md b/large_language_model/nemo/README.md index c0af03a7d..abcd79c29 100644 --- a/large_language_model/nemo/README.md +++ b/large_language_model/nemo/README.md @@ -8,11 +8,11 @@ Large Language Model pretraining - Llama 3.1 405B To use this repository, please install a supported version of PyTorch with GPU support (python 3.10, pytorch 2.4, cuda 12.5, and nccl 2.22.3 and above) and NVIDIA APEX. **Slurm-based clusters are required to run the reference**. -We recommend using the latest NeMo FW container. The latest tested compatible version is `nvcr.io/nvidia/nemo:dev`). +We recommend using the latest NeMo FW container. The latest tested compatible version is `nvcr.io/nvidia/nemo:24.12-rc0`). #### Container Setup -All of the following codes are assumed to be run within a container. A [Dockerfile](./Dockerfile) is available for building containers on top of `nvcr.io/nvidia/nemo:dev`. +All of the following codes are assumed to be run within a container. A [Dockerfile](./Dockerfile) is available for building containers on top of `nvcr.io/nvidia/nemo:24.12-rc0`. To build the container: @@ -33,7 +33,7 @@ Note: it's recommended to map your `.ssh` folder to inside the container, so tha ### Steps to download and verify data -The current codebase is still using GPT3's train/val datasets and SentencePieceModel tokenizer. Please refer to [GPT3 instructions](https://github.com/mlcommons/training/tree/master/large_language_model/megatron-lm#preprocessed-data-download) to download preprocessed datasets and SPM checkpoints. +The current codebase is still using GPT3's train/val datasets and SentencePieceModel tokenizer. Please refer to [GPT3 instructions](https://github.com/mlcommons/training/tree/master/large_language_model/megatron-lm#preprocessed-data-download) to download **the raw C4 dataset** that we can preprocess later. 
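A minimal sketch of pulling the raw C4 shards directly from the [HuggingFace/AllenAI](https://huggingface.co/datasets/allenai/c4) repository with git-lfs is shown below; the clone location and the `en/`-only filter are assumptions, and the linked GPT3 instructions remain the authoritative download path:

```bash
# Sketch only: fetch the raw c4/en/3.0.1 shards via git-lfs (assumes git-lfs is installed).
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/datasets/allenai/c4 c4-full
cd c4-full
git lfs pull --include "en/*.json.gz"   # pull only the English train/validation shards
# consolidate_data.sh expects C4_PATH to contain the c4-train.*-of-01024.json.gz files directly,
# so point it at the en/ subfolder of the clone.
export C4_PATH="$(pwd)/en"
```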
### Steps to run and time @@ -51,6 +51,12 @@ bash run_llama31.sh We use the c4/en/3.0.1 dataset from [HuggingFace/AllenAI](https://huggingface.co/datasets/allenai/c4). +We use the Mixtral 8x22B tokenizer from [HuggingFace/MistralAI](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1). + +### Tokenizer + +We use Mixtral 8x22B tokenizer in this benchmark. Tokenizer files can be downloaded [here](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/tree/main). Only the five files containing tokenizer-related contents (`special_tokens_map.json`, `tokenizer.json`, `tokenizer.model`, `tokenizer.model.v1`, `tokenizer_config.json`) are needed. + ### Data preprocessing Run the following commands to merge all 1024 training files into 8 `json.gz` files and all 8 validation files into a single `json.gz` file. Each of the `json.gz` files will be preprocessed into a pair of megatron dataset files (`.bin` and `.idx`). @@ -62,16 +68,14 @@ export MERGED_C4_PATH="" bash consolidate_data.sh ``` -After preparing the data folder, refer to the Model preprocessing part to prepare the models (and its associated tokenizers). After the model is preprocessed and converted, there should be a `nemo_tokenizer` folder under the preprocessed NeMo checkpoint, and this folder will be used to preprocess the dataset. - -We have provided a [script](./utils/preprocess.sh) to perform preprocessing. To run the preprocessing script, we need to use the following commands: +After the data consolidation is done, we can run this [script](./utils/preprocess.sh) to perform preprocessing. To run the preprocessing script, we need to use the following commands: ```bash # fill in the built container path here export CONT_IMAGE_URL="" -# pass in the folder path that contains tokenizer.json here -# please refer to checkpoint conversion for more details -export NEMO_MODEL_PATH="" +# pass in the folder path that contains the Mixtral tokenizer here +# please refer to the tokenizer section above for more details +export TOKENIZER_PATH="" # pass in the merged file path here export MERGED_C4_PATH="" # this path is used for storing the preprocessed .bin and .idx files @@ -86,7 +90,7 @@ To be determined. For now, we are using the default split from the C4 dataset. ### Training data order -To be determined. +To be determined. Current plan is to use the last 256 of 1024 files (shards 6 and 7) for the benchmarked area. ### Test data order @@ -110,42 +114,20 @@ The model largely follows the Llama 3.1 405B [paper](https://arxiv.org/abs/2407. | Hidden Dimension | 53248 | | Activation | SwiGLU | | Normalization | RMSNorm | -| Tokenizer | TokTokenizer | +| Tokenizer | TikTokenizer | | Vocab size | 128,000 | | Context Length | 8192 | ### Checkpoint download and conversion -To experiment with a given checkpoint, we have added a `--ckpt` argument that loads the pretrained checkpoint from a **NeMo checkpoint path**, which requires some checkpoint format conversion if the original checkpoint is in LlamaStack or HuggingFace format. - -#### Converting LlamaStack to HuggingFace - -To convert models form LlamaStack to HuggingFace, we follow the [HF conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). An example command is: +To be determined. For now, we are not using Llama 3.1 default checkpoint. 
-```bash -TRANSFORMER_PATH="" -INPUT_DIR="" -OUTPUT_DIR="" -MODEL_SIZE="405B" # For MP16 checkpoint, MODEL_SIZE should be 405B-MP16 -python3 $TRANSFORMER_PATH/models/llama/convert_llama_weights_to_hf.py \ - --input_dir $INPUT_DIR --output_dir $OUTPUT_DIR \ - --model_size $MODEL_SIZE --llama_version 3.1 -``` - -#### Converting HuggingFace to NeMo - -To convert HuggingFace checkpoints to NeMo formats, we have provided a [conversion script](./utils/nemo_convert.py). Example command to launch this conversion: - -```bash -INPUT_DIR="" -OUTPUT_DIR="" -python3 /workspace/llama31/utils/nemo_convert.py --source $INPUT_DIR --destination $OUTPUT_DIR -``` +~~To experiment with a given checkpoint, we have added a `--ckpt` argument that loads the pretrained checkpoint from a **NeMo checkpoint path**, which requires some checkpoint format conversion if the original checkpoint is in LlamaStack or HuggingFace format.~~ -#### Loading the NeMo checkpoint +#### Saving and restoring a checkpoint -After the checkpoint is converted, we can now load them by setting the `MODEL_CKPT` environment variable to the folder that contains the NeMo checkpoint. Setting the `MODEL_CKPT=""` will not load any checkpoints. +Large runs might need to span across multiple Slurm jobs, and we need to save and load checkpoints with contexts so that training can resume between jobs. To support this, we have added some environment variables. Please refer to `config.sh` for more details. ### Optimizer diff --git a/large_language_model/nemo/callbacks.py b/large_language_model/nemo/callbacks.py new file mode 100644 index 000000000..cca20baad --- /dev/null +++ b/large_language_model/nemo/callbacks.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import lightning.pytorch as pl +from nemo.utils import logging + +class PreemptiveStop(pl.Callback): + """Preemptively stop training at a given global step. Allows stopping training before reaching + the max steps. Useful for testing checkpoint save and resume. + + Args: + stop_on_step (int): Stop training when trainer.global_step reaches this value. + Checked at the start of every step. 
+ """ + + def __init__(self, stop_on_step: int): + self.stop_on_step = stop_on_step + + def on_train_batch_end( + self, trainer: pl.Trainer, pl_module: pl.LightningModule, outputs, batch, batch_idx + ) -> None: + if trainer.global_step >= self.stop_on_step: + logging.info(f"Global step {trainer.global_step} >= {self.stop_on_step}, signaling Trainer to stop.") + trainer.should_stop = True + # skip EarlyStopping validation unless val_check_interval met + if trainer.global_step % trainer.val_check_interval != 0: + trainer.limit_val_batches = 0 \ No newline at end of file diff --git a/large_language_model/nemo/config.sh b/large_language_model/nemo/config.sh index 776418c32..62671b900 100644 --- a/large_language_model/nemo/config.sh +++ b/large_language_model/nemo/config.sh @@ -26,6 +26,8 @@ export TIME="" export NNODES=0 # Slurm: --gpus_per_node and --ntasks_per_node argument export GPUS_PER_NODE=0 +# Slurm: max job retries for transient job failures +export MAX_RETRIES=0 # Folder mapping: # Output directory that holds logs @@ -39,10 +41,26 @@ export TMP_NPY_INDEX="" # Dataset: Tokenizer path export TOKENIZER="" +# Environment: NeMo remount +export NEMO_DIR="" + # Model: checkpoint and tokenizer path +# This is the checkpoint that we want to start with. +# Each checkpoint should be a folder containing two sub-folders: context and weights. +# And we need to pass this folder's path (the folder containing these two sub-folders) here. export MODEL_CKPT="" -# Model: Whether we want to restore from checkpoint +# Model: Continual checkpoint directory to write and resume +# This is the directory to hold all intermediate checkpoints. +# Once a run is complete and we specify to save checkpoints, +# we should see a checkpoint written in this folder +# with name `checkpoint-par-x-y-steps` +# Inside this directory, there should be a `checkpoint` directory that holds context and weights +# which is the "actual checkpoint" +export CONTINUAL_CKPT="" +# Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring. export USE_CKPT=0 +# Model: Whether we want to save a checkpoint. Must be true if NPAR > 1 +export SAVE_CKPT=0 # Training Configs: @@ -58,5 +76,13 @@ export EVAL_EVERY="" export EVAL_BATCHES="" # Dataloader: Max run N batches, optional export MAX_STEPS="" + +# Experiment: starting steps +# This is the starting "offset" step from the checkpoint. +# For instance, if you are resuming from a checkpoint folder `checkpoint-par-x-y-steps/checkpoint`, +# then the value y is needed here. 
+export START_STEPS="" # Experiment manager: Number of experiments to launch -export NEXP=1 \ No newline at end of file +export NEXP=0 +# Experiment manager: how many consecutive jobs we want for each experiment +export NPAR=0 \ No newline at end of file diff --git a/large_language_model/nemo/pretrain_llama31.py b/large_language_model/nemo/pretrain_llama31.py index 4c8cb6bc2..c68eb0da0 100644 --- a/large_language_model/nemo/pretrain_llama31.py +++ b/large_language_model/nemo/pretrain_llama31.py @@ -18,14 +18,11 @@ from nemo.collections.common.tokenizers import AutoTokenizer from nemo import lightning as nl from megatron.core.distributed import DistributedDataParallelConfig -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.recipes.log.default import default_log, default_resume from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing import nemo_run as run -from nemo.utils.exp_manager import TimingCallback -import torch -import os +from nemo.lightning.run import plugins +from nemo.collections.llm.gpt.data import build_pretraining_datamodule +from callbacks import PreemptiveStop def slurm_executor( user: str, @@ -58,6 +55,7 @@ def slurm_executor( "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", "NVTE_ASYNC_AMAX_REDUCTION": "1", "NVTE_FUSED_ATTN": "0", + "TOKENIZERS_PARALLELISM": "false", } if custom_env_vars: env_vars |= custom_env_vars @@ -94,10 +92,8 @@ def get_pretrain( nnodes: int, ngpus_per_node: int, data_module: run.Config, - ckpt: Optional[str]=None, eval_every: Optional[int]=None, eval_batches: Optional[int]=None, - max_steps: Optional[int]=None, ) -> run.Partial: exp_name = size @@ -142,18 +138,49 @@ def get_pretrain( overlap_param_gather=True, ) - pretrain.optim = distributed_fused_adam_with_cosine_annealing(max_lr=8e-5) + pretrain.trainer.strategy.virtual_pipeline_model_parallel_size = 7 + + pretrain.optim = distributed_fused_adam_with_cosine_annealing( + max_lr=8e-5, + warmup_steps=8000, + min_lr=8e-7 + ) + + from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import ( + userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, + ) + from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback + + pretrain.trainer.callbacks.append( + run.Config( + MegatronCommOverlapCallback, + tp_comm_overlap=True, + tp_comm_overlap_cfg=userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192, + defer_embedding_wgrad_compute=True, + wgrad_deferral_limit=50, + overlap_param_gather_with_optimizer_step=False, + align_param_gather=True, + ) + ) # sets up everything else + pretrain.trainer.max_steps = 1_200_000 # Llama 3.1 paper section 3.4.1 - decays LR to 8e10-7 over 1,200,000 steps + pretrain.data = data_module pretrain.trainer.val_check_interval = eval_every pretrain.trainer.limit_val_batches = eval_batches pretrain.trainer.limit_test_batches = eval_batches - if max_steps is not None: - pretrain.trainer.max_steps = max_steps - if ckpt is not None: - pretrain.resume = run.Config(nl.AutoResume, restore_config=run.Config(nl.RestoreConfig, path=ckpt)) + pretrain.log.tensorboard = None + pretrain.log.ckpt.every_n_train_steps = None + pretrain.log.ckpt.save_top_k = 1 + pretrain.log.ckpt.save_last = False + pretrain.log.ckpt.always_save_context = True + pretrain.log.ckpt.save_weights_only = False + pretrain.log.ckpt.save_optim_on_train_end = True + pretrain.log.ckpt.save_on_train_epoch_end = True + 
pretrain.log.ckpt.monitor = "consumed_samples" + pretrain.log.ckpt.mode = "max" return exp_name, pretrain @@ -214,6 +241,7 @@ def get_parser() -> argparse.ArgumentParser: slurm_group.add_argument("--gpus_per_node", type=int, required=True, help="Number of GPUs per node") slurm_group.add_argument("--time", type=str, required=True, help="Time limit for the job") slurm_group.add_argument("--dependencies", nargs="*", help="list of dependencies for the job, dependency type as 'afterok'") # not useful for now + slurm_group.add_argument("--max_retries", type=int, default=0) slurm_group.add_argument( "--mounts", @@ -241,8 +269,11 @@ def get_parser() -> argparse.ArgumentParser: "405b", # Llama 3.1 405B config ]) - model_group.add_argument("--ckpt_path", type=str, default=None) + model_group.add_argument("--initial_ckpt_path", type=str, default=None) model_group.add_argument("--use_ckpt", action="store_true") + model_group.add_argument("--ckpt_start_step", type=int, default=0) + model_group.add_argument("--continual_ckpt_path", type=str, default=None) + model_group.add_argument("--save_ckpt", action="store_true") data_group = parser.add_argument_group("Dataset arguments") @@ -257,8 +288,8 @@ def get_parser() -> argparse.ArgumentParser: experiment_group.add_argument("--dryrun", action="store_true", help="Whether we are launching dryrun or actual runs") experiment_group.add_argument("--seed", type=int, default=1234, help="random seed") experiment_group.add_argument("--num_exps", type=int, default=1) + experiment_group.add_argument("--num_pars", type=int, default=1) - # TODO: add a checkpoint loading path here return parser @@ -267,6 +298,9 @@ def get_parser() -> argparse.ArgumentParser: if args.tag and not args.tag.startswith("-"): args.tag = "-" + args.tag + assert not (args.num_pars == 1 and args.continual_ckpt_path is None), "NPar > 1 but a shared checkpoint path is not found" + assert not (not args.save_ckpt and args.num_pars > 1), "multiple experiments are specified but checkpoint is not saved" + executor = slurm_executor( user=args.user, host=args.host, @@ -280,6 +314,7 @@ def get_parser() -> argparse.ArgumentParser: custom_env_vars=({envvar.split("=")[0]: envvar.split("=")[1] for envvar in args.envvars.split(",")} if args.envvars is not None else None), container_image=args.image, dependencies=args.dependencies, + retries = args.max_retries, ) seq_length = 8192 @@ -297,10 +332,8 @@ def get_parser() -> argparse.ArgumentParser: nnodes=args.nodes, ngpus_per_node=args.gpus_per_node, data_module=data, - ckpt=args.ckpt_path if args.use_ckpt else None, eval_every=args.eval_every, eval_batches=args.eval_batches, - max_steps=args.max_steps, ) assert args.gbs % pretrain.trainer.strategy.pipeline_model_parallel_size == 0, "GBS should be divisible by PP" @@ -311,16 +344,83 @@ def get_parser() -> argparse.ArgumentParser: # insert plugins and callbacks here # pretrain.trainer.callbacks.append(...) 
+ run_plugins = [ + plugins.PerfEnvPlugin(), + ] + exp_prefix = f"{exp_prefix}{args.tag}" + # Pretrain data index builder + # max steps + pretrain.data.num_train_samples = pretrain.trainer.max_steps * pretrain.data.global_batch_size + datamodule = pretrain.data.clone() + datamodule.num_dataset_builder_threads = 8 + build_data_index = run.Partial( + build_pretraining_datamodule, + datamodule=datamodule, + trainer_max_steps=pretrain.trainer.max_steps, + trainer_val_check_interval=pretrain.trainer.val_check_interval, + trainer_limit_val_batches=pretrain.trainer.limit_val_batches, + trainer_limit_test_batches=pretrain.trainer.limit_test_batches, + ) + data_index_executor = executor.clone() + data_index_executor.launcher = None + data_index_executor.nodes = 1 + data_index_executor.ntasks_per_node = 1 + data_index_executor.retries = 1 + + static_read_from_path = args.initial_ckpt_path if args.use_ckpt else None + static_write_to_path = args.continual_ckpt_path + static_max_steps = args.max_steps if args.max_steps is not None else None + + if not args.save_ckpt: + pretrain.trainer.enable_checkpointing = False + + original_callbacks = pretrain.trainer.callbacks + for i in range(args.num_exps): exp_name = f"{exp_prefix}_{i}" + experiment_read_from_path = static_read_from_path + experiment_write_to_path = static_write_to_path + experiment_max_steps = args.ckpt_start_step + with run.Experiment(exp_name) as exp: - exp.add( - pretrain, executor=executor, - name=exp_name, - plugins=[] - ) + exp.add(build_data_index, executor=data_index_executor, name="build_data_index") + + for j in range(args.num_pars): + ending_steps = "" + starting_steps = f"{experiment_max_steps}" + if static_max_steps is not None: + ending_steps = f"-{experiment_max_steps + static_max_steps}-steps" + + checkpoint_name = "checkpoint" + f"-par-{j}{ending_steps}" + experiment_write_to_path = static_write_to_path + "/" + checkpoint_name + + pretrain.resume.resume_from_directory = experiment_read_from_path + pretrain.resume.resume_from_path = experiment_read_from_path + pretrain.log.ckpt.train_time_interval = None + + if args.save_ckpt: + pretrain.log.ckpt.dirpath = experiment_write_to_path + pretrain.log.ckpt.filename = "checkpoint" + + if static_max_steps is not None: + experiment_max_steps += static_max_steps + pretrain.trainer.callbacks = ( + original_callbacks + + [run.Config(PreemptiveStop, stop_on_step=experiment_max_steps)] + ) + if args.save_ckpt: + pretrain.log.ckpt.every_n_train_steps = experiment_max_steps + pretrain.log.ckpt.save_on_train_epoch_end = False + + experiment_read_from_path = experiment_write_to_path + "/checkpoint" + + exp.add( + pretrain, executor=executor, + name=f"{exp_name}_{j}_{starting_steps}{ending_steps}", + plugins=run_plugins + ) if args.dryrun: exp.dryrun() diff --git a/large_language_model/nemo/run_llama31.sh b/large_language_model/nemo/run_llama31.sh index 659b003fb..6d3847f1d 100644 --- a/large_language_model/nemo/run_llama31.sh +++ b/large_language_model/nemo/run_llama31.sh @@ -36,6 +36,7 @@ git config --global --add safe.directory /workspace/llama31 # Model settings : "${MODEL_CKPT:?MODEL_CKPT not set}" : "${USE_CKPT:?USE_CKPT not set}" +: "${CONTINUAL_CKPT:?CONTINUAL_CKPT not set}" # Vars with defaults # Slurm settings @@ -46,12 +47,15 @@ git config --global --add safe.directory /workspace/llama31 # Job settings : "${NEMO_DIR:=""}" # Provide customized NeMo path here +: "${NEMO_RUN_DIR:=""}" # Provide customized NeMo-Run path here : "${TMP_NPY_INDEX:=""}" # Provide temporary NNumpy Index saving 
directory +: "${MAX_RETRIES:=0}" # Model settings : "${SIZE:="405b"}" -: "${GBS:=288}" +: "${GBS:=1152}" : "${MBS:=1}" +: "${START_STEPS:=0}" # Dataloader settings : "${EVAL_EVERY:=""}" @@ -61,10 +65,13 @@ git config --global --add safe.directory /workspace/llama31 # Experiment settings : "${SEED:=$RANDOM}" : "${NEXP:=1}" +: "${NPAR:=1}" +: "${SAVE_CKPT:=1}" +: "${TAG:=""}" # Run -MOUNTS="${JOB_DIR}:/output,${PREPROCESSED_DATA}:/preproc_data,${MODEL_CKPT}:/checkpoint,${TOKENIZER}:/tokenizer" +MOUNTS="${JOB_DIR}:/output,${PREPROCESSED_DATA}:/preproc_data,${MODEL_CKPT}:/checkpoint,${TOKENIZER}:/tokenizer,${CONTINUAL_CKPT}:/continual" CKPT_OPTION="" @@ -74,10 +81,18 @@ if [ $USE_CKPT -gt 0 ]; then CMD_SUFFIX="${CMD_SUFFIX} --use_ckpt" fi +if [ $SAVE_CKPT -gt 0 ]; then + CMD_SUFFIX="${CMD_SUFFIX} --save_ckpt" +fi + if [ ! $NEMO_DIR = "" ]; then MOUNTS="${MOUNTS},${NEMO_DIR}:/opt/NeMo" fi +if [ ! $NEMO_RUN_DIR = "" ]; then + MOUNTS="${MOUNTS},${NEMO_RUN_DIR}:/opt/NeMo-Run" +fi + if [ ! $TMP_NPY_INDEX = "" ]; then MOUNTS="${MOUNTS},${TMP_NPY_INDEX}:/npy_index" fi @@ -98,6 +113,10 @@ if [ ! $MAX_STEPS = "" ]; then CMD_SUFFIX="${CMD_SUFFIX} --max_steps ${MAX_STEPS}" fi +if [ ! $TAG = "" ]; then + CMD_SUFFIX="${CMD_SUFFIX} --tag ${TAG}" +fi + set -x python3 pretrain_llama31.py \ @@ -112,6 +131,10 @@ python3 pretrain_llama31.py \ --gbs $GBS --mbs $MBS \ --seed $SEED \ --num_exps $NEXP \ ---ckpt_path /checkpoint \ +--num_pars $NPAR \ +--initial_ckpt_path /checkpoint \ +--continual_ckpt_path /continual \ --tokenizer_path /tokenizer \ +--ckpt_start_step $START_STEPS \ +--max_retries $MAX_RETRIES \ $CMD_SUFFIX diff --git a/large_language_model/nemo/utils/nemo_convert.py b/large_language_model/nemo/utils/nemo_convert.py deleted file mode 100644 index 78d67327d..000000000 --- a/large_language_model/nemo/utils/nemo_convert.py +++ /dev/null @@ -1,10 +0,0 @@ -if __name__ == "__main__": - import argparse - from nemo.collections.llm.gpt.model.llama import HFLlamaImporter - parser = argparse.ArgumentParser() - parser.add_argument("--source", default="/source", type=str) - parser.add_argument("--destination", default="/destination", type=str) - args = parser.parse_args() - - importer = HFLlamaImporter(args.source) - importer.apply(args.destination) diff --git a/large_language_model/nemo/utils/preprocess.sh b/large_language_model/nemo/utils/preprocess.sh index ed6100f33..5be604458 100644 --- a/large_language_model/nemo/utils/preprocess.sh +++ b/large_language_model/nemo/utils/preprocess.sh @@ -7,11 +7,11 @@ set -e : "${CONT_IMAGE_URL:?CONT_IMAGE_URL not set}" -: "${NEMO_MODEL_PATH:?NEMO_MODEL_PATH not set}" +: "${TOKENIZER_PATH:?TOKENIZER_PATH not set}" : "${MERGED_C4_PATH:?MERGED_C4_PATH not set}" : "${PREPROCESSED_PATH:?PREPROCESSED_PATH not set}" -container_maps="${NEMO_MODEL_PATH}/nemo_tokenizer:/llama3.1-tokenizer,${MERGED_C4_PATH}:/dataset,${PREPROCESSED_PATH}:/outputs" +container_maps="${TOKENIZER_PATH}:/tokenizer,${MERGED_C4_PATH}:/dataset,${PREPROCESSED_PATH}:/outputs" for index in {0..7}; do srun --nodes=1 --ntasks-per-node=1 \ @@ -19,7 +19,7 @@ for index in {0..7}; do python3 /opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ --input "/dataset/c4-train.en_${index}.json.gz" \ --output-prefix "/outputs/c4-train.en_${index}" \ - --tokenizer-library huggingface --tokenizer-type /llama3.1-tokenizer \ + --tokenizer-library huggingface --tokenizer-type /tokenizer \ --dataset-impl mmap --workers 128 & done @@ -29,6 +29,6 @@ srun --nodes=1 --ntasks-per-node=1 \ python3 
/opt/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ --input "/dataset/c4-validation.en.json.gz" \ --output-prefix "/outputs/c4-validation.en" \ - --tokenizer-library huggingface --tokenizer-type /llama3.1-tokenizer \ + --tokenizer-library huggingface --tokenizer-type /tokenizer \ --dataset-impl mmap --workers 128 & wait From 9eeb1cb5c08e45f9197102cb3212a6e99bdc337d Mon Sep 17 00:00:00 2001 From: Yunzhou Liu Date: Thu, 19 Dec 2024 23:10:49 -0800 Subject: [PATCH 05/10] adds MLPerf callbacks --- large_language_model/nemo/Dockerfile | 1 + large_language_model/nemo/callbacks.py | 194 +++++++++++++++++- large_language_model/nemo/pretrain_llama31.py | 70 ++++++- large_language_model/nemo/run_llama31.sh | 2 +- 4 files changed, 257 insertions(+), 10 deletions(-) diff --git a/large_language_model/nemo/Dockerfile b/large_language_model/nemo/Dockerfile index 2b3ba7097..f2b5ff3b3 100644 --- a/large_language_model/nemo/Dockerfile +++ b/large_language_model/nemo/Dockerfile @@ -18,6 +18,7 @@ FROM ${NEMO_BASE_IMAGE} AS nemo-base-image RUN pip uninstall transformers -y RUN pip install transformers blobfile RUN pip install prettytable +RUN pip install git+https://github.com/mlcommons/logging.git@4.1.0-rc3 # setup workspace WORKDIR /workspace/llama31 diff --git a/large_language_model/nemo/callbacks.py b/large_language_model/nemo/callbacks.py index cca20baad..ba13a6494 100644 --- a/large_language_model/nemo/callbacks.py +++ b/large_language_model/nemo/callbacks.py @@ -12,6 +12,55 @@ # See the License for the specific language governing permissions and # limitations under the License. + +### MLLogger +from mlperf_logging import mllog +from mlperf_logging.mllog import constants +import torch.distributed as dist + +def is_dist_avail_and_initialized(): + return (dist.is_available() and dist.is_initialized()) + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + +def barrier(): + if not is_dist_avail_and_initialized(): + return + + dist.barrier() + +class MLLogger: + def __init__(self, filepath="/mlperf-outputs/mlperf_llama31.log", default_stack_offset=2): + self.logger = mllog.get_mllogger() + mllog.config(default_stack_offset=default_stack_offset, filename=filepath) + + def start(self, **kwargs): + if get_rank() == 0: + self.logger.start(**kwargs) + + def end(self, **kwargs): + if get_rank() == 0: + self.logger.end(**kwargs) + + def event(self, **kwargs): + if get_rank() == 0: + self.logger.event(**kwargs) + + def submission_info(self): + self.event(key=constants.SUBMISSION_BENCHMARK, value="llama31") + self.event(key=constants.SUBMISSION_ORG, value="reference_implementation") + self.event(key=constants.SUBMISSION_DIVISION, value=constants.CLOSED) + self.event(key=constants.SUBMISSION_STATUS, value=constants.ONPREM) + self.event(key=constants.SUBMISSION_PLATFORM, value="DGX-H100") + self.event(key=constants.SUBMISSION_POC_NAME, value="Yunzhou Liu") + self.event(key=constants.SUBMISSION_POC_EMAIL, value="yunzhoul@nvidia.com") + +mllogger = MLLogger() + +### Preemptive checkpoint callbacks import lightning.pytorch as pl from nemo.utils import logging @@ -28,11 +77,152 @@ def __init__(self, stop_on_step: int): self.stop_on_step = stop_on_step def on_train_batch_end( - self, trainer: pl.Trainer, pl_module: pl.LightningModule, outputs, batch, batch_idx + self, trainer, pl_module, outputs, batch, batch_idx ) -> None: if trainer.global_step >= self.stop_on_step: logging.info(f"Global step {trainer.global_step} >= {self.stop_on_step}, signaling Trainer to 
stop.") trainer.should_stop = True # skip EarlyStopping validation unless val_check_interval met if trainer.global_step % trainer.val_check_interval != 0: - trainer.limit_val_batches = 0 \ No newline at end of file + trainer.limit_val_batches = 0 + + +### Metrics Logger +from pytorch_lightning.loggers import Logger +from pytorch_lightning.utilities import rank_zero_only + +class MetricsLogger(Logger): + def __init__( + self, + init_global_step, global_batch_size, seq_length, + target_log_ppl, + train_loss_key = "reduced_train_loss", + val_loss_key = "val_loss" + ): + super().__init__() + + self.init_global_step = init_global_step + self.gbs = global_batch_size + self.seq_len = seq_length + + self.target = target_log_ppl + self.train_loss_key = train_loss_key + self.val_loss_key = val_loss_key + self.is_target_reached = False + + def log_metrics(self, metrics, step): + if self.val_loss_key in metrics: + self.log_validation_loss(metrics, step) + + def log_validation_loss(self, metrics, step): + consumed_tokens = (step - self.init_global_step) * self.gbs * self.seq_len + + loss = metrics[self.val_loss_key] + + mllogger.event(key=constants.EVAL_ACCURACY, value=loss, metadata={'epoch_num': consumed_tokens}) + + if not self.is_target_reached and loss <= self.target: + self.is_target_reached = True + + @rank_zero_only + def log_hyperparams(self, params, *args, **kwargs): + pass + + @property + def name(self): + return 'mlperf-metrics' + + @property + def version(self): + return 1 + +### MLPerf callbacks +def compute_consumed_mllog_tokens(trainer, init_global_step, global_batch_size, seq_length): + steps_since_resume = trainer.global_step - init_global_step + consumed_samples = ( + steps_since_resume * global_batch_size + ) + return int(consumed_samples) * seq_length + +class MLPerfCallback(pl.Callback): + def __init__( + self, + global_batch_size, + micro_batch_size, + sequence_length, + init_global_step, + configs={} + ): + mllogger.event(key=constants.CACHE_CLEAR, value=True) + mllogger.start(key=constants.INIT_START) + super().__init__() + + self.init_global_step = init_global_step + self.gbs = global_batch_size + self.mbs = micro_batch_size + self.seq_len = sequence_length + + self.is_target_reached = False + self.status = constants.ABORTED + self.configs = configs + + def consumed_tokens(self, trainer): + return compute_consumed_mllog_tokens(trainer, self.init_global_step, self.gbs, self.seq_len) + + def set_success_status(self): + self.status = constants.SUCCESS + self.is_target_reached = True + + @rank_zero_only + def on_train_epoch_start(self, trainer, pl_module): + mllogger.start(key=constants.EPOCH_START, metadata={'epoch_num': self.consumed_tokens(trainer)}) + mllogger.start(key=constants.BLOCK_START, metadata={"epoch_num": self.consumed_tokens(trainer)}) + + return super().on_train_epoch_start(trainer, pl_module) + + @rank_zero_only + def on_train_epoch_end(self, trainer, pl_module): + mllogger.end(key=constants.EPOCH_STOP, metadata={'epoch_num': self.consumed_tokens(trainer)}) + return super().on_train_epoch_end(trainer, pl_module) + + def on_train_end(self, trainer, pl_module): + # for every occurrences, run on all ranks to allow sync + barrier() + mllogger.end(key=constants.RUN_STOP, metadata={"status": self.status}) + mllogger.event(key="trained_samples", value=self.consumed_tokens(trainer)) + return super().on_train_end(trainer, pl_module) + + @rank_zero_only + def on_validation_start(self, trainer, pl_module): + mllogger.end(key=constants.BLOCK_STOP, metadata={'epoch_num': 
self.consumed_tokens(trainer)}) + mllogger.start(key=constants.EVAL_START, metadata={'epoch_num': self.consumed_tokens(trainer)}) + return super().on_validation_start(trainer, pl_module) + + def on_validation_end(self, trainer, pl_module): + mllogger.end(key=constants.EVAL_STOP, metadata={'epoch_num': self.consumed_tokens(trainer)}) + + for logger in trainer.loggers: + if isinstance(logger, MetricsLogger): + if logger.is_target_reached: + trainer.should_stop = True + + if not trainer.should_stop: + mllogger.start(key=constants.BLOCK_START, metadata={"epoch_num": self.consumed_tokens(trainer)}) + + return super().on_validation_end(trainer, pl_module) + + @rank_zero_only + def load_state_dict(self, state_dict): + print(f":::MLLOG Weight initialization: {state_dict.keys()}") + return super().load_state_dict(state_dict) + + def on_train_start(self, trainer, pl_module): + # run on all ranks to allow synchronization + barrier() + mllogger.submission_info() + + for key, value in self.configs.items(): + mllogger.event(key=key, value=value) + + mllogger.end(key=constants.INIT_STOP) + mllogger.start(key=constants.RUN_START) \ No newline at end of file diff --git a/large_language_model/nemo/pretrain_llama31.py b/large_language_model/nemo/pretrain_llama31.py index c68eb0da0..6240354d9 100644 --- a/large_language_model/nemo/pretrain_llama31.py +++ b/large_language_model/nemo/pretrain_llama31.py @@ -22,7 +22,7 @@ import nemo_run as run from nemo.lightning.run import plugins from nemo.collections.llm.gpt.data import build_pretraining_datamodule -from callbacks import PreemptiveStop +from callbacks import PreemptiveStop, MLPerfCallback, MetricsLogger def slurm_executor( user: str, @@ -289,6 +289,7 @@ def get_parser() -> argparse.ArgumentParser: experiment_group.add_argument("--seed", type=int, default=1234, help="random seed") experiment_group.add_argument("--num_exps", type=int, default=1) experiment_group.add_argument("--num_pars", type=int, default=1) + experiment_group.add_argument("--target_log_ppl", type=float, default=1) return parser @@ -337,13 +338,46 @@ def get_parser() -> argparse.ArgumentParser: ) assert args.gbs % pretrain.trainer.strategy.pipeline_model_parallel_size == 0, "GBS should be divisible by PP" + + # Collect all HP configs + from mlperf_logging.mllog import constants + + tp = pretrain.trainer.strategy.tensor_model_parallel_size + pp = pretrain.trainer.strategy.pipeline_model_parallel_size + cp = pretrain.trainer.strategy.context_parallel_size + dp = (pretrain.trainer.num_nodes * pretrain.trainer.devices) // (tp * pp * cp) + mini_batch_size = (args.gbs // dp) + grad_accumulation_steps = mini_batch_size // args.mbs + + configs = { + # seeds + constants.SEED: args.seed, + + # HPs + constants.GLOBAL_BATCH_SIZE: args.gbs, + constants.GRADIENT_ACCUMULATION_STEPS: grad_accumulation_steps, + constants.MAX_SEQUENCE_LENGTH: 8192, + constants.EVAL_SAMPLES: "to be determined", + + # Optimizers + constants.OPT_NAME: pretrain.optim.config.optimizer, + constants.OPT_BASE_LR: pretrain.optim.config.lr, + constants.OPT_ADAM_BETA_1: pretrain.optim.config.adam_beta1, + constants.OPT_ADAM_BETA_2: pretrain.optim.config.adam_beta2, + constants.OPT_ADAM_EPSILON: pretrain.optim.config.adam_eps, + constants.OPT_WEIGHT_DECAY: pretrain.optim.config.weight_decay, + constants.OPT_GRADIENT_CLIP_NORM: pretrain.optim.config.clip_grad, + + # Schedulers + constants.OPT_END_LR: pretrain.optim.lr_scheduler.min_lr, + constants.OPT_LR_WARMUP_STEPS: pretrain.optim.lr_scheduler.warmup_steps, + 
constants.OPT_LR_DECAY_STEPS: pretrain.trainer.max_steps - pretrain.optim.lr_scheduler.warmup_steps, + constants.OPT_LR_DECAY_SCHEDULE: "cosine with linear warmups", + } # Override config for MLPerf pretrain.trainer.num_sanity_val_steps = 0 - # insert plugins and callbacks here - # pretrain.trainer.callbacks.append(...) - run_plugins = [ plugins.PerfEnvPlugin(), ] @@ -386,7 +420,7 @@ def get_parser() -> argparse.ArgumentParser: with run.Experiment(exp_name) as exp: exp.add(build_data_index, executor=data_index_executor, name="build_data_index") - + for j in range(args.num_pars): ending_steps = "" starting_steps = f"{experiment_max_steps}" @@ -405,11 +439,33 @@ def get_parser() -> argparse.ArgumentParser: pretrain.log.ckpt.filename = "checkpoint" if static_max_steps is not None: + start_step = experiment_max_steps experiment_max_steps += static_max_steps + configs[constants.INIT_CHECKPOINT_STEP] = start_step pretrain.trainer.callbacks = ( - original_callbacks + - [run.Config(PreemptiveStop, stop_on_step=experiment_max_steps)] + original_callbacks + [ + run.Config(PreemptiveStop, stop_on_step=experiment_max_steps), + run.Config( + MLPerfCallback, + global_batch_size=args.gbs, + micro_batch_size=args.mbs, + sequence_length=8192, + init_global_step=start_step, + configs=configs, + ), + ] ) + + pretrain.log.extra_loggers = [ + run.Config( + MetricsLogger, + init_global_step=start_step, + global_batch_size=args.gbs, + seq_length=8192, + target_log_ppl=args.target_log_ppl + ), + ] + if args.save_ckpt: pretrain.log.ckpt.every_n_train_steps = experiment_max_steps pretrain.log.ckpt.save_on_train_epoch_end = False diff --git a/large_language_model/nemo/run_llama31.sh b/large_language_model/nemo/run_llama31.sh index 6d3847f1d..48650f263 100644 --- a/large_language_model/nemo/run_llama31.sh +++ b/large_language_model/nemo/run_llama31.sh @@ -71,7 +71,7 @@ git config --global --add safe.directory /workspace/llama31 # Run -MOUNTS="${JOB_DIR}:/output,${PREPROCESSED_DATA}:/preproc_data,${MODEL_CKPT}:/checkpoint,${TOKENIZER}:/tokenizer,${CONTINUAL_CKPT}:/continual" +MOUNTS="${JOB_DIR}:/output,${JOB_DIR}:/mlperf-outputs,${PREPROCESSED_DATA}:/preproc_data,${MODEL_CKPT}:/checkpoint,${TOKENIZER}:/tokenizer,${CONTINUAL_CKPT}:/continual" CKPT_OPTION="" From 46784abb1173476b525b24322bcbdc55ea64be72 Mon Sep 17 00:00:00 2001 From: Yunzhou Liu Date: Tue, 7 Jan 2025 23:49:39 -0800 Subject: [PATCH 06/10] Changes the dataset sources and adds multiple seeds --- large_language_model/nemo/Dockerfile | 3 + large_language_model/nemo/config.sh | 5 +- large_language_model/nemo/mcore.patch | 90 +++++++++++++++++++ large_language_model/nemo/pretrain_llama31.py | 38 ++++++-- large_language_model/nemo/run_llama31.sh | 8 +- 5 files changed, 132 insertions(+), 12 deletions(-) create mode 100644 large_language_model/nemo/mcore.patch diff --git a/large_language_model/nemo/Dockerfile b/large_language_model/nemo/Dockerfile index f2b5ff3b3..6932f3c73 100644 --- a/large_language_model/nemo/Dockerfile +++ b/large_language_model/nemo/Dockerfile @@ -23,3 +23,6 @@ RUN pip install git+https://github.com/mlcommons/logging.git@4.1.0-rc3 # setup workspace WORKDIR /workspace/llama31 COPY . . 
+ +# Fixes the validation dataset order +RUN patch --directory=/opt/megatron-lm -p1 < mcore.patch diff --git a/large_language_model/nemo/config.sh b/large_language_model/nemo/config.sh index 62671b900..19d73899e 100644 --- a/large_language_model/nemo/config.sh +++ b/large_language_model/nemo/config.sh @@ -85,4 +85,7 @@ export START_STEPS="" # Experiment manager: Number of experiments to launch export NEXP=0 # Experiment manager: how many consecutive jobs we want for each experiment -export NPAR=0 \ No newline at end of file +export NPAR=0 +# Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236" +# The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP. +export SEEDS="" \ No newline at end of file diff --git a/large_language_model/nemo/mcore.patch b/large_language_model/nemo/mcore.patch new file mode 100644 index 000000000..5a9639e64 --- /dev/null +++ b/large_language_model/nemo/mcore.patch @@ -0,0 +1,90 @@ +diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py +index 2eb7702b..d1f0b9a9 100644 +--- a/megatron/core/datasets/gpt_dataset.py ++++ b/megatron/core/datasets/gpt_dataset.py +@@ -407,9 +407,10 @@ class GPTDataset(MegatronDataset): + + numpy_random_state = numpy.random.RandomState(self.config.random_seed) + ++ shuffle = self.index_split == Split.train + # Build the document index + document_index = _build_document_index( +- self.indices, num_epochs, numpy_random_state, separate_final_epoch ++ self.indices, num_epochs, numpy_random_state, separate_final_epoch, shuffle + ) + + drop_last_partial_sequence = True +@@ -450,11 +451,11 @@ class GPTDataset(MegatronDataset): + # Build the shuffle index + if separate_final_epoch: + shuffle_index = _build_shuffle_index( +- num_samples_sans_final_epoch, sample_index.shape[0] - 1, numpy_random_state ++ num_samples_sans_final_epoch, sample_index.shape[0] - 1, numpy_random_state, shuffle + ) + else: + shuffle_index = _build_shuffle_index( +- sample_index.shape[0] - 1, sample_index.shape[0] - 1, numpy_random_state ++ sample_index.shape[0] - 1, sample_index.shape[0] - 1, numpy_random_state, shuffle + ) + + if path_to_cache: +@@ -558,6 +559,7 @@ def _build_document_index( + num_epochs: int, + numpy_random_state: numpy.random.RandomState, + separate_final_epoch: bool, ++ shuffle: bool = True, + ) -> numpy.ndarray: + """Build an array with length = num epochs * num documents + +@@ -578,7 +580,8 @@ def _build_document_index( + document_index[:] = documents + document_index = document_index.reshape(-1) + document_index = document_index.astype(numpy.int32) +- numpy_random_state.shuffle(document_index) ++ if shuffle: ++ numpy_random_state.shuffle(document_index) + return document_index + + doc_idx_first = _build_document_index(documents, num_epochs - 1, numpy_random_state, False) +@@ -587,7 +590,8 @@ def _build_document_index( + + + def _build_shuffle_index( +- num_samples: int, total_size: int, numpy_random_state: numpy.random.RandomState ++ num_samples: int, total_size: int, numpy_random_state: numpy.random.RandomState, ++ shuffle: bool = True + ) -> numpy.ndarray: + """Build the range [0, size) and shuffle + +@@ -607,12 +611,16 @@ def _build_shuffle_index( + dtype_ = numpy.int64 + + shuffle_idx_first = numpy.arange(start=0, stop=num_samples, step=1, dtype=dtype_) +- numpy_random_state.shuffle(shuffle_idx_first) ++ ++ if shuffle: ++ numpy_random_state.shuffle(shuffle_idx_first) + if num_samples == total_size: + return 
shuffle_idx_first + + shuffle_idx_last = numpy.arange(start=num_samples, stop=total_size, step=1, dtype=dtype_) +- numpy_random_state.shuffle(shuffle_idx_last) ++ ++ if shuffle: ++ numpy_random_state.shuffle(shuffle_idx_last) + + return numpy.concatenate((shuffle_idx_first, shuffle_idx_last)) + +diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py +index 0c1504d4..71d29629 100644 +--- a/megatron/core/transformer/moe/moe_utils.py ++++ b/megatron/core/transformer/moe/moe_utils.py +@@ -264,6 +264,7 @@ def topk_softmax_with_capacity( + # Pre softmax + scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits) + probs, top_indices = torch.topk(scores, k=topk, dim=1) ++ probs /= probs.sum(dim=-1, keepdim=True) + else: + # Post softmax + if topk == 1: + diff --git a/large_language_model/nemo/pretrain_llama31.py b/large_language_model/nemo/pretrain_llama31.py index 6240354d9..37fccd723 100644 --- a/large_language_model/nemo/pretrain_llama31.py +++ b/large_language_model/nemo/pretrain_llama31.py @@ -190,10 +190,17 @@ def get_data( seq_length: Optional[int] = 8192, tokenizer_path: Optional[str] = "", seed: Optional[int] = 1234, + use_full_dataset: Optional[bool] = False, ) -> run.Config: tokenizer = run.Config(AutoTokenizer, pretrained_model_name=tokenizer_path) - train_datasets = sum([["12.5", f"/preproc_data/c4-train.en_{idx}_text_document"] for idx in range(8)], []) + train_datasets = None + + if use_full_dataset: + train_datasets = sum([["12.5", f"/preproc_data/c4-train.en_{idx}_text_document"] for idx in range(8)], []) + else: + train_datasets = sum([["50", f"/preproc_data/c4-train.en_{idx}_text_document"] for idx in range(6, 8)], []) + data_paths = { "train": train_datasets, "validation": [ @@ -208,7 +215,7 @@ def get_data( llm.PreTrainingDataModule, tokenizer=tokenizer, paths=data_paths, - num_workers=2, # TODO: make it configurable + num_workers=8, # TODO: make it configurable seq_length=seq_length, global_batch_size=gbs, micro_batch_size=mbs, @@ -282,11 +289,12 @@ def get_parser() -> argparse.ArgumentParser: data_group.add_argument("--eval_every", type=int, default=10) data_group.add_argument("--eval_batches", type=int, default=None) data_group.add_argument('--max_steps', type=int, default=None) + data_group.add_argument("--use_full_dataset", action="store_true", help="Whether we use the full dataset or use the last 256/1024 dataset") data_group.add_argument("--tokenizer_path", type=str, help="Tokenizer path that's used to tokenize the dataset") experiment_group = parser.add_argument_group("Experiment management arguments") experiment_group.add_argument("--dryrun", action="store_true", help="Whether we are launching dryrun or actual runs") - experiment_group.add_argument("--seed", type=int, default=1234, help="random seed") + experiment_group.add_argument("--seeds", type=int, nargs="*", default=[], help="random seeds") experiment_group.add_argument("--num_exps", type=int, default=1) experiment_group.add_argument("--num_pars", type=int, default=1) experiment_group.add_argument("--target_log_ppl", type=float, default=1) @@ -325,7 +333,8 @@ def get_parser() -> argparse.ArgumentParser: mbs=args.mbs, seq_length=seq_length, tokenizer_path=args.tokenizer_path, - seed=args.seed, + seed=1234, # overwritten in each experiments + use_full_dataset=args.use_full_dataset, ) exp_prefix, pretrain = get_pretrain( @@ -350,9 +359,6 @@ def get_parser() -> argparse.ArgumentParser: grad_accumulation_steps = mini_batch_size // args.mbs configs = { - 
# seeds - constants.SEED: args.seed, - # HPs constants.GLOBAL_BATCH_SIZE: args.gbs, constants.GRADIENT_ACCUMULATION_STEPS: grad_accumulation_steps, @@ -402,6 +408,7 @@ def get_parser() -> argparse.ArgumentParser: data_index_executor.nodes = 1 data_index_executor.ntasks_per_node = 1 data_index_executor.retries = 1 + data_index_executor.time = "02:00:00" static_read_from_path = args.initial_ckpt_path if args.use_ckpt else None static_write_to_path = args.continual_ckpt_path @@ -412,8 +419,21 @@ def get_parser() -> argparse.ArgumentParser: original_callbacks = pretrain.trainer.callbacks - for i in range(args.num_exps): - exp_name = f"{exp_prefix}_{i}" + random_seeds = args.seeds + if len(random_seeds) < args.num_exps: + import random + random_seeds = random_seeds + [random.randint(0, 32767) for _ in range(args.num_exps - len(random_seeds))] + print(f"Missing {args.num_exps - len(random_seeds)} seeds, padding the random seeds to {random_seeds}") + + random_seeds = random_seeds[:args.num_exps] + + for index, seed in enumerate(random_seeds): + # sets the seeds + pretrain.data.seed = seed + build_data_index.datamodule.seed = seed + configs[constants.SEED] = seed + + exp_name = f"{exp_prefix}_{index}_seed_{seed}" experiment_read_from_path = static_read_from_path experiment_write_to_path = static_write_to_path experiment_max_steps = args.ckpt_start_step diff --git a/large_language_model/nemo/run_llama31.sh b/large_language_model/nemo/run_llama31.sh index 48650f263..d1390b009 100644 --- a/large_language_model/nemo/run_llama31.sh +++ b/large_language_model/nemo/run_llama31.sh @@ -63,7 +63,8 @@ git config --global --add safe.directory /workspace/llama31 : "${MAX_STEPS:=""}" # Experiment settings -: "${SEED:=$RANDOM}" +: "${SEEDS:=""}" +IFS=" " read -ra seeds <<< $SEEDS : "${NEXP:=1}" : "${NPAR:=1}" : "${SAVE_CKPT:=1}" @@ -117,6 +118,9 @@ if [ ! $TAG = "" ]; then CMD_SUFFIX="${CMD_SUFFIX} --tag ${TAG}" fi +# Allows MLLogger objects to be constructed locally +if [ ! 
-d /mlperf-outputs ]; then mkdir /mlperf-outputs; fi + set -x python3 pretrain_llama31.py \ @@ -129,7 +133,7 @@ python3 pretrain_llama31.py \ --image $IMAGE \ --size $SIZE \ --gbs $GBS --mbs $MBS \ ---seed $SEED \ +--seeds ${seeds[@]} \ --num_exps $NEXP \ --num_pars $NPAR \ --initial_ckpt_path /checkpoint \ From e8c0a7c50f77f4700c0f7e4ec97731905c44e3f1 Mon Sep 17 00:00:00 2001 From: Yunzhou Liu Date: Thu, 9 Jan 2025 18:13:40 -0800 Subject: [PATCH 07/10] Resolves comments --- large_language_model/nemo/Dockerfile | 4 +- large_language_model/nemo/README.md | 94 ++++++++++++++----- large_language_model/nemo/callbacks.py | 4 +- large_language_model/nemo/config.sh | 4 +- large_language_model/nemo/pretrain_llama31.py | 4 +- large_language_model/nemo/run_llama31.sh | 2 +- 6 files changed, 83 insertions(+), 29 deletions(-) diff --git a/large_language_model/nemo/Dockerfile b/large_language_model/nemo/Dockerfile index 6932f3c73..a609ea89e 100644 --- a/large_language_model/nemo/Dockerfile +++ b/large_language_model/nemo/Dockerfile @@ -16,8 +16,8 @@ ARG NEMO_BASE_IMAGE=nvcr.io/nvidia/nemo:24.12-rc0 FROM ${NEMO_BASE_IMAGE} AS nemo-base-image RUN pip uninstall transformers -y -RUN pip install transformers blobfile -RUN pip install prettytable +RUN pip install transformers==4.47.1 blobfile==3.0.0 +RUN pip install prettytable==3.12.0 RUN pip install git+https://github.com/mlcommons/logging.git@4.1.0-rc3 # setup workspace diff --git a/large_language_model/nemo/README.md b/large_language_model/nemo/README.md index abcd79c29..153946876 100644 --- a/large_language_model/nemo/README.md +++ b/large_language_model/nemo/README.md @@ -53,36 +53,50 @@ We use the c4/en/3.0.1 dataset from [HuggingFace/AllenAI](https://huggingface.co We use the Mixtral 8x22B tokenizer from [HuggingFace/MistralAI](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1). -### Tokenizer +### Preprocessed data download -We use Mixtral 8x22B tokenizer in this benchmark. Tokenizer files can be downloaded [here](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/tree/main). Only the five files containing tokenizer-related contents (`special_tokens_map.json`, `tokenizer.json`, `tokenizer.model`, `tokenizer.model.v1`, `tokenizer_config.json`) are needed. +The pre-tokenized dataset and the tokenizer are available to download from the S3 bucket. You can download this data from the bucket using RClone as follows: -### Data preprocessing +To run Rclone on Windows, you can download the executable here. To install Rclone on Linux/macOS/BSD systems, run: -Run the following commands to merge all 1024 training files into 8 `json.gz` files and all 8 validation files into a single `json.gz` file. Each of the `json.gz` files will be preprocessed into a pair of megatron dataset files (`.bin` and `.idx`). +``` +sudo -v ; curl https://rclone.org/install.sh | sudo bash +``` -```bash -export C4_PATH="" -export MERGED_C4_PATH="" +Once Rclone is installed, run the following command to authenticate with the bucket: -bash consolidate_data.sh +``` +to be filled with access keys ``` -After the data consolidation is done, we can run this [script](./utils/preprocess.sh) to perform preprocessing. 
To run the preprocessing script, we need to use the following commands: +You can then navigate in the terminal to your desired download directory and run the following commands to download the dataset and checkpoints: -```bash -# fill in the built container path here -export CONT_IMAGE_URL="" -# pass in the folder path that contains the Mixtral tokenizer here -# please refer to the tokenizer section above for more details -export TOKENIZER_PATH="" -# pass in the merged file path here -export MERGED_C4_PATH="" -# this path is used for storing the preprocessed .bin and .idx files -export PREPROCESSED_PATH="" +#### Dataset -sbatch preprocess.sh ``` +# Replace this path with your desired path on the machine +export PREPROCESSED_PATH="./" +rclone copy mlc-training-write:mlcommons-training-wg-public/llama3_1/datasets/c4 $PREPROCESSED_PATH -P +``` + +After the download is complete, you should see files with the following naming conventions under `PREPROCESSED_PATH`, ending with both `.idx` and `.bin`: +- Training partitions: `c4-train.en__text_document` +- Validation partitions: `c4-validation.en_text_document` + +#### Tokenizer + +``` +# Replace this path with your desired path on the machine +export TOKENIZER_PATH="./" +rclone copy mlc-training-write:mlcommons-training-wg-public/llama3_1/datasets/tokenizer $TOKENIZER_PATH -P +``` + +After the download is complete, you should see five files under `TOKENIZER_PATH`: +- `special_tokens_map.json` +- `tokenizer.json` +- `tokenizer.model` +- `tokenizer.model.v1` +- `tokenizer_config.json` ### Training and test data separation @@ -148,4 +162,42 @@ To be determined. ### Evaluation thoroughness -To be determined. \ No newline at end of file +To be determined. + + +# 6. Other + +### Data Preprocessing + +Here are the instructions to prepare the preprocessed dataset from scratch. Data preprocessing is already done and the final dataset can be accessed by following instructions in the [Preprocessed data download]() section. + +#### Tokenizer + +We use Mixtral 8x22B tokenizer in this benchmark. Tokenizer files can be downloaded [here](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/tree/main). Only the five files containing tokenizer-related contents (`special_tokens_map.json`, `tokenizer.json`, `tokenizer.model`, `tokenizer.model.v1`, `tokenizer_config.json`) are needed. + +#### Run Data preprocessing + +Run the following commands to merge all 1024 training files into 8 `json.gz` files and all 8 validation files into a single `json.gz` file. Each of the `json.gz` files will be preprocessed into a pair of megatron dataset files (`.bin` and `.idx`). + +```bash +export C4_PATH="" +export MERGED_C4_PATH="" + +bash consolidate_data.sh +``` + +After the data consolidation is done, we can run this [script](./utils/preprocess.sh) to perform preprocessing. 
To run the preprocessing script, we need to use the following commands: + +```bash +# fill in the built container path here +export CONT_IMAGE_URL="" +# pass in the folder path that contains the Mixtral tokenizer here +# please refer to the tokenizer section above for more details +export TOKENIZER_PATH="" +# pass in the merged file path here +export MERGED_C4_PATH="" +# this path is used for storing the preprocessed .bin and .idx files +export PREPROCESSED_PATH="" + +sbatch preprocess.sh +``` diff --git a/large_language_model/nemo/callbacks.py b/large_language_model/nemo/callbacks.py index ba13a6494..906dfee76 100644 --- a/large_language_model/nemo/callbacks.py +++ b/large_language_model/nemo/callbacks.py @@ -33,7 +33,7 @@ def barrier(): dist.barrier() class MLLogger: - def __init__(self, filepath="/mlperf-outputs/mlperf_llama31.log", default_stack_offset=2): + def __init__(self, filepath="/mlperf-outputs/mlperf_llama31_405b.log", default_stack_offset=2): self.logger = mllog.get_mllogger() mllog.config(default_stack_offset=default_stack_offset, filename=filepath) @@ -50,7 +50,7 @@ def event(self, **kwargs): self.logger.event(**kwargs) def submission_info(self): - self.event(key=constants.SUBMISSION_BENCHMARK, value="llama31") + self.event(key=constants.SUBMISSION_BENCHMARK, value="llama31_405b") self.event(key=constants.SUBMISSION_ORG, value="reference_implementation") self.event(key=constants.SUBMISSION_DIVISION, value=constants.CLOSED) self.event(key=constants.SUBMISSION_STATUS, value=constants.ONPREM) diff --git a/large_language_model/nemo/config.sh b/large_language_model/nemo/config.sh index 19d73899e..5585d59bf 100644 --- a/large_language_model/nemo/config.sh +++ b/large_language_model/nemo/config.sh @@ -35,11 +35,11 @@ export JOB_DIR="" # Image path, either local cache file or remote URL export IMAGE="" # Dataset: C4 dataset location that contains the dataset after preprocessing -export PREPROCESSED_DATA="" +export PREPROCESSED_PATH="" # Dataset: Numpy index working directory export TMP_NPY_INDEX="" # Dataset: Tokenizer path -export TOKENIZER="" +export TOKENIZER_PATH="" # Environment: NeMo remount export NEMO_DIR="" diff --git a/large_language_model/nemo/pretrain_llama31.py b/large_language_model/nemo/pretrain_llama31.py index 37fccd723..fd944b5c0 100644 --- a/large_language_model/nemo/pretrain_llama31.py +++ b/large_language_model/nemo/pretrain_llama31.py @@ -98,6 +98,8 @@ def get_pretrain( exp_name = size + # Providing 8B and 70B here for debugging purpose + # Actual benchmark should use 405B if size == "8b": pretrain = llm.llama3_8b.pretrain_recipe( dir="/outputs", @@ -268,7 +270,7 @@ def get_parser() -> argparse.ArgumentParser: model_group.add_argument( "--size", type=str, - default="8b", + default="405b", help="Choose the model to be trained", choices=[ "8b", # Llama 3 8B config for debugging diff --git a/large_language_model/nemo/run_llama31.sh b/large_language_model/nemo/run_llama31.sh index d1390b009..8d3e94c4d 100644 --- a/large_language_model/nemo/run_llama31.sh +++ b/large_language_model/nemo/run_llama31.sh @@ -72,7 +72,7 @@ IFS=" " read -ra seeds <<< $SEEDS # Run -MOUNTS="${JOB_DIR}:/output,${JOB_DIR}:/mlperf-outputs,${PREPROCESSED_DATA}:/preproc_data,${MODEL_CKPT}:/checkpoint,${TOKENIZER}:/tokenizer,${CONTINUAL_CKPT}:/continual" +MOUNTS="${JOB_DIR}:/output,${JOB_DIR}:/mlperf-outputs,${PREPROCESSED_PATH}:/preproc_data,${MODEL_CKPT}:/checkpoint,${TOKENIZER_PATH}:/tokenizer,${CONTINUAL_CKPT}:/continual" CKPT_OPTION="" From 5cda0c0f306c2c24e67ef225a9d123f7c9105e29 Mon Sep 
17 00:00:00 2001 From: Yunzhou Liu Date: Mon, 13 Jan 2025 10:48:29 -0800 Subject: [PATCH 08/10] updates instructions --- large_language_model/nemo/README.md | 6 +- large_language_model/nemo/config.sh | 67 +++++++++++-------- large_language_model/nemo/pretrain_llama31.py | 2 +- large_language_model/nemo/run_llama31.sh | 4 +- 4 files changed, 44 insertions(+), 35 deletions(-) diff --git a/large_language_model/nemo/README.md b/large_language_model/nemo/README.md index 153946876..ed0884497 100644 --- a/large_language_model/nemo/README.md +++ b/large_language_model/nemo/README.md @@ -66,7 +66,7 @@ sudo -v ; curl https://rclone.org/install.sh | sudo bash Once Rclone is installed, run the following command to authenticate with the bucket: ``` -to be filled with access keys +rclone config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com ``` You can then navigate in the terminal to your desired download directory and run the following commands to download the dataset and checkpoints: @@ -76,7 +76,7 @@ You can then navigate in the terminal to your desired download directory and run ``` # Replace this path with your desired path on the machine export PREPROCESSED_PATH="./" -rclone copy mlc-training-write:mlcommons-training-wg-public/llama3_1/datasets/c4 $PREPROCESSED_PATH -P +rclone copy mlc-training:mlcommons-training-wg-public/llama3_1/datasets/c4 $PREPROCESSED_PATH -P ``` After the download is complete, you should see files with the following naming conventions under `PREPROCESSED_PATH`, ending with both `.idx` and `.bin`: @@ -88,7 +88,7 @@ After the download is complete, you should see files with the following naming c ``` # Replace this path with your desired path on the machine export TOKENIZER_PATH="./" -rclone copy mlc-training-write:mlcommons-training-wg-public/llama3_1/datasets/tokenizer $TOKENIZER_PATH -P +rclone copy mlc-training:mlcommons-training-wg-public/llama3_1/datasets/tokenizer $TOKENIZER_PATH -P ``` After the download is complete, you should see five files under `TOKENIZER_PATH`: diff --git a/large_language_model/nemo/config.sh b/large_language_model/nemo/config.sh index 5585d59bf..e1624497d 100644 --- a/large_language_model/nemo/config.sh +++ b/large_language_model/nemo/config.sh @@ -20,34 +20,34 @@ export HOST="" export ACCOUNT="" # Slurm: partition for job submission export PARTITION="" -# Slurm: job time limit -export TIME="" -# Slurm: --nodes arguments -export NNODES=0 -# Slurm: --gpus_per_node and --ntasks_per_node argument -export GPUS_PER_NODE=0 -# Slurm: max job retries for transient job failures -export MAX_RETRIES=0 +# Slurm: job time limit, defaults to 4 hours +export TIME="04:00:00" +# Slurm: --nodes arguments, default to use 288 nodes +export NNODES=288 +# Slurm: --gpus_per_node and --ntasks_per_node argument, defaults to 8 GPUs per node +export GPUS_PER_NODE=8 +# Slurm: max job retries for transient job failures, defaults to retry 3 times +export MAX_RETRIES=3 # Folder mapping: -# Output directory that holds logs +# Output directory that holds logs, any path that you like. 
export JOB_DIR="" -# Image path, either local cache file or remote URL +# Image / container path, either local cache file or remote URL export IMAGE="" # Dataset: C4 dataset location that contains the dataset after preprocessing +# This corresponds to the PREPROCESSED_PATH in README section 3's dataset download part export PREPROCESSED_PATH="" -# Dataset: Numpy index working directory +# Dataset: Numpy index working directory, contains shuffled dataset +# This path must be able to hold >400GB data export TMP_NPY_INDEX="" # Dataset: Tokenizer path +# This corresponds to the TOKENIZER_PATH in README section 3's tokenizer download part export TOKENIZER_PATH="" -# Environment: NeMo remount -export NEMO_DIR="" - # Model: checkpoint and tokenizer path # This is the checkpoint that we want to start with. # Each checkpoint should be a folder containing two sub-folders: context and weights. -# And we need to pass this folder's path (the folder containing these two sub-folders) here. +# And we need to pass this folder's path (the folder containing context and weights) here. export MODEL_CKPT="" # Model: Continual checkpoint directory to write and resume # This is the directory to hold all intermediate checkpoints. @@ -55,37 +55,46 @@ export MODEL_CKPT="" # we should see a checkpoint written in this folder # with name `checkpoint-par-x-y-steps` # Inside this directory, there should be a `checkpoint` directory that holds context and weights -# which is the "actual checkpoint" +# which is the "actual checkpoint". +# Notice that this path must be able to hold at least 5.2TB data since each checkpoint is 5.2TB. export CONTINUAL_CKPT="" # Model: Whether we want to restore from MODEL_CKPT path. If 0, then we are not restoring. export USE_CKPT=0 -# Model: Whether we want to save a checkpoint. Must be true if NPAR > 1 +# Model: Whether we want to save a checkpoint. Must be 1 if NPAR > 1. If 1, then we save a checkpoint at the end. export SAVE_CKPT=0 # Training Configs: # Model: size, to choose from 8b, 70b, 405b -export SIZE="" +export SIZE="405b" # Dataloader: Global batch size -export GBS=0 +export GBS=1152 # Dataloader: Micro batch size -export MBS=0 +export MBS=1 # Dataloader: Evaluate every N batches, optional -export EVAL_EVERY="" +# defaults to evaluate every 20 batches, or 188_743_680 tokens +export EVAL_EVERY="20" # Dataloader: Evaluate using N batches, optional -export EVAL_BATCHES="" +# defaults to use 10 batches for evaluation, or 94_371_840 tokens +# If an empty string is provided (""), then we use full validation dataset for evaluation +export EVAL_BATCHES="10" # Dataloader: Max run N batches, optional -export MAX_STEPS="" +# defaults to train 425 steps, or 4_010_803_200 tokens +# If an empty string is provided (""), then the training will continue until time limit +# If we want to save a checkpoint, then this value must be set +export MAX_STEPS="425" # Experiment: starting steps # This is the starting "offset" step from the checkpoint. -# For instance, if you are resuming from a checkpoint folder `checkpoint-par-x-y-steps/checkpoint`, -# then the value y is needed here. -export START_STEPS="" +# For instance, if you are resuming from a checkpoint folder `checkpoint-par-0-20-steps/checkpoint`, +# which means that the model is trained for 20 steps to generate the checkpoint, +# then the value 20 is needed here. 
+export START_STEPS="0" # Experiment manager: Number of experiments to launch -export NEXP=0 +export NEXP=1 # Experiment manager: how many consecutive jobs we want for each experiment -export NPAR=0 +export NPAR=1 # Experiment manager: provides seeds to the launched experiments, use space as delimiter, such as "1234 1235 1236" -# The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP. +# The training script will discard all excessive seeds, and generate seeds if given seeds < NEXP. +# To preserve randomness, we recommend not to set this value so that each time seeds can be randomly generated. export SEEDS="" \ No newline at end of file diff --git a/large_language_model/nemo/pretrain_llama31.py b/large_language_model/nemo/pretrain_llama31.py index fd944b5c0..a612e4fbe 100644 --- a/large_language_model/nemo/pretrain_llama31.py +++ b/large_language_model/nemo/pretrain_llama31.py @@ -288,7 +288,7 @@ def get_parser() -> argparse.ArgumentParser: data_group.add_argument("--gbs", type=int, default=288, help="Global batch size, should be divisible by PP") data_group.add_argument("--mbs", type=int, default=1, help="Micro batch size") - data_group.add_argument("--eval_every", type=int, default=10) + data_group.add_argument("--eval_every", type=int, default=20) data_group.add_argument("--eval_batches", type=int, default=None) data_group.add_argument('--max_steps', type=int, default=None) data_group.add_argument("--use_full_dataset", action="store_true", help="Whether we use the full dataset or use the last 256/1024 dataset") diff --git a/large_language_model/nemo/run_llama31.sh b/large_language_model/nemo/run_llama31.sh index 8d3e94c4d..02818cd63 100644 --- a/large_language_model/nemo/run_llama31.sh +++ b/large_language_model/nemo/run_llama31.sh @@ -30,8 +30,8 @@ git config --global --add safe.directory /workspace/llama31 : "${IMAGE:?IMAGE not set}" # Dataset settings -: "${PREPROCESSED_DATA:?PREPROCESSED_DATA not set}" -: "${TOKENIZER:?TOKENIZER not set}" +: "${PREPROCESSED_PATH:?PREPROCESSED_PATH not set}" +: "${TOKENIZER_PATH:?TOKENIZER_PATH not set}" # Model settings : "${MODEL_CKPT:?MODEL_CKPT not set}" From 84d86e8a63d92626335b289b69c9190424a86cb8 Mon Sep 17 00:00:00 2001 From: Yunzhou Liu Date: Mon, 13 Jan 2025 11:26:33 -0800 Subject: [PATCH 09/10] patches to download instructions --- large_language_model/nemo/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/large_language_model/nemo/README.md b/large_language_model/nemo/README.md index ed0884497..604ca6f73 100644 --- a/large_language_model/nemo/README.md +++ b/large_language_model/nemo/README.md @@ -76,7 +76,7 @@ You can then navigate in the terminal to your desired download directory and run ``` # Replace this path with your desired path on the machine export PREPROCESSED_PATH="./" -rclone copy mlc-training:mlcommons-training-wg-public/llama3_1/datasets/c4 $PREPROCESSED_PATH -P +rclone copy mlc-training:mlcommons-training-wg-public/llama3_1/datasets/preprocessed_c4 $PREPROCESSED_PATH -P ``` After the download is complete, you should see files with the following naming conventions under `PREPROCESSED_PATH`, ending with both `.idx` and `.bin`: From cf61ea534c3fe629d418e717e563791eda41ffd0 Mon Sep 17 00:00:00 2001 From: Yunzhou Liu Date: Mon, 13 Jan 2025 13:07:14 -0800 Subject: [PATCH 10/10] renames folder --- .../nemo/Dockerfile | 0 .../nemo/README.md | 0 .../nemo/callbacks.py | 0 .../nemo/config.sh | 0 .../nemo/mcore.patch | 0 .../nemo/pretrain_llama31.py | 0 
.../nemo/run_llama31.sh | 0 .../nemo/utils/consolidate_data.sh | 0 .../nemo/utils/preprocess.sh | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename {large_language_model => large_language_model_pretraining}/nemo/Dockerfile (100%) rename {large_language_model => large_language_model_pretraining}/nemo/README.md (100%) rename {large_language_model => large_language_model_pretraining}/nemo/callbacks.py (100%) rename {large_language_model => large_language_model_pretraining}/nemo/config.sh (100%) rename {large_language_model => large_language_model_pretraining}/nemo/mcore.patch (100%) rename {large_language_model => large_language_model_pretraining}/nemo/pretrain_llama31.py (100%) rename {large_language_model => large_language_model_pretraining}/nemo/run_llama31.sh (100%) rename {large_language_model => large_language_model_pretraining}/nemo/utils/consolidate_data.sh (100%) rename {large_language_model => large_language_model_pretraining}/nemo/utils/preprocess.sh (100%) diff --git a/large_language_model/nemo/Dockerfile b/large_language_model_pretraining/nemo/Dockerfile similarity index 100% rename from large_language_model/nemo/Dockerfile rename to large_language_model_pretraining/nemo/Dockerfile diff --git a/large_language_model/nemo/README.md b/large_language_model_pretraining/nemo/README.md similarity index 100% rename from large_language_model/nemo/README.md rename to large_language_model_pretraining/nemo/README.md diff --git a/large_language_model/nemo/callbacks.py b/large_language_model_pretraining/nemo/callbacks.py similarity index 100% rename from large_language_model/nemo/callbacks.py rename to large_language_model_pretraining/nemo/callbacks.py diff --git a/large_language_model/nemo/config.sh b/large_language_model_pretraining/nemo/config.sh similarity index 100% rename from large_language_model/nemo/config.sh rename to large_language_model_pretraining/nemo/config.sh diff --git a/large_language_model/nemo/mcore.patch b/large_language_model_pretraining/nemo/mcore.patch similarity index 100% rename from large_language_model/nemo/mcore.patch rename to large_language_model_pretraining/nemo/mcore.patch diff --git a/large_language_model/nemo/pretrain_llama31.py b/large_language_model_pretraining/nemo/pretrain_llama31.py similarity index 100% rename from large_language_model/nemo/pretrain_llama31.py rename to large_language_model_pretraining/nemo/pretrain_llama31.py diff --git a/large_language_model/nemo/run_llama31.sh b/large_language_model_pretraining/nemo/run_llama31.sh similarity index 100% rename from large_language_model/nemo/run_llama31.sh rename to large_language_model_pretraining/nemo/run_llama31.sh diff --git a/large_language_model/nemo/utils/consolidate_data.sh b/large_language_model_pretraining/nemo/utils/consolidate_data.sh similarity index 100% rename from large_language_model/nemo/utils/consolidate_data.sh rename to large_language_model_pretraining/nemo/utils/consolidate_data.sh diff --git a/large_language_model/nemo/utils/preprocess.sh b/large_language_model_pretraining/nemo/utils/preprocess.sh similarity index 100% rename from large_language_model/nemo/utils/preprocess.sh rename to large_language_model_pretraining/nemo/utils/preprocess.sh
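A closing note on the token accounting that recurs across these patches: the MLPerf callbacks report progress as consumed tokens, computed as `(global_step - init_global_step) * global_batch_size * sequence_length`, and the batch counts quoted in the config.sh comments (188_743_680, 94_371_840 and 4_010_803_200 tokens) follow the same arithmetic. The sketch below is illustrative only and is not part of the reference code; the GBS and sequence-length values are the defaults documented above.

```python
# Illustrative only: reproduces the token arithmetic behind the MLPerf
# "epoch_num" metadata and the token counts quoted in config.sh.
GBS = 1152      # default global batch size from config.sh
SEQ_LEN = 8192  # Llama 3.1 context length used by the recipe


def consumed_tokens(step: int, init_step: int = 0) -> int:
    """Tokens consumed since resuming at init_step."""
    return (step - init_step) * GBS * SEQ_LEN


assert consumed_tokens(20) == 188_743_680      # EVAL_EVERY=20 batches
assert consumed_tokens(10) == 94_371_840       # EVAL_BATCHES=10 batches
assert consumed_tokens(425) == 4_010_803_200   # MAX_STEPS=425
```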
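Relatedly, the per-part checkpoint bookkeeping driven by NPAR can be hard to follow from the diff alone: each part trains for MAX_STEPS additional steps, writes `checkpoint-par-<j>-<end>-steps` under the continual checkpoint directory, and the next part resumes from the `checkpoint` sub-folder of the previous part's output. The sketch below only mirrors that naming and step progression for illustration; the paths and values are hypothetical, and the authoritative logic is the experiment loop patched earlier in this series.

```python
# Illustrative walk-through of the NPAR checkpoint chain (hypothetical values).
def plan_parts(start_step: int, max_steps: int, num_pars: int, continual_dir: str):
    read_from = None  # the first part starts from the initial checkpoint, if any
    end_step = start_step
    for j in range(num_pars):
        end_step += max_steps
        write_to = f"{continual_dir}/checkpoint-par-{j}-{end_step}-steps"
        yield j, read_from, write_to, end_step
        read_from = f"{write_to}/checkpoint"  # resume point for the next part


for part in plan_parts(start_step=0, max_steps=425, num_pars=2, continual_dir="/continual"):
    print(part)
# (0, None, '/continual/checkpoint-par-0-425-steps', 425)
# (1, '/continual/checkpoint-par-0-425-steps/checkpoint', '/continual/checkpoint-par-1-850-steps', 850)
```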