
Commit 08cce8c

update doc and fix type
Signed-off-by: Yuki Huang <yukih@nvidia.com>
1 parent b6880b9 commit 08cce8c

File tree

3 files changed: +65 additions, -47 deletions

docs/design-docs/fsdp2-parallel-plan.md

Lines changed: 13 additions & 32 deletions
@@ -1,49 +1,30 @@
 # FSDP2 Parallel Plan
 
-This guide outlines the parallelization strategy for FSDP2 training in NeMo-RL.
+This guide outlines the parallelization strategy for Fully Sharded Data Parallel version 2 (FSDP2) training in NeMo RL.
 
 ## Fallback Priority
 
-Three parallelization approaches are supported, with the following fallback priority.
+NeMo RL supports three parallelization strategies, applied in the following order of fallback priority:
 
-**Custom Parallel Plan**
+### 1. Custom Parallel Plan
 
-User-defined custom parallel plans take precedence when available.
+Your user-defined custom parallel plans always take precedence when available. For detailed implementation and usage, refer to the [Custom Parallel Plan Example](#custom-parallel-plan-example).
 
-For implementation details and usage guidelines, please refer to [Custom Parallel Plan Example](#custom-parallel-plan-example).
+### 2. Optimized Parallel Plan
 
-**Optimized Parallel Plan**
+Optimized parallel plans are available for specific model architectures. They may offer superior performance compared to Hugging Face's tensor parallel implementation. This approach is used if no custom parallel plan is specified and the model class supports optimized parallelization.
 
-Optimized parallel plans are available for specific model architectures and may offer superior performance compared to the Hugging Face tensor parallel implementation.
+### 3. Hugging Face Tensor Parallel Plan
 
-This approach is used when no custom parallel plan is specified and the model class supports optimized parallelization.
-
-**Hugging Face Tensor Parallel Plan**
-
-Hugging Face provides tensor parallelism for most models through `._tp_plan`.
-
-It serves as the default when neither custom nor optimized parallel plans are available.
+The Hugging Face tensor parallel plan is the default. It's available for most models via `._tp_plan` and is used when neither a custom nor an optimized parallel plan is available.
 
 ## Custom Parallel Plan Example
 
-Custom parallel plan should be defined in a file, exemplified by `examples/custom_parallel.py`.
-
-To implement the custom parallel plan, configure `policy.dtensor_cfg.custom_parallel_plan=examples.custom_parallel.custom_parallel_plan`.
-
-```python
-from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel
-from torch.distributed.tensor.placement_types import Replicate, Shard
+A custom parallel plan should be defined in a separate file, such as the example provided in `examples/custom_parallel.py`.
 
+To implement the custom parallel plan, either update the value of `custom_parallel_plan` in the `yaml` file directly, or pass the override via the command line. For example:
 
-custom_parallel_plan = {
-    "model.embed_tokens": RowwiseParallel(input_layouts=Replicate()),
-    "model.layers.*.self_attn.q_proj": ColwiseParallel(),
-    "model.layers.*.self_attn.k_proj": ColwiseParallel(),
-    "model.layers.*.self_attn.v_proj": ColwiseParallel(),
-    "model.layers.*.self_attn.o_proj": RowwiseParallel(),
-    "model.layers.*.mlp.up_proj": ColwiseParallel(),
-    "model.layers.*.mlp.gate_proj": ColwiseParallel(),
-    "model.layers.*.mlp.down_proj": RowwiseParallel(),
-    "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
-}
+```bash
+uv run examples/run_grpo_math.py \
+    policy.dtensor_cfg.custom_parallel_plan=examples.custom_parallel.custom_parallel_plan
 ```
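
The fallback order described in this doc can be pictured with a short sketch. The snippet below is illustrative only: `OPTIMIZED_PLANS` and `resolve_parallel_plan` are hypothetical names, not NeMo RL's API; the only real attribute it relies on is Hugging Face's `._tp_plan`.

```python
# Hypothetical sketch of the custom > optimized > Hugging Face fallback.
# `OPTIMIZED_PLANS` and `resolve_parallel_plan` are illustrative names only.
from typing import Callable, Optional

# Optimized plans registered per model class (in spirit, like PARALLIZE_FUNCTIONS below).
OPTIMIZED_PLANS: dict[type, Callable[..., dict]] = {}


def resolve_parallel_plan(model, custom_plan: Optional[dict] = None) -> dict:
    """Pick a tensor parallel plan: custom > optimized > Hugging Face `._tp_plan`."""
    # 1. A user-supplied custom parallel plan always takes precedence.
    if custom_plan is not None:
        return custom_plan
    # 2. Otherwise use an optimized plan registered for this model class, if any.
    plan_fn = OPTIMIZED_PLANS.get(type(model))
    if plan_fn is not None:
        return plan_fn(model)
    # 3. Fall back to the Hugging Face tensor parallel plan.
    hf_plan = getattr(model, "_tp_plan", None)
    assert hf_plan, "No TP plan found; set tensor_parallel_size to 1 or provide a custom plan."
    return dict(hf_plan)
```

The corresponding logic in NeMo RL lives in `nemo_rl/models/dtensor/parallelize.py`, shown in the diff below.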

examples/custom_parallel.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel
+from torch.distributed.tensor.placement_types import Replicate, Shard
+
+custom_parallel_plan = {
+    "model.embed_tokens": RowwiseParallel(input_layouts=Replicate()),
+    "model.layers.*.self_attn.q_proj": ColwiseParallel(),
+    "model.layers.*.self_attn.k_proj": ColwiseParallel(),
+    "model.layers.*.self_attn.v_proj": ColwiseParallel(),
+    "model.layers.*.self_attn.o_proj": RowwiseParallel(),
+    "model.layers.*.mlp.up_proj": ColwiseParallel(),
+    "model.layers.*.mlp.gate_proj": ColwiseParallel(),
+    "model.layers.*.mlp.down_proj": RowwiseParallel(),
+    "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
+}
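
The `custom_parallel_plan` value may also be given as a dotted string path such as `examples.custom_parallel.custom_parallel_plan`; per the `_parallelize_model` docstring below, the path must point to a dict or to a function that returns a dict. As a rough illustration (not necessarily NeMo RL's actual loader), such a path could be resolved like this:

```python
# Illustrative only: one plausible way to resolve a dotted path such as
# "examples.custom_parallel.custom_parallel_plan" into a plan dict.
import importlib


def load_parallel_plan(path: str) -> dict:
    """Import `package.module.attribute` and return the plan dict."""
    module_name, attr_name = path.rsplit(".", 1)
    plan = getattr(importlib.import_module(module_name), attr_name)
    # The attribute may be the dict itself, or a function that returns a dict.
    if callable(plan):
        plan = plan()
    assert isinstance(plan, dict), f"{path} did not resolve to a dict"
    return plan
```

With the file above on the Python path, `load_parallel_plan("examples.custom_parallel.custom_parallel_plan")` returns the dict shown in this diff.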

nemo_rl/models/dtensor/parallelize.py

Lines changed: 38 additions & 15 deletions
@@ -14,7 +14,7 @@
 
 from functools import lru_cache
 from types import FunctionType
-from typing import Callable, Union
+from typing import Callable, Optional, Union
 
 import torch
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
@@ -25,6 +25,7 @@
 from torch.distributed.tensor import DTensor
 from torch.distributed.tensor.parallel import (
     ColwiseParallel,
+    ParallelStyle,
     PrepareModuleInput,
     PrepareModuleOutput,
     RowwiseParallel,
@@ -254,7 +255,9 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh):
     return base_model_tp_plan
 
 
-PARALLIZE_FUNCTIONS: dict[type[torch.nn.Module], Callable[..., torch.nn.Module]] = {
+PARALLIZE_FUNCTIONS: dict[
+    type[torch.nn.Module], Callable[..., dict[str, ParallelStyle]]
+] = {
     Qwen2ForCausalLM: _parallelize_qwen,
     Qwen3ForCausalLM: _parallelize_qwen,
     LlamaForCausalLM: _parallelize_llama,
@@ -292,7 +295,21 @@ def translate_parallel_style(style: str):
 def get_hf_tp_plan(model):
     """Get the Hugging Face tensor parallel plan from the model.
 
+    This function:
+    - Retrieves TP strategies from the model class, instance, and inner model levels.
+    - Handles special cases for `embed_tokens` and `lm_head` for a speedup.
+    - Converts string-based parallel styles to DTensor parallelization strategies.
+
     Taken and modified from: https://github.com/NVIDIA/NeMo/blob/6c6169db01bcca73ae8ad3ac35242fadbb9a78ba/nemo/lightning/pytorch/strategies/utils.py#L532
+
+    Args:
+        model: A Hugging Face model instance.
+
+    Returns:
+        dict: A dictionary mapping model component paths to their parallelization strategies.
+
+    Raises:
+        AssertionError: If no TP plan is found.
     """
     model_cls = type(model)
     if model_cls == Gemma3ForConditionalGeneration:
@@ -317,7 +334,8 @@ def get_hf_tp_plan(model):
     )
 
     assert len(hf_tp_plan) > 0, (
-        f"Hugging Face tp plan is not supported for {model_cls}, please set dtensor_cfg.tensor_parallel_size to 1 or provide a custom parallel plan."
+        f"Hugging Face tp plan is not supported for {model_cls}. Please set dtensor_cfg.tensor_parallel_size to 1 or provide a custom_parallel_plan. "
+        "For a usage example of custom_parallel_plan, refer to `docs/design-docs/fsdp2-parallel-plan.md`."
     )
 
     # The hf tp plan does not contain embed_tokens, so we add it and set it to rowwise_rep
@@ -344,26 +362,31 @@
 
 
 def _parallelize_model(
-    model: Union[Qwen2ForCausalLM, LlamaForCausalLM],
+    model: Union[
+        Qwen2ForCausalLM,
+        LlamaForCausalLM,
+        Gemma3ForCausalLM,
+        Gemma3ForConditionalGeneration,
+    ],
     dp_mesh: DeviceMesh,
     tp_mesh: DeviceMesh,
     param_dtype: torch.dtype,
     sequence_parallel: bool = False,
     activation_checkpointing: bool = False,
     cpu_offload: bool = False,
-    custom_parallel_plan: Union[dict, str] = None,
+    custom_parallel_plan: Optional[Union[dict, str]] = None,
 ):
     """Parallelize a model using DTensor.
 
     Args:
-        model (Union[Qwen2ForCausalLM, LlamaForCausalLM]): The model to parallelize.
-        dp_mesh (DeviceMesh): Device mesh for data parallelism.
-        tp_mesh (DeviceMesh): Device mesh for tensor parallelism.
-        param_dtype (torch.dtype): Data type for model parameters.
-        sequence_parallel (bool, optional): Whether to use sequence parallelism. Defaults to False.
-        activation_checkpointing (bool, optional): Whether to use activation checkpointing. Defaults to False.
-        cpu_offload (bool, optional): Whether to enable cpu offloading for FSDP. Defaults to False.
-        custom_parallel_plan (Union[dict, str], optional): Custom parallel plan for the model. Defaults to None.
+        model: The model to parallelize.
+        dp_mesh: Device mesh for data parallelism.
+        tp_mesh: Device mesh for tensor parallelism.
+        param_dtype: Data type for model parameters.
+        sequence_parallel: Whether to use sequence parallelism. Defaults to False.
+        activation_checkpointing: Whether to use activation checkpointing. Defaults to False.
+        cpu_offload: Whether to enable CPU offloading for FSDP. Defaults to False.
+        custom_parallel_plan: Custom parallel plan for the model. Defaults to None.
             If it's a dict, it will be used as the parallel plan directly.
             If it's a string, it must be a path that points to a dict or a function that returns a dict.
             For a usage example, refer to `docs/design-docs/fsdp2-parallel-plan.md`.
@@ -376,11 +399,11 @@ def _parallelize_model(
     """
     model_cls = type(model)
    if model_cls == Gemma3ForConditionalGeneration:
-        layers = model.language_model.model.layers
+        layers: torch.nn.ModuleList = model.language_model.model.layers  # type: ignore
         num_attention_heads = model.config.text_config.num_attention_heads
         num_key_value_heads = model.config.text_config.num_key_value_heads
     else:
-        layers = model.model.layers
+        layers: torch.nn.ModuleList = model.model.layers  # type: ignore
         num_attention_heads = model.config.num_attention_heads
         num_key_value_heads = model.config.num_key_value_heads
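
For context, a plan dict like the ones in this commit maps submodule paths to `ParallelStyle` objects that PyTorch's DTensor API consumes. The sketch below is a simplified stand-in, not a copy of `_parallelize_model`: it uses only public PyTorch APIs (`init_device_mesh`, `parallelize_module`) on a toy model, and it must be launched with `torchrun` so that a process group exists.

```python
# Simplified sketch (not NeMo RL's _parallelize_model): applying a TP plan dict
# with public PyTorch DTensor APIs. Launch with torchrun so a process group exists.
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)


class ToyMLP(torch.nn.Module):
    def __init__(self, dim: int = 128):
        super().__init__()
        self.up_proj = torch.nn.Linear(dim, 4 * dim)
        self.down_proj = torch.nn.Linear(4 * dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(torch.relu(self.up_proj(x)))


def apply_toy_tp_plan(tp_size: int = 2) -> torch.nn.Module:
    # One tensor-parallel mesh dimension; real code also builds a DP mesh for FSDP2.
    tp_mesh = init_device_mesh("cuda", (tp_size,), mesh_dim_names=("tp",))
    model = ToyMLP().cuda()
    # Keys name submodules; values are ParallelStyle objects, as in the plans above.
    plan = {"up_proj": ColwiseParallel(), "down_proj": RowwiseParallel()}
    parallelize_module(model, tp_mesh, plan)
    return model
```

In NeMo RL itself, `_parallelize_model` performs this step together with FSDP2 sharding, activation checkpointing, and optional CPU offload, as its signature above shows.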

0 commit comments
