Commit 43f1473

optimisea authored and facebook-github-bot committed
Add MTIA into sharding plan and estimator (#3310)
Summary: Pull Request resolved: #3310. As title. Differential Revision: D80758637
1 parent e0e6446 commit 43f1473

File tree

3 files changed: +19 -7 lines changed

torchrec/distributed/planner/planners.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -67,7 +67,7 @@
     ShardingType,
     ShardMetadata,
 )
-from torchrec.distributed.utils import none_throws
+from torchrec.distributed.utils import get_device_type, none_throws
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -178,10 +178,11 @@ def __init__(
         heuristical_storage_reservation_percentage: float = 0.15,
     ) -> None:
         if topology is None:
+            compute_device = get_device_type()
             topology = Topology(
                 local_world_size=get_local_size(),
                 world_size=dist.get_world_size(),
-                compute_device="cuda" if torch.cuda.is_available() else "cpu",
+                compute_device=compute_device,
             )
         self._topology: Topology = topology
         self._batch_size: int = batch_size if batch_size else BATCH_SIZE
@@ -624,7 +625,8 @@ def __init__(
             List[Callable[[List[ShardingOption]], List[ShardingOption]]]
         ] = None,
    ) -> None:
-        default_device = "cuda" if torch.cuda.is_available() else "cpu"
+        default_device = get_device_type()
+
         if topology_groups is None:
             topology_groups = {
                 default_device: Topology(
```
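For context, a minimal sketch of what the new default resolves to when no `Topology` is passed. `EmbeddingShardingPlanner`, `Topology`, and `get_device_type` come from the patched files; the `local_world_size` value is a placeholder, and an initialized process group is assumed:

```python
import torch.distributed as dist

from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
from torchrec.distributed.utils import get_device_type

# With topology=None, the planner now derives compute_device from
# get_device_type(), so an MTIA host resolves to "mtia" instead of
# silently falling back to "cpu".
planner = EmbeddingShardingPlanner(topology=None)

# Roughly equivalent explicit construction:
topology = Topology(
    local_world_size=8,  # placeholder: accelerators per host
    world_size=dist.get_world_size(),
    compute_device=get_device_type(),  # "cuda", "mtia", or "cpu"
)
planner = EmbeddingShardingPlanner(topology=topology)
```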

torchrec/distributed/sharding_plan.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -42,7 +42,7 @@
     ShardingType,
     ShardMetadata,
 )
-from torchrec.distributed.utils import none_throws
+from torchrec.distributed.utils import get_device_type, none_throws
 
 
 def get_default_sharders() -> List[ModuleSharder[nn.Module]]:
@@ -620,7 +620,7 @@ def column_wise(
         ranks (Optional[List[int]]): Ranks to place columns. Required if size_per_rank is None.
         size_per_rank (Optional[List[int]]): List specifying the number of columns per rank.
             If provided, the columns will be distributed according to these sizes.
-        device_types (Optional[List[str]]): List of device types (e.g., "cpu", "cuda") for each shard.
+        device_types (Optional[List[str]]): List of device types (e.g., "cpu", "cuda", "mtia") for each shard.
             Used to specify different device placements for different shards.
 
     Returns:
@@ -651,7 +651,7 @@ def _parameter_sharding_generator(
         param: The parameter tensor to be sharded.
         local_size: Number of devices in the local process group.
         world_size: Total number of devices across all process groups.
-        device_type: Type of device (e.g., "cuda", "cpu").
+        device_type: Type of device (e.g., "cuda", "cpu", "mtia").
         sharder: The module sharder instance.
 
     Returns:
@@ -895,7 +895,7 @@ def construct_module_sharding_plan(
         )
     """
     if device_type is None:
-        device_type = "cuda" if torch.cuda.is_available() else "cpu"
+        device_type = get_device_type()
     if sharder is None:
         sharder = get_module_to_default_sharders().get(type(module), None)
     assert (
```
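A hedged usage sketch of the updated entry points; the table configuration, world sizes, and names here are hypothetical, and only the device-type behavior comes from this patch:

```python
from torchrec import EmbeddingBagCollection, EmbeddingBagConfig
from torchrec.distributed.sharding_plan import (
    column_wise,
    construct_module_sharding_plan,
)

# Hypothetical single-table module.
ebc = EmbeddingBagCollection(
    tables=[
        EmbeddingBagConfig(
            name="table_0",
            embedding_dim=64,
            num_embeddings=1_000,
            feature_names=["f0"],
        )
    ]
)

module_plan = construct_module_sharding_plan(
    ebc,
    per_param_sharding={
        # Column-wise over ranks 0 and 1; per the updated docstring, an
        # optional device_types list may now mix "cpu", "cuda", and "mtia".
        "table_0": column_wise(ranks=[0, 1]),
    },
    local_size=2,
    world_size=2,
    # device_type omitted: it now defaults via get_device_type() to
    # "cuda" first, then "mtia", then "cpu".
)
```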

torchrec/distributed/utils.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -50,6 +50,16 @@
 """
 
 
+def get_device_type() -> str:
+    if torch.cuda.is_available():
+        device_type = "cuda"
+    elif torch.mtia.is_available():
+        device_type = "mtia"
+    else:
+        device_type = "cpu"
+    return device_type
+
+
 def get_class_name(obj: object) -> str:
     if obj is None:
         return "None"
```
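And a quick sketch of the new helper's selection order on the current host (assumes a PyTorch build recent enough to expose `torch.mtia`):

```python
import torch

from torchrec.distributed.utils import get_device_type

# Selection order: CUDA first, then MTIA, then CPU fallback.
device_type = get_device_type()
assert device_type in ("cuda", "mtia", "cpu")

# The returned string plugs directly into torch.device / Topology.
device = torch.device(device_type)
print(device)
```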
