Commit 7d94c95

feat: support multi lora adapters and TP (NVIDIA#3885)
* support multi lora, tp

Signed-off-by: Shahar Mor <17088876+shaharmor98@users.noreply.github.com>
1 parent 99313af

18 files changed: +274 -175 lines

examples/pytorch/quickstart_lora.py (-38)

This file was deleted.

tensorrt_llm/_torch/model_config.py (+3 -7)

@@ -168,22 +168,18 @@ def from_pretrained(cls,
                    quant_config_dict=layer_quant_config,
                    **kwargs)
 
-    def get_bindings_model_config(
-            self,
-            tensor_parallelism: int = 1,
-            context_parallelism: int = 1) -> "ModelConfigCpp":
+    def get_bindings_model_config(self) -> "ModelConfigCpp":
         """
         This method is used to construct the bindings config for the model.
         Currently it adheres to gptJsonConfig.cpp::createModelConfig, which assumes
         that an engine has been created.
         """
         # TODO smor- this isn't robust, and currently tested for LlamaConfig only
-        # TODO smor- currently parallelism is not supported, set default to 1
         # TODO smor- currently assuming no rnn layers, no MOE
         from tensorrt_llm.bindings import ModelConfig as ModelConfigCpp
 
         num_heads = self.pretrained_config.num_attention_heads // (
-            tensor_parallelism * context_parallelism)
+            self.mapping.tp_size * self.mapping.cp_size)
 
         model_config_cpp = ModelConfigCpp(
             vocab_size=self.pretrained_config.vocab_size,
@@ -195,7 +191,7 @@ def get_bindings_model_config(
             data_type=torch_dtype_to_binding(
                 self.pretrained_config.torch_dtype))
 
-        mlp_hidden_size = self.pretrained_config.intermediate_size // tensor_parallelism
+        mlp_hidden_size = self.pretrained_config.intermediate_size // self.mapping.tp_size
         if "head_size" in self.pretrained_config:
             head_size = self.pretrained_config.head_size
         else:
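
Note on the hunk above: parallelism sizes now come from self.mapping instead of explicit arguments, so each rank derives its own shard sizes. A minimal sketch of the per-rank arithmetic, using a hypothetical SimpleMapping stand-in for tensorrt_llm.mapping.Mapping (only the fields used here):

from dataclasses import dataclass


@dataclass
class SimpleMapping:
    # Hypothetical stand-in for tensorrt_llm.mapping.Mapping; the real class has many more fields.
    tp_size: int = 1
    cp_size: int = 1


def per_rank_shapes(num_attention_heads: int, intermediate_size: int,
                    mapping: SimpleMapping) -> tuple:
    # Attention heads are divided across tensor- and context-parallel ranks.
    num_heads = num_attention_heads // (mapping.tp_size * mapping.cp_size)
    # The MLP hidden dimension is divided across tensor-parallel ranks only.
    mlp_hidden_size = intermediate_size // mapping.tp_size
    return num_heads, mlp_hidden_size


# Llama-7B-like shapes with tp_size=2, cp_size=1 -> (16, 5504)
print(per_rank_shapes(32, 11008, SimpleMapping(tp_size=2, cp_size=1)))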

tensorrt_llm/_torch/models/modeling_llama.py (+16 -6)

@@ -13,6 +13,7 @@
                                   AllReduceParams, DeepseekAllReduce)
 from tensorrt_llm._torch.pipeline_interface import PipelineInterface
 from tensorrt_llm.functional import PositionEmbeddingType
+from tensorrt_llm.models.convert_utils import split_matrix_tp
 
 from ...inputs import (ExtraProcessedInputs, InputProcessor, TextPrompt,
                        register_input_processor)
@@ -773,13 +774,14 @@ def __init__(self, model_config: ModelConfig[LlamaConfig]):
         self.padding_idx = config.pad_token_id
 
         vocab_size = config.vocab_size
-        # TODO smor- hack
-        if hasattr(model_config,
-                   'lora_config') and model_config.lora_config is not None:
+        # TODO smor- we load manually only if there is a single lora dir, need to come up with a better solution
+        if hasattr(
+                model_config,
+                'lora_config') and model_config.lora_config is not None and len(
+                    model_config.lora_config.lora_dir) == 1:
             from tensorrt_llm.lora_manager import HfLoraLoader
             lora_loader = HfLoraLoader(model_config.lora_config.lora_dir)
             weight = lora_loader.embed_tokens
-            # TODO smor - need to split tp matrix here
             vocab_size = lora_loader.vocab_size
 
         self.embed_tokens = Embedding(
@@ -791,9 +793,17 @@ def __init__(self, model_config: ModelConfig[LlamaConfig]):
             gather_output=True,
         )
 
-        if hasattr(model_config,
-                   'lora_config') and model_config.lora_config is not None:
+        if hasattr(
+                model_config,
+                'lora_config') and model_config.lora_config is not None and len(
+                    model_config.lora_config.lora_dir) == 1:
             with torch.no_grad():
+                if model_config.mapping.tp_size > 1:
+                    weight = split_matrix_tp(
+                        weight,
+                        model_config.mapping.tp_size,
+                        model_config.mapping.tp_rank,
+                        dim=0)  # split by vocabulary dimension
                 x = weight.to(self.embed_tokens.dtype)
                 self.embed_tokens.weight.data.copy_(x)
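
The split_matrix_tp call added above resolves the earlier "# TODO smor - need to split tp matrix here": when the embedding table comes from the LoRA checkpoint, each tensor-parallel rank keeps only its slice along the vocabulary dimension (dim 0). A minimal sketch of that slicing with plain torch.chunk, assuming the vocabulary divides evenly across ranks (illustrative only, not the convert_utils implementation):

import torch


def shard_vocab_dim(weight: torch.Tensor, tp_size: int, tp_rank: int) -> torch.Tensor:
    # weight: [vocab_size, hidden_size]; rank r keeps rows
    # [r * vocab_size / tp_size, (r + 1) * vocab_size / tp_size).
    assert weight.shape[0] % tp_size == 0, "vocab_size must divide evenly across TP ranks"
    return torch.chunk(weight, tp_size, dim=0)[tp_rank].contiguous()


full = torch.randn(32000, 4096)                      # embedding table from the LoRA dir
local = shard_vocab_dim(full, tp_size=2, tp_rank=1)  # this rank's shard
print(local.shape)                                   # torch.Size([16000, 4096])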

tensorrt_llm/_torch/models/modeling_utils.py (+21 -9)

@@ -11,6 +11,9 @@
 from torch.utils._pytree import tree_any_only
 from tqdm import tqdm
 
+from tensorrt_llm.mapping import Mapping
+from tensorrt_llm.models.convert_utils import split_matrix_tp
+
 from ...logger import logger
 from ...mapping import Mapping
 from ...models.modeling_utils import QuantConfig
@@ -240,7 +243,7 @@ def forward(
         input_ids: torch.LongTensor = None,
         position_ids: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
-        lora_params: Optional = None,  # TODO smor add type hint
+        lora_params: Optional[dict] = None,
         **kwargs,
     ) -> torch.Tensor:
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -357,9 +360,9 @@ def __init__(self, model: TModel, *, config: ModelConfig[TConfig],
         # TODO(zhenhuanc): Currently lm_head Linear will not accept QuantConfig
         # will considering per layer QuantConfig in the future.
 
-        # TODO smor- hack
-        if hasattr(config,
-                   'lora_config') and config.lora_config is not None:
+        if hasattr(config, 'lora_config'
+                   ) and config.lora_config is not None and len(
+                       config.lora_config.lora_dir) == 1:
             from tensorrt_llm.lora_manager import HfLoraLoader
             lora_loader = HfLoraLoader(config.lora_config.lora_dir)
             weight = lora_loader.lm_head
@@ -374,9 +377,16 @@ def __init__(self, model: TModel, *, config: ModelConfig[TConfig],
             gather_output=True,
         )
 
-        if hasattr(config,
-                   'lora_config') and config.lora_config is not None:
+        if hasattr(config, 'lora_config'
+                   ) and config.lora_config is not None and len(
+                       config.lora_config.lora_dir) == 1:
             with torch.no_grad():
+                if config.mapping.tp_size > 1:
+                    weight = split_matrix_tp(
+                        weight,
+                        config.mapping.tp_size,
+                        config.mapping.tp_rank,
+                        dim=0)  # split by vocabulary dimension
                 x = weight.to(self.lm_head.dtype).cuda()
                 self.lm_head.weight.data.copy_(x)
 
@@ -475,7 +485,7 @@ def forward(
         pipeline_interface: Optional[PipelineInterface] = None,
         return_context_logits: bool = False,
         spec_metadata: Optional[SpecMetadata] = None,
-        lora_params: Optional = None,  # TODO smor add type hint
+        lora_params: Optional[dict] = None,
         **kwargs,
     ) -> torch.Tensor:
         if self._supports_pp and self.pp_size > 1:
@@ -657,8 +667,10 @@ def filter_weights(prefix, weights: Dict):
 
         # Skip loading weights for embedding and lm_head if LoRA is enabled
         if hasattr(model.model_config, 'lora_config'
-                   ) and model.model_config.lora_config is not None and (
-                       name == "model.embed_tokens" or name == "lm_head"):
+                   ) and model.model_config.lora_config is not None and len(
+                       model.model_config.lora_config.lora_dir) == 1 and (
+                           name == "model.embed_tokens"
+                           or name == "lm_head"):
             continue
 
         # Skip if parameter belongs to a missing layer
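
The same condition, a lora_config with exactly one entry in lora_dir, now gates the embedding override, the lm_head override, and the weight-loading skip above. A hypothetical helper (not part of the patch) makes the predicate explicit:

def lora_overrides_vocab(config) -> bool:
    # Hypothetical helper for illustration: True when a single LoRA directory is
    # configured, in which case embed_tokens / lm_head are taken from the LoRA
    # checkpoint (and split per TP rank) instead of from the base model weights.
    lora_config = getattr(config, 'lora_config', None)
    return lora_config is not None and len(lora_config.lora_dir) == 1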

tensorrt_llm/_torch/modules/attention.py (+7 -7)

@@ -88,6 +88,9 @@ def __init__(
             quant_config=config.get_quant_config(),
             skip_create_weights_in_init=config.skip_create_weights_in_init,
         )
+        self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE],
+                                [self.hidden_size])
+
         self.o_proj = Linear(
             tp_size * self.q_size,
             self.hidden_size,
@@ -97,6 +100,7 @@ def __init__(
             tensor_parallel_mode=TensorParallelMode.ROW,
             quant_config=config.get_quant_config(),
             skip_create_weights_in_init=config.skip_create_weights_in_init,
+            lora=self.o_lora,
         )
         self.quant_config = config.get_quant_config()
         self.attn_backend = config.attn_backend
@@ -229,13 +233,9 @@ def forward(
             mrope_config=mrope_config)
         hidden_states = attn_output
         attn_output = self.o_proj(attn_output,
-                                  all_reduce_params=all_reduce_params)
-        if bool(lora_params):
-            attn_lora_output = self.o_lora(hidden_states, lora_params,
-                                           self.layer_idx)
-            if attn_lora_output is not None:
-                attn_output = attn_output + attn_lora_output
-
+                                  all_reduce_params=all_reduce_params,
+                                  lora_params=lora_params,
+                                  layer_idx=self.layer_idx)
         return attn_output
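
The forward change above moves LoRA application out of the attention module and into the o_proj Linear itself (via the new lora=self.o_lora argument), so the LoRA delta is added before the row-parallel all-reduce rather than after it. A rough sketch of that pattern, assuming a generic LoRA-like module; this is not the tensorrt_llm Linear implementation:

from typing import Optional

import torch
import torch.nn as nn


class LinearWithLora(nn.Module):
    """Sketch: a projection that owns an optional LoRA layer and applies it inline."""

    def __init__(self, in_features: int, out_features: int,
                 lora: Optional[nn.Module] = None):
        super().__init__()
        self.base = nn.Linear(in_features, out_features, bias=False)
        self.lora = lora

    def forward(self, x: torch.Tensor, lora_params: Optional[dict] = None,
                layer_idx: Optional[int] = None) -> torch.Tensor:
        out = self.base(x)
        if self.lora is not None and lora_params:
            delta = self.lora(x, lora_params, layer_idx)
            if delta is not None:
                out = out + delta
        # A row-parallel implementation would all-reduce `out` here, so the LoRA
        # delta is summed across TP ranks together with the base projection.
        return out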

tensorrt_llm/_torch/modules/gated_mlp.py (+19 -31)

@@ -76,6 +76,9 @@ def __init__(self,
             reduce_output=False,
             skip_create_weights_in_init=config.skip_create_weights_in_init,
         )
+        self.down_lora = LoraLayer([LoraModuleType.MLP_4H_TO_H],
+                                   [self.hidden_size])
+
         self.down_proj = Linear(
             self.intermediate_size,
             self.hidden_size,
@@ -86,18 +89,20 @@ def __init__(self,
             quant_config=config.get_quant_config(),
             reduce_output=reduce_output,
             skip_create_weights_in_init=config.skip_create_weights_in_init,
+            lora=self.down_lora,
         )
 
         # These two modules are mutually exclusive - either splitted_gate_up_lora or fused_gate_up_lora will be used,
         # but never both at the same time. splitted_gate_up_lora handles gate and up separately while fused_gate_up_lora
         # handles them as a single fused operation.
         self.splitted_gate_up_lora = LoraLayer(
-            [LoraModuleType.MLP_H_TO_4H, LoraModuleType.MLP_GATE],
-            [self.intermediate_size, self.intermediate_size])
-        self.fused_gate_up_lora = LoraLayer([LoraModuleType.MLP_GATE_UP],
-                                            [2 * self.intermediate_size])
-        self.down_lora = LoraLayer([LoraModuleType.MLP_4H_TO_H],
-                                   [self.hidden_size])
+            [LoraModuleType.MLP_H_TO_4H, LoraModuleType.MLP_GATE], [
+                self.intermediate_size // mapping.tp_size,
+                self.intermediate_size // mapping.tp_size
+            ])
+        self.fused_gate_up_lora = LoraLayer(
+            [LoraModuleType.MLP_GATE_UP],
+            [2 * self.intermediate_size // mapping.tp_size])
 
     def forward(
         self,
@@ -107,33 +112,17 @@ def forward(
         lora_params: Optional[dict] = None,
         **kwargs,
     ) -> torch.Tensor:
-        if lora_params is not None:
+        if bool(lora_params):
             return self.forward_lora(x, all_rank_num_tokens,
                                      final_all_reduce_params, lora_params)
 
         if self.activation == F.silu:
             h1 = self.gate_up_proj(x)
-            if bool(lora_params):
-                assert self.layer_idx is not None, "layer_idx is required for lora"
-                h1_lora = self.splitted_gate_up_lora(x, lora_params,
-                                                     self.layer_idx)
-                if h1_lora is not None:
-                    h1 = h1 + h1_lora
-
-                h1_lora = self.fused_gate_up_lora(x, lora_params,
-                                                  self.layer_idx)
-
-                if h1_lora is not None:
-                    h1 = h1 + h1_lora
 
             h2 = swiglu(h1)
             output = self.down_proj(h2,
-                                    all_reduce_params=final_all_reduce_params)
-            if bool(lora_params):
-                output_lora = self.down_lora(h2, lora_params, self.layer_idx)
-                if output_lora is not None:
-                    output = output + output_lora
-
+                                    all_reduce_params=final_all_reduce_params,
+                                    layer_idx=self.layer_idx)
             return output
         else:
             raise NotImplementedError(
@@ -154,19 +143,18 @@ def forward_lora(
         h1 = self.gate_up_proj(x)
 
         h1_lora = self.splitted_gate_up_lora(x, lora_params, self.layer_idx)
+
        if h1_lora is not None:
             h1 = h1 + h1_lora
 
         h1_lora = self.fused_gate_up_lora(x, lora_params, self.layer_idx)
-
         if h1_lora is not None:
             h1 = h1 + h1_lora
 
         h2 = swiglu(h1)
-        output = self.down_proj(h2, all_reduce_params=final_all_reduce_params)
-
-        output_lora = self.down_lora(h2, lora_params, self.layer_idx)
-        if output_lora is not None:
-            output = output + output_lora
+        output = self.down_proj(h2,
+                                all_reduce_params=final_all_reduce_params,
+                                lora_params=lora_params,
+                                layer_idx=self.layer_idx)
 
         return output
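
Because gate_up_proj is column-parallel, the gate/up LoRA layers above now size their outputs per rank: intermediate_size // mapping.tp_size for the split variant and 2 * intermediate_size // mapping.tp_size for the fused variant, while down_lora keeps the full hidden_size of the row-parallel output. A small worked example of those shapes (illustrative values only):

# Per-rank LoRA output sizes for a Llama-7B-like MLP (hidden_size=4096,
# intermediate_size=11008) under tensor parallelism.
hidden_size, intermediate_size = 4096, 11008

for tp_size in (1, 2, 4):
    split_out = [intermediate_size // tp_size, intermediate_size // tp_size]
    fused_out = [2 * intermediate_size // tp_size]
    down_out = [hidden_size]  # down_proj output stays full-size; TP reduces over it
    print(f"tp_size={tp_size}: splitted_gate_up={split_out}, "
          f"fused_gate_up={fused_out}, down={down_out}")
# tp_size=2 -> splitted_gate_up=[5504, 5504], fused_gate_up=[11008], down=[4096]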
