
Commit 459e8f0

[PT FE] Support converting f8 compressed models (#29432)
### Details:
- *Support converting f8 compressed models*

### Tickets:
- *CVS-164161*

---------

Signed-off-by: Maxim Vafin <maxim.vafin@intel.com>
1 parent 4ecaa5d commit 459e8f0
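
The change teaches the PyTorch frontend to handle models whose weights are stored in float8 (f8e4m3/f8e5m2). As a rough illustration of the user-facing workflow, here is a minimal sketch mirroring the new `test_patched_8bit_model_converts` test added below; the toy module, shapes, and the `"CPU"` target are illustrative assumptions, not part of the commit:

```python
# Minimal sketch (toy model, assumed shapes): convert an f8-compressed module.
import torch
import openvino as ov
from openvino.frontend.pytorch import patch_model

model = torch.nn.Sequential(torch.nn.Linear(64, 32), torch.nn.ReLU())
model = model.to(torch.float8_e4m3fn)        # weights compressed to f8 (e4m3)
patch_model.__make_16bit_traceable(model)    # patching now also accepts f8 weights

with torch.no_grad():                        # patching only works without grad
    converted = ov.convert_model(model, example_input=torch.randn(1, 64))

compiled = ov.compile_model(converted, "CPU")
result = compiled(torch.randn(1, 64).numpy())
```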

File tree

7 files changed: +73 -14 lines


src/bindings/python/src/openvino/frontend/pytorch/patch_model.py

+5 -5
@@ -84,11 +84,12 @@ def __make_16bit_traceable(model: torch.nn.Module,
     - Replace known list of modules with ModuleExtension.
     - Convert other modules with weights to FP32.
     """
+    supported = {torch.float16, torch.bfloat16, torch.float8_e4m3fn, torch.float8_e5m2}
     if patch_condition is None:
         def patch_condition(module):
-            supported = {torch.float32, torch.float16, torch.bfloat16}
+            dtype_to_patch = {torch.float32, *supported}
             weight = getattr(module, "weight", None)
-            return weight is not None and weight.dtype in supported
+            return weight is not None and weight.dtype in dtype_to_patch

     def fp32_tensor(*shape):
         return torch.full(shape, 0.5, dtype=torch.float32)
@@ -123,10 +124,9 @@ def fp32_tensor(*shape):
     except ImportError:
         pass
     patch_model(model, extensions, orig_forward_name)
-    dtype_to_patch = {torch.float16, torch.bfloat16}
     for _, module in model.named_modules():
         if (module.__class__ not in extensions and
-                (any(p.dtype in dtype_to_patch for p in module.parameters(False))
-                 or any(b.dtype in dtype_to_patch for b in module.buffers(False)))):
+                (any(p.dtype in supported for p in module.parameters(False))
+                 or any(b.dtype in supported for b in module.buffers(False)))):
             log.debug("Casting module %s to float32", module)
             module.float()
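
In short, the default `patch_condition` now also matches modules whose `weight` is stored in `torch.float8_e4m3fn` or `torch.float8_e5m2`, and the final cleanup loop casts any remaining f16/bf16/f8 module to float32. A standalone restatement for illustration (`default_patch_condition` is a hypothetical name; the real logic lives inside `__make_16bit_traceable`):

```python
import torch

# The dtypes handled after this change: f8 weights now qualify for patching.
supported = {torch.float16, torch.bfloat16, torch.float8_e4m3fn, torch.float8_e5m2}
dtype_to_patch = {torch.float32, *supported}

def default_patch_condition(module):
    weight = getattr(module, "weight", None)
    return weight is not None and weight.dtype in dtype_to_patch

assert default_patch_condition(torch.nn.Linear(4, 4).to(torch.float8_e4m3fn))
assert not default_patch_condition(torch.nn.ReLU())  # no weight, not patched
```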

src/common/transformations/include/transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp

+2 -2
@@ -62,10 +62,10 @@ class ov::pass::KeepConstantsPrecisionAndAddConverts : public MatcherPass {

 /**
  * @ingroup ov_transformation_common_api
- * @brief Prevents ConstantFolding for f16/bf16 Const + Convert_To_FP32 to keep original FW float Constants.
+ * @brief Prevents ConstantFolding for low precision Const + Convert_To_FP32 to keep original FW float Constants.
  * Original precision should be kept as long as possible, this prevents redundant conversions and saves memory.
  * E.g. if original FW model was already compressed no need to upcast during CF, store intermediate f32 consts and
- * then again compress them to f16 during save_model.
+ * then again compress them to low precision during save_model.
  */
 class ov::pass::MarkCompressedFloatConstants : public MatcherPass {
 public:

src/common/transformations/src/transformations/fp16_compression/mark_decompression_convert_constant_folding.cpp

+4 -1
@@ -135,7 +135,10 @@ pass::MarkCompressedFloatConstants::MarkCompressedFloatConstants() {
         if (convert_node->get_destination_type() != element::f32)
             return false;
         if (const_node->get_output_element_type(0) != element::f16 &&
-            const_node->get_output_element_type(0) != element::bf16)
+            const_node->get_output_element_type(0) != element::bf16 &&
+            const_node->get_output_element_type(0) != element::f8e4m3 &&
+            const_node->get_output_element_type(0) != element::f8e5m2 &&
+            const_node->get_output_element_type(0) != element::f8e8m0)
             return false;

         mark_as_decompression(convert_node);
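
With the f8 element types accepted here, constants that were f8 in the original framework model should keep that precision through conversion instead of being folded into f32. A hypothetical spot check, assuming the `converted` model from the sketch above and that the f8 types are exposed on `ov.Type` in the Python API:

```python
import openvino as ov

# The f8 weights should survive as f8 Constants feeding a decompression Convert.
f8_consts = [
    op for op in converted.get_ops()
    if op.get_type_name() == "Constant"
    and op.get_output_element_type(0) == ov.Type.f8e4m3
]
assert f8_consts, "expected f8 weights to keep their original precision"
```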

src/frontends/pytorch/src/frontend.cpp

+1 -1
@@ -273,7 +273,7 @@ void FrontEnd::normalize(const std::shared_ptr<ov::Model>& model) const {
     manager.register_pass<ov::pass::ConvertConvertLike>();
     manager.register_pass<ov::frontend::pytorch::pass::AtenIndexToSelect>();

-    // Mark quantized and f16/bf16 compressed constants to prevent CF for them,
+    // Mark low precision compressed constants to prevent CF for them,
     // so that not extra memory is used for intermediate decompressed constants.
     manager.register_pass<ov::pass::MarkCompressedFloatConstants>();

src/frontends/pytorch/src/op/embedding.cpp

+3 -1
@@ -33,7 +33,9 @@ OutputVector translate_embedding_ext(const NodeContext& context) {
     // used in 16-bit patching
     num_inputs_check(context, 2, 5);
     auto data = context.get_input(0);
-    data = context.mark_node(std::make_shared<ov::op::v0::Convert>(data, element::f32));
+    if (data.get_element_type() != element::f32) {
+        data = context.mark_node(std::make_shared<ov::op::v0::Convert>(data, element::f32));
+    }
     auto indices = context.get_input(1);
     indices = context.mark_node(std::make_shared<ov::op::v0::Convert>(indices, element::i32));
     auto axis_0 = context.mark_node(ov::op::v0::Constant::create(element::i32, Shape{}, {0}));

src/frontends/pytorch/src/op/linear.cpp

+2 -3
@@ -36,9 +36,8 @@ OutputVector translate_linear_ext(const NodeContext& context) {
     auto x = context.get_input(0);
     auto initial_x = x;
     auto weight = context.get_input(1);
-    bool is_compressed = weight.get_element_type() == element::f16 || weight.get_element_type() == element::bf16;
     bool convert_back = false;
-    if (is_compressed) {
+    if (weight.get_element_type() != element::f32) {
         // In case of patched linear it can have mixed fp16/bf16 and fp32 input type.
         // In other cases these conversion is not required.
         weight = context.mark_node(std::make_shared<v0::Convert>(weight, element::f32));
@@ -52,7 +51,7 @@ OutputVector translate_linear_ext(const NodeContext& context) {
     if (!context.input_is_none(2)) {
         auto bias = context.get_input(2);

-        if (bias.get_element_type() == element::f16 || bias.get_element_type() == element::bf16) {
+        if (bias.get_element_type() != element::f32) {
             // Same reason as for weight.
             bias = context.mark_node(std::make_shared<v0::Convert>(bias, element::f32));
         }

tests/layer_tests/py_frontend_tests/test_torch_frontend.py

+56 -1
@@ -428,7 +428,6 @@ def sin_op(context):
         "Parameter", "Sin", "Result"]


-
 def test_multiple_module_extension():
     from openvino.frontend.pytorch.ts_decoder import TorchScriptPythonDecoder
     from openvino.frontend.pytorch import ModuleExtension
@@ -764,6 +763,7 @@ def forward(self, x1, x2):
     np.testing.assert_allclose(res_bf16[0], res_ref[0].numpy(), atol=1e-2)
     np.testing.assert_allclose(res_bf16[1], res_ref[1].numpy(), atol=1e-2)

+
 def test_patched_16bit_model_with_convert():
     from openvino.frontend.pytorch import patch_model
     from openvino import convert_model, Type
@@ -797,6 +797,61 @@ def forward(self, x):
     assert mm_num == 2


+def test_patched_8bit_model_converts():
+    from openvino.frontend.pytorch import patch_model
+    from openvino import convert_model, compile_model
+    from transformers.pytorch_utils import Conv1D
+
+    class ModelWithLinear(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+
+            self.branch1 = torch.nn.Sequential(
+                torch.nn.Embedding(10, 64),
+                torch.nn.Linear(64, 32),
+                torch.nn.ReLU()
+            )
+            self.branch2 = torch.nn.Sequential(
+                Conv1D(256, 128),
+                torch.nn.Linear(256, 64), torch.nn.ReLU()
+            )
+            self.buffer = torch.ones(32)
+
+        def forward(self, x1, x2):
+            out1 = self.branch1(x1)
+            out2 = self.branch2(x2)
+            return (out1 + self.buffer, out2)
+
+    example = (torch.randint(0, 10, [32, 64]), torch.randn(32, 128))
+
+    model_ref = ModelWithLinear().to(torch.float8_e4m3fn).float()
+    with torch.no_grad():
+        res_ref = model_ref(*example)
+    model_f8_e4m3 = model_ref.to(torch.float8_e4m3fn)
+    patch_model.__make_16bit_traceable(model_f8_e4m3)
+    # the approach with patching only works for node with no grad
+    with torch.no_grad():
+        converted_model = convert_model(model_f8_e4m3, example_input=example)
+    assert converted_model
+    cm_f8_e4m3 = compile_model(converted_model, "CPU")
+    res_f8_e4m3 = cm_f8_e4m3([x.numpy() for x in example])
+    np.testing.assert_allclose(res_f8_e4m3[0], res_ref[0].numpy(), atol=1e-2)
+    np.testing.assert_allclose(res_f8_e4m3[1], res_ref[1].numpy(), atol=1e-2)
+
+    model_ref = ModelWithLinear().to(torch.float8_e5m2).float()
+    with torch.no_grad():
+        res_ref = model_ref(*example)
+    model_f8_e5m2 = model_ref.to(torch.float8_e5m2)
+    patch_model.__make_16bit_traceable(model_f8_e5m2)
+    # the approach with patching only works for node with no grad
+    with torch.no_grad():
+        converted_model = convert_model(model_f8_e5m2, example_input=example)
+    assert converted_model
+    cm_f8_e5m2 = compile_model(converted_model, "CPU")
+    res_f8_e5m2 = cm_f8_e5m2([x.numpy() for x in example])
+    np.testing.assert_allclose(res_f8_e5m2[0], res_ref[0].numpy(), atol=1e-2)
+    np.testing.assert_allclose(res_f8_e5m2[1], res_ref[1].numpy(), atol=1e-2)
+

 class InlinedInputsModel(torch.nn.Module):
     def __init__(self):
