[CPU]Gather node executed with f16/bf16 precision (#29239)

chenhu-wang · web-flow · commit 871ab4af716a · 2025-03-14T05:56:53.000Z
### Details: - *The Gather node is now executed in f16/bf16 precision: it accepts f16/bf16 input directly, without weight decompression. Otherwise the weight decompression (Convert f16/bf16 to f32) is constant-folded and consumes a large amount of memory.* - [x] - *adding test* ### Tickets: - *CVS-161854*
diff --git a/src/common/transformations/include/transformations/op_conversions/convert_gather_to_compressed.hpp b/src/common/transformations/include/transformations/op_conversions/convert_gather_to_compressed.hpp
@@ -4,19 +4,65 @@
 
 #pragma once
 
+#include "openvino/pass/graph_rewrite.hpp"
 #include "openvino/pass/matcher_pass.hpp"
 #include "transformations_visibility.hpp"
 
 namespace ov {
 namespace pass {
 
+class TRANSFORMATIONS_API CompressedGatherTransformation;
 class TRANSFORMATIONS_API ConvertGatherToGatherCompressed;
+class TRANSFORMATIONS_API MoveDecompressionAfterGather;
 
 }  // namespace pass
 }  // namespace ov
 
+/*
+ * ConvertGatherToGatherCompressed transforms a Gather node with a constant weight decompression pattern
+ * (U8/NF4/U4/I4 + Subtract + Multiply) into a GatherCompressed node, which handles decompression internally.
+ *
+ *                        Subtract_const(U8/NF4/U4/I4)
+ *                             /
+ *    Weights(U8/NF4/U4/I4)  Convert(F32)                              Weights  Subtract_const Multiply_const Indices
+ *       |                 /                                     (U8/NF4/U4/I4) (U8/NF4/U4/I4) (F32)          (I32)
+ *    Convert(F32)   Reshape(optional)                                  \            \        /            /
+ *            \        /       Multiply_const(F32)      ------>           \           \      /           /
+ *            Subtract(optional)     /                                      \          \    /          /
+ *                  \       Reshape(optional)                                 \         \  /         /
+ *                   \       /                                                    GatherCompressed
+ *    Indices(I32)    Multiply
+ *            \     /
+ *             Gather
+ */
 class ov::pass::ConvertGatherToGatherCompressed : public ov::pass::MatcherPass {
 public:
     OPENVINO_MATCHER_PASS_RTTI("ConvertGatherToGatherCompressed");
     ConvertGatherToGatherCompressed();
 };
+
+/*
+ * MoveDecompressionAfterGather transforms a Gather node with a constant weight decompression pattern (FP16/BF16 +
+ * Convert(FP32)) into a Gather node with compressed (FP16/BF16) weights, and moves the decompression after the Gather.
+ *
+ *    Weights(FP16/BF16)                            Weights(FP16/BF16) Indices(I32)
+ *         |                                                   \         /
+ *    Convert(F32)   Indices(I32)    ------>                 Gather(FP16/BF16)
+ *          \           /                                           |
+ *           Gather(F32)                                        Convert(F32)
+ *
+ */
+class ov::pass::MoveDecompressionAfterGather : public ov::pass::MatcherPass {
+public:
+    OPENVINO_MATCHER_PASS_RTTI("MoveDecompressionAfterGather");
+    MoveDecompressionAfterGather();  // defined in convert_gather_to_compressed.cpp; builds and registers the matcher
+};
+
+class ov::pass::CompressedGatherTransformation : public ov::pass::GraphRewrite {
+public:
+    OPENVINO_GRAPH_REWRITE_RTTI("CompressedGatherTransformation");
+    CompressedGatherTransformation() {  // groups both Gather weight-decompression matchers in one GraphRewrite
+        add_matcher<ov::pass::ConvertGatherToGatherCompressed>();  // U8/NF4/U4/I4 weights -> GatherCompressed
+        add_matcher<ov::pass::MoveDecompressionAfterGather>();     // FP16/BF16 weights: Convert moved after Gather
+    }
+};
diff --git a/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp b/src/common/transformations/src/transformations/op_conversions/convert_gather_to_compressed.cpp
@@ -16,6 +16,7 @@
 #include "openvino/pass/pattern/op/pattern.hpp"
 #include "openvino/pass/pattern/op/wrap_type.hpp"
 #include "ov_ops/gather_compressed.hpp"
+#include "transformations/rt_info/keep_const_precision.hpp"
 #include "transformations/utils/utils.hpp"
 
 ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
@@ -146,3 +147,40 @@ ov::pass::ConvertGatherToGatherCompressed::ConvertGatherToGatherCompressed() {
     auto m = std::make_shared<ov::pass::pattern::Matcher>(gather_m, "ConvertGatherToGatherCompressed");
     this->register_matcher(m, callback);
 }
+
+ov::pass::MoveDecompressionAfterGather::MoveDecompressionAfterGather() {
+    using namespace ov::pass::pattern;
+    // Match Constant(f16/bf16) -> Convert(f32, one consumer) on input 0 of Gather-8; input 2 must be a Constant.
+    auto dicts = wrap_type<ov::op::v0::Constant>(pattern::type_matches_any({element::f16, element::bf16}));
+    auto convert_predicate = [](ov::Output<ov::Node> output) -> bool {
+        return pattern::consumers_count(1)(output) && pattern::type_matches(ov::element::f32)(output);
+    };
+    auto convert = wrap_type<ov::op::v0::Convert>({dicts}, convert_predicate);
+    auto gather = wrap_type<ov::op::v8::Gather>({convert, any_input(), wrap_type<ov::op::v0::Constant>()});
+
+    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
+        const auto& pattern_map = m.get_pattern_value_map();
+        auto constant_node = pattern_map.at(dicts).get_node_shared_ptr();
+        auto convert_node = pattern_map.at(convert).get_node_shared_ptr();
+        auto gather_node = pattern_map.at(gather).get_node_shared_ptr();
+        if (transformation_callback(gather_node)) {  // leave the graph untouched when the callback returns true
+            return false;
+        }
+        // Re-create the Gather directly on the f16/bf16 constant and apply a copy of the Convert to its output.
+        auto new_gather = gather_node->clone_with_new_inputs(
+            {constant_node, gather_node->get_input_source_output(1), gather_node->get_input_source_output(2)});
+        auto new_convert = convert_node->clone_with_new_inputs({new_gather});
+        register_new_node(new_gather);
+        register_new_node(new_convert);
+        // Tag the constant with the keep-const-precision rt_info (presumably so later passes keep it f16/bf16).
+        ov::enable_keep_const_precision(constant_node);
+        // The new Convert inherits the old Gather's friendly name and replaces it in the graph.
+        new_convert->set_friendly_name(gather_node->get_friendly_name());
+        ov::copy_runtime_info({convert_node, gather_node}, {new_gather, new_convert});
+        replace_node(gather_node, new_convert);
+        return true;
+    };
+
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(gather, "MoveDecompressionAfterGather");
+    this->register_matcher(m, callback);
+}
diff --git a/src/common/transformations/tests/op_conversions/convert_gather_to_compressed_test.cpp b/src/common/transformations/tests/op_conversions/convert_gather_to_compressed_test.cpp
@@ -225,3 +225,50 @@ TEST_F(TransformationTestsF, ConvertGatherToCompressedMultiOutput) {
         model_ref = std::make_shared<ov::Model>(ov::NodeVector{gather_compressed}, ov::ParameterVector{input1, input2});
     }
 }
+
+// In the compressed FP16/BF16 weight case, a Gather with the constant weight decompression pattern (FP16/BF16 +
+// Convert(FP32)) is transformed into a Gather with compressed (FP16/BF16) weights, and the decompression Convert is
+// moved after the Gather, so no GatherCompressed node should be generated.
+TEST_F(TransformationTestsF, MoveDecompressionAfterGatherFP16Weight) {
+    {  // model under test: Constant(f16) -> Convert(f32) -> Gather
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{-1, 16});
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1});
+        auto weights_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{32, 16}, {1});
+        auto convert = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f32);
+        auto gather = std::make_shared<ov::op::v8::Gather>(convert, input1, axis_const);
+
+        model = std::make_shared<ov::Model>(ov::NodeVector{gather}, ov::ParameterVector{input1});
+        manager.register_pass<MoveDecompressionAfterGather>();
+    }
+    {  // reference: Gather reads the f16 constant directly; Convert(f32) is applied to the Gather output
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{-1, 16});
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1});
+        auto weights_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{32, 16}, {1});
+        auto gather = std::make_shared<ov::op::v8::Gather>(weights_const, input1, axis_const);
+        auto convert = std::make_shared<ov::op::v0::Convert>(gather, ov::element::f32);
+
+        model_ref = std::make_shared<ov::Model>(ov::NodeVector{convert}, ov::ParameterVector{input1});
+    }
+}
+
+TEST_F(TransformationTestsF, MoveDecompressionAfterGatherBF16Weight) {
+    {  // model under test: Constant(bf16) -> Convert(f32) -> Gather
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{-1, 16});
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1});
+        auto weights_const = ov::op::v0::Constant::create(ov::element::bf16, ov::Shape{32, 16}, {1});
+        auto convert = std::make_shared<ov::op::v0::Convert>(weights_const, ov::element::f32);
+        auto gather = std::make_shared<ov::op::v8::Gather>(convert, input1, axis_const);
+
+        model = std::make_shared<ov::Model>(ov::NodeVector{gather}, ov::ParameterVector{input1});
+        manager.register_pass<MoveDecompressionAfterGather>();
+    }
+    {  // reference: Gather reads the bf16 constant directly; Convert(f32) is applied to the Gather output
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::i32, ov::PartialShape{-1, 16});
+        auto axis_const = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1});
+        auto weights_const = ov::op::v0::Constant::create(ov::element::bf16, ov::Shape{32, 16}, {1});
+        auto gather = std::make_shared<ov::op::v8::Gather>(weights_const, input1, axis_const);
+        auto convert = std::make_shared<ov::op::v0::Convert>(gather, ov::element::f32);
+
+        model_ref = std::make_shared<ov::Model>(ov::NodeVector{convert}, ov::ParameterVector{input1});
+    }
+}
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -359,7 +359,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     decompression_handling_manager.set_per_pass_validation(false);
     CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::InitNodeInfo);
     const bool useLpt = !defaultPrecisions.empty();
-    CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::ConvertGatherToGatherCompressed);
+    CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::CompressedGatherTransformation);
     CPU_REGISTER_PASS_COMMON(decompression_handling_manager, ov::pass::MarkShapeOfSubgraphs);
     // We need to fuse Transpose to MatMul to have a simpler callback for the next transformation
     CPU_REGISTER_PASS_X64(decompression_handling_manager, ov::pass::TransposeMatMul);
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/gather_weights_decompression.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/gather_weights_decompression.cpp
@@ -45,4 +45,21 @@ INSTANTIATE_TEST_SUITE_P(smoke_GatherCompressedWeights_basic,
                                             ::testing::ValuesIn(per_tensor_scale)),
                          GatherWeightsDecompression::get_test_case_name);
 
+// fp16/bf16 constant + convert(16bit to f32) + gather case
+TEST_P(GatherWeightsDecompressionWithoutScale, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+    run();
+    check_results();  // the parameterless overload declared by GatherWeightsDecompressionWithoutScale
+}
+const std::vector<ov::element::Type> weights_precisions_wo_scale = {ov::element::f16, ov::element::bf16};
+const std::vector<ov::element::Type> output_precisions_wo_scale = {ov::element::f32, ov::element::f16, ov::element::bf16};
+
+INSTANTIATE_TEST_SUITE_P(smoke_GatherCompressedWeightsWithoutScale_basic,
+                         GatherWeightsDecompressionWithoutScale,
+                         ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_CPU),  // device name
+                                            ::testing::ValuesIn(input_shapes_basic),  // shape params
+                                            ::testing::ValuesIn(weights_precisions_wo_scale),  // data type
+                                            ::testing::ValuesIn(output_precisions_wo_scale)),  // output type
+                         GatherWeightsDecompressionWithoutScale::get_test_case_name);
+
 }  // namespace
diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/gather_weights_decompression.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/gather_weights_decompression.hpp
@@ -11,6 +11,12 @@
 namespace ov {
 namespace test {
 
+class GatherWeightsDecompressionBase : virtual public ov::test::SubgraphBaseTest {
+protected:  // helpers shared by the GatherWeightsDecompression* fixtures that derive from this base
+    void generate_inputs(const std::vector<ov::Shape>& target_input_static_shapes) override;
+    void check_results(const ov::element::Type& weights_precision, const size_t& num_exec_ops_expect);
+};
+
 /*
  *                        Subtract_const(U8/NF4/U4/I4)
  *                             /
@@ -36,7 +42,7 @@ using GatherWeightsDecompressionParams = std::tuple<std::string,        // Devic
                                                     bool>;              // per-tensor zero-point
 
 class GatherWeightsDecompression : public testing::WithParamInterface<GatherWeightsDecompressionParams>,
-                                   virtual public ov::test::SubgraphBaseTest {
+                                   virtual public ov::test::GatherWeightsDecompressionBase {
 public:
     static std::string get_test_case_name(testing::TestParamInfo<GatherWeightsDecompressionParams> obj);
 
@@ -52,10 +58,37 @@ class GatherWeightsDecompression : public testing::WithParamInterface<GatherWeig
                                              const bool reshape_on_decompression,
                                              const bool per_tensor_zp,
                                              const bool per_tensor_scale);
-    void generate_inputs(const std::vector<ov::Shape>& target_input_static_shapes) override;
     void check_results();
     void SetUp() override;
 };
 
+/*
+ *    Weights(FP16/BF16)
+ *          |
+ *    Convert(F32) Indices(I32)
+ *          \      /
+ *           Gather
+ */
+using GatherWeightsDecompressionWithoutScaleParams = std::tuple<std::string,  // Device name
+                                                                GatherDecompressionShapeParams,
+                                                                ov::element::Type,   // data type
+                                                                ov::element::Type>;  // output type
+
+class GatherWeightsDecompressionWithoutScale
+    : public testing::WithParamInterface<GatherWeightsDecompressionWithoutScaleParams>,
+      virtual public ov::test::GatherWeightsDecompressionBase {
+public:  // get_test_case_name is the name generator handed to INSTANTIATE_TEST_SUITE_P
+    static std::string get_test_case_name(testing::TestParamInfo<GatherWeightsDecompressionWithoutScaleParams> obj);
+
+protected:
+    std::shared_ptr<ov::Model> init_subgraph(const ov::Shape& data_shape,
+                                             const ov::PartialShape& indices_shape,
+                                             const int axis,
+                                             const int64_t batch_dims,
+                                             const ov::element::Type data_precision,
+                                             const ov::element::Type output_precision);
+    void check_results();  // parameterless; the base declares check_results(weights_precision, num_exec_ops_expect)
+    void SetUp() override;
+};
 }  // namespace test
 }  // namespace ov
diff --git a/src/tests/functional/shared_test_classes/src/subgraph/gather_weights_decompression.cpp b/src/tests/functional/shared_test_classes/src/subgraph/gather_weights_decompression.cpp