Commit 72268b5

[CPU] Add Clamp for FakeConvertDecomposition (openvinotoolkit#28651)
### Details:
- *The ngraph FakeConvert layer applies clamp for f8 (f8e4m3 applies clamp, f8e5m2 partially applies clamp), while the ngraph Convert layer doesn't apply clamp at all. The idea is therefore to add a Clamp layer in FakeConvertDecomposition, to ensure the clamp behavior of FakeConvert is still preserved for plugins after decomposition.*
- *The ngraph reference emulate_f8e4m3_on_fp16 applies clamp to overflowed values as well as to NaN (f8e4m3 has no INF in its specification). However, emulate_f8e5m2_on_fp16 only applies clamp to overflowed values (via the can_round flag), not to INF. To align the behavior of f8e4m3 and f8e5m2, clamp for INF is added to emulate_f8e5m2_on_fp16.*
- *Test cases are added that reproduce the issue beforehand.*

### Tickets:
- *[CVS-160719](https://jira.devtools.intel.com/browse/CVS-160719)*
- *[CVS-161459](https://jira.devtools.intel.com/browse/CVS-161459)*
1 parent 9d86b8c commit 72268b5

File tree

9 files changed: +84 -33 lines changed
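To make the fix concrete before the per-file diffs: a minimal plain-C++ sketch (illustrative names, not OpenVINO API; rounding near the range boundary is simplified away) of the behavior gap the Clamp closes. A bare Convert-style narrowing overflows to ±inf, while the FakeConvert reference saturates to the f8 finite range:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

// Largest finite f8e5m2 value per the FP8 spec (f8e4m3 has no INF at all,
// which is why its reference emulation already clamps overflow and NaN).
constexpr float F8E5M2_MAX = 57344.0f;

// Mimics a bare Convert: out-of-range values overflow to +/-inf.
float convert_no_clamp(float x) {
    return std::fabs(x) > F8E5M2_MAX ? std::copysign(INFINITY, x) : x;
}

// Mimics Clamp + Convert: out-of-range values saturate to the finite range.
float convert_with_clamp(float x) {
    return std::clamp(x, -F8E5M2_MAX, F8E5M2_MAX);
}

int main() {
    // Without the Clamp the decomposed graph emits inf; FakeConvert expects 57344.
    std::printf("%g vs %g\n", convert_no_clamp(1e6f), convert_with_clamp(1e6f));
}
```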

src/common/transformations/src/transformations/op_conversions/fake_convert_decomposition.cpp

+13 -2

@@ -7,6 +7,7 @@
 #include "itt.hpp"
 #include "openvino/core/rt_info.hpp"
 #include "openvino/op/add.hpp"
+#include "openvino/op/clamp.hpp"
 #include "openvino/op/constant.hpp"
 #include "openvino/op/convert.hpp"
 #include "openvino/op/divide.hpp"
@@ -41,20 +42,30 @@ ov::pass::FakeConvertDecomposition::FakeConvertDecomposition() {
         data = decomp_ops.add(data.get_node_shared_ptr());
     }

+    // Align with clamp behavior of FakeConvert in ngraph reference
+    const auto lower_bound = fake_convert_node->get_destination_element_type() == ov::element::f8e4m3
+                                 ? static_cast<float>(std::numeric_limits<ov::float8_e4m3>::lowest())
+                                 : static_cast<float>(std::numeric_limits<ov::float8_e5m2>::lowest());
+    const auto upper_bound = fake_convert_node->get_destination_element_type() == ov::element::f8e4m3
+                                 ? static_cast<float>(std::numeric_limits<ov::float8_e4m3>::max())
+                                 : static_cast<float>(std::numeric_limits<ov::float8_e5m2>::max());
+
     std::shared_ptr<Node> result;
     const auto scale = decomp_ops.make<ov::op::v1::Multiply>(data, input_scale);
     if (fake_convert_node->get_input_size() == 2) {
+        const auto clamp = decomp_ops.make<ov::op::v0::Clamp>(scale, lower_bound, upper_bound);
         const auto downconvert =
-            decomp_ops.make<ov::op::v0::Convert>(scale, fake_convert_node->get_destination_element_type());
+            decomp_ops.make<ov::op::v0::Convert>(clamp, fake_convert_node->get_destination_element_type());
         const auto upconvert = decomp_ops.make<ov::op::v0::Convert>(downconvert, input_type);

         result = decomp_ops.make<ov::op::v1::Divide>(upconvert, input_scale);
     } else {
         const Output<Node> input_shift{fake_convert_node->input_value(2)};
         const auto shift = decomp_ops.make<ov::op::v1::Subtract>(scale, input_shift);

+        const auto clamp = decomp_ops.make<ov::op::v0::Clamp>(shift, lower_bound, upper_bound);
         const auto downconvert =
-            decomp_ops.make<ov::op::v0::Convert>(shift, fake_convert_node->get_destination_element_type());
+            decomp_ops.make<ov::op::v0::Convert>(clamp, fake_convert_node->get_destination_element_type());
         const auto upconvert = decomp_ops.make<ov::op::v0::Convert>(downconvert, input_type);

         const auto deshift = decomp_ops.make<ov::op::v1::Add>(upconvert, input_shift);
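For orientation, a hedged sketch (not code from the commit) of the subgraph the pass now emits for the two-input FakeConvert, built against the public OpenVINO C++ API with illustrative shapes and an f8e5m2 destination:

```cpp
#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/clamp.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/divide.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/parameter.hpp"

// Multiply(scale) -> Clamp(f8 finite range) -> Convert(f8) -> Convert(back) -> Divide(scale)
std::shared_ptr<ov::Model> make_decomposed_fake_convert() {
    const auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{2, 4});
    const auto scale = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {0.5f});

    const auto scaled = std::make_shared<ov::op::v1::Multiply>(data, scale);
    // The pass derives these bounds from std::numeric_limits<ov::float8_e5m2>; hard-coded here.
    const auto clamp = std::make_shared<ov::op::v0::Clamp>(scaled, -57344.0, 57344.0);
    const auto down = std::make_shared<ov::op::v0::Convert>(clamp, ov::element::f8e5m2);
    const auto up = std::make_shared<ov::op::v0::Convert>(down, ov::element::f32);
    const auto result = std::make_shared<ov::op::v1::Divide>(up, scale);

    return std::make_shared<ov::Model>(ov::OutputVector{result}, ov::ParameterVector{data});
}
```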

src/common/transformations/tests/op_conversions/fake_convert_decomposition_test.cpp

+11 -2

@@ -80,17 +80,26 @@ TEST_P(FakeConvertDecompositionTest, CompareFunctions) {
     params.push_back(input_data);
     std::shared_ptr<Node> data = input_data;

+    const auto lower_bound = dst_prec == ov::element::f8e4m3
+                                 ? static_cast<float>(std::numeric_limits<ov::float8_e4m3>::lowest())
+                                 : static_cast<float>(std::numeric_limits<ov::float8_e5m2>::lowest());
+    const auto upper_bound = dst_prec == ov::element::f8e4m3
+                                 ? static_cast<float>(std::numeric_limits<ov::float8_e4m3>::max())
+                                 : static_cast<float>(std::numeric_limits<ov::float8_e5m2>::max());
+
     std::shared_ptr<Node> result;
     const auto scale = std::make_shared<ov::op::v1::Multiply>(data, input_scale);
     if (default_shift) {
-        const auto downconvert = std::make_shared<ov::op::v0::Convert>(scale, dst_prec);
+        const auto clamp = std::make_shared<ov::op::v0::Clamp>(scale, lower_bound, upper_bound);
+        const auto downconvert = std::make_shared<ov::op::v0::Convert>(clamp, dst_prec);
         const auto upconvert = std::make_shared<ov::op::v0::Convert>(downconvert, data_prec);

         result = std::make_shared<ov::op::v1::Divide>(upconvert, input_scale);
     } else {
         const auto shift = std::make_shared<ov::op::v1::Subtract>(scale, input_shift);

-        const auto downconvert = std::make_shared<ov::op::v0::Convert>(shift, dst_prec);
+        const auto clamp = std::make_shared<ov::op::v0::Clamp>(shift, lower_bound, upper_bound);
+        const auto downconvert = std::make_shared<ov::op::v0::Convert>(clamp, dst_prec);
         const auto upconvert = std::make_shared<ov::op::v0::Convert>(downconvert, data_prec);

         const auto deshift = std::make_shared<ov::op::v1::Add>(upconvert, input_shift);

src/core/reference/include/openvino/reference/fake_convert.hpp

+6 -4

@@ -18,23 +18,25 @@ namespace func {
  *
  * @param arg_f Pointer to the input data.
  * @param out_f Pointer to the otuput data.
- * @param count Number of elements in the data input.
+ * @param count Number of elements in the data input.
+ * @param use_clamp If use clamp.
  */
-void emulate_f8e5m2_on_fp16(const float16* const arg_f, float16* out_f, size_t count);
+void emulate_f8e5m2_on_fp16(const float16* const arg_f, float16* out_f, size_t count, bool use_clamp = true);

 /**
  * @brief Emulation of conversion fp16 value to f8e4m3 format
  *
  * @param arg_f Pointer to the input data.
  * @param out_f Pointer to the otuput data.
- * @param count Number of elements in the data input.
+ * @param count Number of elements in the data input.
+ * @param use_clamp If use clamp.
  *
  * Exponent denormal values 0 -7
  * Exponent normal values 1..15 -6..8 (7 - exponent)
  * Exponent NaN values 15 8
  *
  */
-void emulate_f8e4m3_on_fp16(const float16* arg_f, float16* out_f, size_t count);
+void emulate_f8e4m3_on_fp16(const float16* arg_f, float16* out_f, size_t count, bool use_clamp = true);
 }  // namespace func

 namespace fake_convert_details {

src/core/reference/src/op/fake_convert.cpp

+17 -12

@@ -7,7 +7,15 @@
 namespace ov {
 namespace reference {
 namespace func {
-void emulate_f8e5m2_on_fp16(const float16* const arg_f, float16* out_f, size_t count) {
+/**
+ * @brief Emulation of conversion fp16 value to f8e5m2 format
+ *
+ * @param arg_f Pointer to the input data.
+ * @param out_f Pointer to the otuput data.
+ * @param count Number of elements in the data input.
+ * @param use_clamp If use clamp.
+ */
+void emulate_f8e5m2_on_fp16(const float16* const arg_f, float16* out_f, size_t count, bool use_clamp) {
     const auto arg_u = reinterpret_cast<const uint16_t*>(arg_f);
     auto out_u = reinterpret_cast<uint16_t*>(out_f);
     uint16_t val_bit_repr;
@@ -24,13 +32,6 @@ void emulate_f8e5m2_on_fp16(const float16* const arg_f, float16* out_f, size_t c
     for (size_t i = 0; i < count; ++i) {
         /// converts float number to half precision in round-to-nearest-even mode and returns half with converted value.
         val_bit_repr = arg_u[i];
-        /// 0x7c00 = 0111110000000000 - exponent mask
-        /// s 11111 xxx xxxx xxxx - is nan (if some x is 1) or inf (if all x is 0)
-        /// 0x7800 is 0111100000000000 and 0x400 is 0000010000000000
-        /// number is not normal if all exponent is 1 or 0
-        /// 0x7f00 is 0 11111 1100000000
-        /// 0x7b00 is 0 11110 1100000000
-        const bool can_round = ((val_bit_repr & 0x7F00) < 0x7B00) ? true : false;
         /// s 11111 xxx xxxx xxxx - is nan (if some x is 1) or inf (if all x is 0)
         const bool is_naninf = ((val_bit_repr & fp16_inf) == fp16_inf) ? true : false;
         /* nearest rounding masks */
@@ -39,7 +40,7 @@ void emulate_f8e5m2_on_fp16(const float16* const arg_f, float16* out_f, size_t c
         /// rne_tie - 0x180 is 0 00000 0110000000 or 384.0
         uint16_t rnmask_tie = (val_bit_repr & rne_tie);

-        if (!is_naninf && can_round) {
+        if (!is_naninf) {
             /* round to nearest even, if rne_mask is enabled */
             /* 0 00000 0010000000, find grs patterns */
             // 0xx - do nothing
@@ -48,6 +49,10 @@ void emulate_f8e5m2_on_fp16(const float16* const arg_f, float16* out_f, size_t c
             val_bit_repr += (((rnmask > 0x0080) || (rnmask_tie == rne_tie)) << lshift);
         }
         val_bit_repr &= mask_mant; /* truncation */
+        if (use_clamp) {
+            // clamp inf to max and -inf to lowest, S.11111.00 -> S.11110.11
+            val_bit_repr -= (((val_bit_repr & 0x7F00) == fp16_inf) << lshift);
+        }
         out_u[i] = val_bit_repr;
     }
 }
@@ -57,19 +62,19 @@ void emulate_f8e5m2_on_fp16(const float16* const arg_f, float16* out_f, size_t c
  *
  * @param arg_f Pointer to the input data.
  * @param out_f Pointer to the otuput data.
- * @param count Number of elements in the data input.
+ * @param count Number of elements in the data input.
+ * @param use_clamp If use clamp.
  *
  * Exponent denormal values 0 -7
  * Exponent normal values 1..15 -6..8 (7 - exponent)
 * Exponent NaN values 15 8
  *
  */
-void emulate_f8e4m3_on_fp16(const float16* arg_f, float16* out_f, size_t count) {
+void emulate_f8e4m3_on_fp16(const float16* arg_f, float16* out_f, size_t count, bool use_clamp) {
     const auto arg_u = reinterpret_cast<const uint16_t*>(arg_f);
     auto out_u = reinterpret_cast<uint16_t*>(out_f);
     uint16_t val_bit_repr;

-    constexpr auto use_clamp = true;
     constexpr auto exp_bits = 5;
     constexpr auto mbits = 9;
     constexpr auto non_mant_bits = exp_bits + 1; /// exponent + sign
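The new use_clamp branch is a pure bit manipulation on the fp16 representation. A standalone sketch of just that step, assuming the kernel's constants (fp16_inf = 0x7C00; lshift = 8, since f8e5m2 keeps only the top two of fp16's ten mantissa bits):

```cpp
#include <cstdint>
#include <cstdio>

constexpr uint16_t fp16_inf = 0x7C00;  // 0 11111 0000000000
constexpr int lshift = 8;              // 10 fp16 mantissa bits - 2 f8e5m2 mantissa bits

// After mantissa truncation +/-inf is S.11111.00; subtracting 1 << lshift turns it
// into S.11110.11, i.e. +/-57344, the f8e5m2 max/lowest. NaN keeps a nonzero kept
// mantissa, fails the mask test, and passes through unchanged.
uint16_t clamp_inf_to_max(uint16_t val_bit_repr) {
    val_bit_repr -= static_cast<uint16_t>(((val_bit_repr & 0x7F00) == fp16_inf) << lshift);
    return val_bit_repr;
}

int main() {
    std::printf("0x%04X\n", clamp_inf_to_max(0x7C00));  // +inf -> 0x7B00 (+57344)
    std::printf("0x%04X\n", clamp_inf_to_max(0xFC00));  // -inf -> 0xFB00 (-57344)
    std::printf("0x%04X\n", clamp_inf_to_max(0x7E00));  // NaN  -> 0x7E00, unchanged
}
```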

src/core/src/type/float8_e5m2.cpp

+1 -1

@@ -28,7 +28,7 @@ constexpr uint8_t f8e5m2_m_mask = 0x03;  // f8e5m2 mantissa bit mask

 uint8_t f32_to_f8e5m2_bits(const float value) {
     auto f16 = static_cast<float16>(value);
-    reference::func::emulate_f8e5m2_on_fp16(&f16, &f16, 1);
+    reference::func::emulate_f8e5m2_on_fp16(&f16, &f16, 1, false);
     return static_cast<uint8_t>((f16.to_bits() >> byte_shift));
 }
 }  // namespace
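Passing use_clamp = false here is deliberate: the scalar ov::float8_e5m2 type keeps IEEE-style overflow-to-infinity, and only FakeConvert saturates. A hedged usage sketch (the public header path is assumed):

```cpp
#include <cstdio>

#include "openvino/core/type/float8_e5m2.hpp"  // assumed public header path

int main() {
    // 65520.0f rounds to +inf in fp16, and with use_clamp == false the scalar
    // conversion keeps it: bits 0 11111 00 == 0x7C (see the tests below).
    const auto f8 = ov::float8_e5m2(65520.0f);
    std::printf("0x%02X\n", static_cast<unsigned>(f8.to_bits()));  // 0x7C
}
```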

src/core/tests/eval.cpp

+7 -4

@@ -3569,10 +3569,13 @@ TEST(eval, evaluate_fake_convert_f32_to_f8e5m2_big_scale_1) {
     EXPECT_EQ(result.get_element_type(), et);
     EXPECT_EQ(result.get_shape(), data_shape);

-    constexpr auto inf = std::numeric_limits<float>::infinity();
-    EXPECT_THAT(
-        read_vector<float>(result),
-        Pointwise(FloatEq(), std::vector<float>{fp8::MAX_F8E5M2 / 2.f, fp8::MAX_F8E5M2, fp8::MAX_F8E5M2, inf, inf}));
+    EXPECT_THAT(read_vector<float>(result),
+                Pointwise(FloatEq(),
+                          std::vector<float>{fp8::MAX_F8E5M2 / 2.f,
+                                             fp8::MAX_F8E5M2,
+                                             fp8::MAX_F8E5M2,
+                                             fp8::MAX_F8E5M2,
+                                             fp8::MAX_F8E5M2}));
 }

 TEST(eval, evaluate_fake_convert_f32_matching_f8_to_f8e5m2_scale_1) {

src/core/tests/float8_e5m2.cpp

+6 -6

@@ -191,28 +191,28 @@ TEST(F8E5M2Test, f8e5m2_num_limits_exp) {
     EXPECT_EQ(max_exp10, 4);
 }

-TEST(F8E5M2Test, f32_ge_f8_max_round_to_inf) {
+TEST(F8E5M2Test, f32_as_f16_inf_gt_f8_max_round_to_inf) {
     const auto f8 = ov::float8_e5m2(65520.0f);

     EXPECT_EQ(f8.to_bits(), 0b01111100);
 }

-TEST(F8E5M2Test, f32_ge_f8_max_round_to_max) {
+TEST(F8E5M2Test, f32_gt_f16_max_gt_f8_max_round_to_inf) {
     const auto f8 = ov::float8_e5m2(65519.9f);

-    EXPECT_EQ(f8.to_bits(), 0b01111011);
+    EXPECT_EQ(f8.to_bits(), 0b01111100);
 }

-TEST(F8E5M2Test, f32_ge_f8_max_round_to_minus_inf) {
+TEST(F8E5M2Test, f32_as_f16_minus_inf_lt_f8_lowest_round_to_minus_inf) {
     const auto f8 = ov::float8_e5m2(-65520.0f);

     EXPECT_EQ(f8.to_bits(), 0b11111100);
 }

-TEST(F8E5M2Test, f32_ge_f8_max_round_to_lowest) {
+TEST(F8E5M2Test, f32_lt_f16_lowest_lt_f8_lowest_round_to_minus_inf) {
     const auto f8 = ov::float8_e5m2(-65519.9f);

-    EXPECT_EQ(f8.to_bits(), 0b11111011);
+    EXPECT_EQ(f8.to_bits(), 0b11111100);
 }

 template <class TContainer>

src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp

+4

@@ -175,6 +175,9 @@ std::vector<std::string> disabledTestPatterns() {
         R"(.*smoke_TopK/TopKLayerTest.Inference.*_k=21_.*_sort=value_modelType=f16_trgDev=CPU.*)",
         // Issue: 121812
         R"(.*ConvertCPULayerTest.*outFmts=(nhwc|nChw8c|nChw16c).*)",
+        // Issue: 123320
+        // Input precision bf16 is converted to fp32 by logic in core_config.cpp during ngraph reference test.
+        R"(.*FakeConvertLayerTest.*dataPrecision=bf16.*)",
         // Need to generate sequence exactly in the i64 data type. Enable in scope of i64 enabling.
         R"(.*RandomUniformLayerTestCPU.*OutPrc=i64.*)",
         // Issue: 123815 (Tests are sensintive to available thread count on testing machines)
@@ -532,6 +535,7 @@ std::vector<std::string> disabledTestPatterns() {
     retVector.emplace_back(R"(.*CompileModelWithCacheEncryptionTest.*CanImportModelWithoutException.*)");
     retVector.emplace_back(R"(.*ConcatMultiQuerySDPTest.*f16.*)");
     retVector.emplace_back(R"(.*ConcatSDPTest.*f16.*)");
+    retVector.emplace_back(R"(.*FakeConvertLayerTest.*f16.*)");
     retVector.emplace_back(R"(.*CoreThreadingTestsWithCacheEnabled.*smoke_compiled_model_cache_enabled.*)");
     retVector.emplace_back(R"(.*CoreThreadingTestsWithIter.*smoke_CompileModel.*)");
     retVector.emplace_back(R"(.*CustomOpConvertI64CPUTest.*CompareWithRefs.*)");

src/tests/functional/shared_test_classes/src/single_op/fake_convert.cpp

+19 -2

@@ -4,6 +4,8 @@

 #include "shared_test_classes/single_op/fake_convert.hpp"

+#include <random>
+
 #include "openvino/opsets/opset1.hpp"
 #include "openvino/opsets/opset13.hpp"

@@ -52,9 +54,24 @@ void FakeConvertLayerTest::SetUp() {

     init_input_shapes(data_shapes);

+    std::vector<float> scale_values(ov::shape_size(scale_shape));
+    std::vector<float> shift_values(ov::shape_size(shift_shape));
+    std::mt19937 gen(0);
+    std::uniform_real_distribution<float> dis(0, static_cast<float>(ov::shape_size(scale_shape)));
+    for (auto& scale_value : scale_values)
+        scale_value = dis(gen);
+    for (auto& shift_value : shift_values)
+        shift_value = dis(gen);
+
+    if (data_prec == ov::element::f16) {
+        configuration.insert(ov::hint::inference_precision(ov::element::f16));
+    } else if (data_prec == ov::element::bf16) {
+        configuration.insert(ov::hint::inference_precision(ov::element::bf16));
+    }
+
     const auto data = std::make_shared<opset1::Parameter>(data_prec, inputDynamicShapes.front());
-    const auto scale = std::make_shared<opset1::Constant>(data_prec, scale_shape);
-    const auto shift = std::make_shared<opset1::Constant>(data_prec, shift_shape);
+    const auto scale = std::make_shared<opset1::Constant>(data_prec, scale_shape, scale_values);
+    const auto shift = std::make_shared<opset1::Constant>(data_prec, shift_shape, shift_values);

     const auto fake_convert = default_shift ? std::make_shared<opset13::FakeConvert>(data, scale, dst_prec)
                                             : std::make_shared<opset13::FakeConvert>(data, scale, shift, dst_prec);
