Skip to content

Commit 9d51a16

Browse files
[CPU] Enabled float (fp32/fp16/bf16) to nf4 precision conversion (#28829)
### Details:
- This PR adds FP32/BF16/FP16 to NF4 support for Convert op in CPU Plugin

### Tickets:
- [CVS-153213](https://jira.devtools.intel.com/browse/CVS-153213)
1 parent c04d828 commit 9d51a16

File tree

5 files changed

+112
-24
lines changed

5 files changed

+112
-24
lines changed

src/plugins/intel_cpu/src/node.cpp

+2-20
Original file line numberDiff line numberDiff line change
@@ -1588,24 +1588,6 @@ ov::element::Type Node::getRuntimePrecision() const {
15881588
}
15891589

15901590
Node* Node::NodesFactory::create(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr& context) {
1591-
// getExceptionDescWithoutStatus removes redundant information from the exception message. For instance, the
1592-
// NotImplemented exception is generated in the form: full_path_to_src_file:line_number [ NOT_IMPLEMENTED ] reason.
1593-
// An example for gather node:
1594-
// /path-to-openVino-root/src/plugins/intel_cpu/nodes/gather.cpp:42 [ NOT_IMPLEMENTED ] Only opset7 Gather operation
1595-
// is supported The most important part of the message is the reason, so the lambda trims everything up to "]" Note
1596-
// that the op type and its friendly name will also be provided if we fail to create the node.
1597-
auto getExceptionDescWithoutStatus = [](const ov::Exception& ex) {
1598-
std::string desc = ex.what();
1599-
size_t pos = desc.find(']');
1600-
if (pos != std::string::npos) {
1601-
if (desc.size() == pos + 1) {
1602-
desc.erase(0, pos + 1);
1603-
} else {
1604-
desc.erase(0, pos + 2);
1605-
}
1606-
}
1607-
return desc;
1608-
};
16091591
Node* newNode = nullptr;
16101592
std::string errorMessage;
16111593
if (newNode == nullptr) {
@@ -1616,7 +1598,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr<ov::Node>& op, const Grap
16161598
}
16171599
} catch (const ov::Exception& ex) {
16181600
if (dynamic_cast<const ov::NotImplemented*>(&ex) != nullptr) {
1619-
errorMessage += getExceptionDescWithoutStatus(ex);
1601+
errorMessage += ex.what();
16201602
} else {
16211603
throw;
16221604
}
@@ -1631,7 +1613,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr<ov::Node>& op, const Grap
16311613
}
16321614
} catch (const ov::Exception& ex) {
16331615
if (dynamic_cast<const ov::NotImplemented*>(&ex) != nullptr) {
1634-
const auto currErrorMess = getExceptionDescWithoutStatus(ex);
1616+
const std::string currErrorMess = ex.what();
16351617
if (!currErrorMess.empty()) {
16361618
errorMessage += errorMessage.empty() ? currErrorMess : "\n" + currErrorMess;
16371619
}

src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp

+51
Original file line numberDiff line numberDiff line change
@@ -893,6 +893,50 @@ struct ConvertFrom4BitPrecision<std::tuple<src_t, dst_t>> {
893893
}
894894
};
895895

896+
#define INTEL_CPU_CVT_TO_4BIT_LIST \
897+
INTEL_CPU_CVT(f32, nf4), INTEL_CPU_CVT(f16, nf4), INTEL_CPU_CVT(bf16, nf4)
898+
899+
struct ConvertTo4BitContext {
900+
ov::element::Type_t outType;
901+
const void* srcPtr;
902+
void* dstPtr;
903+
size_t size;
904+
bool converted;
905+
};
906+
907+
template <typename T>
908+
struct ConvertTo4BitPrecision;
909+
910+
template <typename src_t, typename dst_t>
911+
struct ConvertTo4BitPrecision<std::tuple<src_t, dst_t>> {
912+
void operator()(ConvertTo4BitContext& ctx) {
913+
auto insert_half_byte = [](uint8_t dst, uint8_t val, bool high_half) -> uint8_t {
914+
uint8_t shift = high_half ? 4 : 0;
915+
return dst | (uint8_t) (val << shift);
916+
};
917+
918+
auto src = static_cast<const src_t*>(ctx.srcPtr);
919+
auto dst = static_cast<uint8_t*>(ctx.dstPtr);
920+
// each byte must be fully processed within same thread
921+
auto work_amount = ctx.size / 2;
922+
auto has_tail = ctx.size % work_amount != 0;
923+
if (ctx.outType == ov::element::nf4) {
924+
parallel_for(work_amount, [&](size_t ib) {
925+
size_t idx = ib*2;
926+
const auto val = insert_half_byte(0, ConvertNF4::quantize(static_cast<float>(src[idx])), false);
927+
dst[ib] = insert_half_byte(val, ConvertNF4::quantize(static_cast<float>(src[idx+1])), true);
928+
});
929+
930+
if (has_tail) {
931+
dst[work_amount] = insert_half_byte(0, ConvertNF4::quantize(static_cast<float>(src[2*work_amount])), false);
932+
}
933+
} else {
934+
OPENVINO_THROW("cpu_convert doesn't support output data type: ", ctx.outType, ". Not implemented.");
935+
}
936+
ctx.converted = true;
937+
}
938+
};
939+
896940
#define INTEL_CPU_CVT_FROM_BYTE_FP_LIST \
897941
INTEL_CPU_CVT(f8e8m0, f32), INTEL_CPU_CVT(f8e8m0, bf16), INTEL_CPU_CVT(f8e8m0, f16)
898942

@@ -1017,6 +1061,12 @@ void cpu_convert(const void* srcPtr,
10171061
if (!ctx.converted) {
10181062
OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc);
10191063
}
1064+
} else if (dstPrc.bitwidth() == 4u) {
1065+
ConvertTo4BitContext ctx{dstPrc, srcPtr, dstPtr, size, false};
1066+
OV_SWITCH(intel_cpu, ConvertTo4BitPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_TO_4BIT_LIST);
1067+
if (!ctx.converted) {
1068+
OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc);
1069+
}
10201070
} else if (srcPrc == ov::element::f8e8m0) {
10211071
ConvertFromByteFPContext ctx{srcPrc, srcPtr, dstPtr, size, false};
10221072
OV_SWITCH(intel_cpu,
@@ -1063,6 +1113,7 @@ bool is_supported_convert(ov::element::Type srcPrc, ov::element::Type dstPrc) {
10631113
OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_BIN_LIST);
10641114
OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_4BIT_LIST);
10651115
OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_BYTE_FP_LIST);
1116+
OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_TO_4BIT_LIST);
10661117
return ctx.isSupported;
10671118
}
10681119

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp

+49-4
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,8 @@ void ConvertCPULayerTest::SetUp() {
104104
#if defined(OPENVINO_ARCH_ARM64)
105105
if (inPrc == ov::element::u4 || inPrc == ov::element::i4 ||
106106
inPrc == ov::element::f8e4m3 || inPrc == ov::element::f8e5m2 ||
107-
outPrc == ov::element::f8e4m3 || outPrc == ov::element::f8e5m2) {
107+
outPrc == ov::element::f8e4m3 || outPrc == ov::element::f8e5m2 ||
108+
outPrc == ov::element::nf4) {
108109
primitive = "ref";
109110
} else if (shapes.first.is_static() &&
110111
inPrc != ov::element::bf16 && outPrc != ov::element::bf16 &&
@@ -151,8 +152,16 @@ void ConvertCPULayerTest::generate_inputs(const std::vector<ov::Shape>& targetIn
151152
const auto& funcInputs = function->inputs();
152153
for (size_t i = 0; i < funcInputs.size(); ++i) {
153154
const auto& funcInput = funcInputs[i];
154-
ov::Tensor tensor =
155-
ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]);
155+
ov::Tensor tensor;
156+
if (outPrc == ov::element::nf4) {
157+
tensor = ov::test::utils::create_and_fill_tensor_real_distribution(funcInput.get_element_type(),
158+
targetInputStaticShapes[i],
159+
-1.f,
160+
1.f,
161+
1);
162+
} else {
163+
tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]);
164+
}
156165
if (special_value != ov::test::SpecialValue::none) {
157166
if (inPrc == ov::element::f32) {
158167
modify_value<float>(tensor, special_value);
@@ -176,6 +185,40 @@ void ConvertCPULayerTest::validate_out_prc() const {
176185
FAIL() << "ConvertCPULayerTest supports only non boolean output prc";
177186
}
178187

188+
void ConvertCPULayerTest::validate() {
189+
if (outPrc == ov::element::nf4) {
190+
// Use custom bit-exact validation, because common tests infra doesn't support 4bits tensors comparision
191+
auto actualOutputs = get_plugin_outputs();
192+
auto expectedOutputs = calculate_refs();
193+
ASSERT_EQ(expectedOutputs.size(), actualOutputs.size());
194+
ASSERT_EQ(expectedOutputs.size(), 1);
195+
ASSERT_EQ(expectedOutputs[0].get_shape(), actualOutputs[0].get_shape());
196+
ASSERT_EQ(expectedOutputs[0].get_element_type(), ov::element::nf4);
197+
ASSERT_EQ(expectedOutputs[0].get_element_type(), actualOutputs[0].get_element_type());
198+
199+
auto expected_data = reinterpret_cast<const uint8_t*>(expectedOutputs[0].data());
200+
auto actual_data = reinterpret_cast<const uint8_t*>(actualOutputs[0].data());
201+
size_t byte_count = shape_size(expectedOutputs[0].get_shape()) / 2;
202+
bool has_tile = shape_size(expectedOutputs[0].get_shape()) % 2 != 0;
203+
for (size_t i = 0; i < byte_count; ++i) {
204+
uint8_t expected_value = expected_data[i];
205+
uint8_t actual_value = actual_data[i];
206+
ASSERT_EQ(expected_value, actual_value);
207+
}
208+
209+
// Convert operation doc doesn't specify behavior for odd amount of elements: should upper 4 bits of last byte be filled with zeros or not.
210+
// CPU Plugin fills these bits with zeros as it better fits optimized kernels which get NF4 inputs.
211+
// In general it is considered as UB, so skip the check for last 4 bits.
212+
if (has_tile) {
213+
ASSERT_EQ(expected_data[byte_count] & 0x0F, actual_data[byte_count] & 0x0F);
214+
}
215+
216+
return;
217+
}
218+
219+
SubgraphBaseTest::validate();
220+
}
221+
179222
void ConvertToBooleanCPULayerTest::validate_out_prc() const {
180223
if (outPrc != ov::element::boolean)
181224
FAIL() << "ConvertToBooleanCPULayerTest supports only boolean output prc";
@@ -274,7 +317,9 @@ const std::vector<InputShape>& inShapes_4D_dynamic() {
274317
{
275318
{2, 4, 4, 1},
276319
{2, 17, 5, 4},
277-
{1, 2, 3, 4}
320+
{1, 2, 3, 4},
321+
// odd number of elements
322+
{1, 3, 3, 3}
278323
}
279324
},
280325
{

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class ConvertCPULayerTest : public testing::WithParamInterface<convertLayerTestP
2929
protected:
3030
void SetUp() override;
3131
void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;
32+
void validate() override;
3233
virtual void validate_out_prc() const;
3334

3435
ov::element::Type inPrc, outPrc;

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,15 @@ const std::vector<ov::element::Type> float_precisions = {
6464
ov::element::bf16,
6565
};
6666

67+
// Instantiates Convert tests for float (f32/f16/bf16) -> nf4 on the reference
// ("ref") implementation over dynamic 4D shapes; the shape set includes a case
// with an odd number of elements, exercising the partially filled tail byte
// of the packed 4-bit output.
INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_float_to_nf4, ConvertCPULayerTest,
                        ::testing::Combine(
                                ::testing::ValuesIn(inShapes_4D_dynamic()),
                                ::testing::ValuesIn(float_precisions),
                                ::testing::Values(ov::element::nf4),
                                ::testing::Values(ov::test::SpecialValue::none),
                                ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {"ref"}))),
                        ConvertCPULayerTest::getTestCaseName);
75+
6776
const std::vector<ov::element::Type> f8_precisions = {
6877
ov::element::f8e4m3,
6978
ov::element::f8e5m2,

0 commit comments

Comments
 (0)