[CPU] Avoid rounding to zero for Reduce node in quantized models (#25766)

xuchen-intel · web-flow · commit 3c713d4aec23 · 2024-07-31T09:32:10.000Z
### Details: - *If both the input and output precisions of the Reduce node are integers in the original model, the intermediate floating-point value should be rounded towards zero before it is converted to integer.* - *However, if those integer precisions result from quantization, this rounding should not be done, in order to maintain accuracy.* - *Add corresponding test cases.* ### Tickets: - *CVS-147352*
diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -91,6 +91,7 @@ size_t ReduceKey::hash() const {
     seed = hash_combine(seed, jcp.reduce_mode);
     seed = hash_combine(seed, jcp.fuse_low_precision);
     seed = hash_combine(seed, jcp.fuse_broadcast);
+    seed = hash_combine(seed, jcp.round_to_zero);
     seed = hash_combine(seed, jcp.src_dt);
     seed = hash_combine(seed, jcp.dst_dt);
     seed = get_post_op_hash(seed, *postOps.get());
@@ -101,17 +102,18 @@ size_t ReduceKey::hash() const {
 bool ReduceKey::operator==(const ReduceKey &rhs) const {
     return jcp.layout == rhs.jcp.layout && jcp.reduce_mode == rhs.jcp.reduce_mode &&
            jcp.fuse_low_precision == rhs.jcp.fuse_low_precision &&
+           jcp.fuse_broadcast == rhs.jcp.fuse_broadcast && jcp.round_to_zero == rhs.jcp.round_to_zero &&
            jcp.src_dt == rhs.jcp.src_dt && jcp.dst_dt == rhs.jcp.dst_dt && *postOps.get() == *rhs.postOps.get();
 }
 } // namespace
 
-#if defined(OPENVINO_ARCH_X86_64)
-
 // some utility functions
 static inline bool isFloatCompatible(memory::data_type type) {
     return memory::data_type::f32 == type || memory::data_type::bf16 == type || memory::data_type::f16 == type;
 }
 
+#if defined(OPENVINO_ARCH_X86_64)
+
 template <cpu_isa_t isa>
 struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_generator {
     DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_kernel_f32)
@@ -966,7 +968,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
     inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, memory::data_type dst_dt) {
         Xmm xmm_dst = Xmm(vmm_dst.getIdx());
         Ymm ymm_dst = Ymm(vmm_dst.getIdx());
-        if (!isFloatCompatible(jcp_.src_dt) && !support_intermediate_int) {
+        if (jcp_.round_to_zero && !support_intermediate_int) {
             uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero
         }
         if (convert_f32_to_i32(dst_dt)) {
@@ -1020,7 +1022,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
     }
 
     inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
-        if (!isFloatCompatible(jcp_.src_dt) && !support_intermediate_int) {
+        if (jcp_.round_to_zero && !support_intermediate_int) {
             uni_vroundps(xmm_dst, xmm_dst, 3);
         }
         if (convert_f32_to_i32(dst_dt)) {
@@ -1522,7 +1524,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
         int depthwise_inj_idx = 0;
         int quantization_inj_idx = 0;
         int post_ops_data_offset = 0;
-        if (!isFloatCompatible(jcp_.src_dt)) {
+        if (jcp_.round_to_zero) {
             uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero
         }
 
@@ -1656,7 +1658,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
         Xmm xmm_dst = Xmm(vmm_dst.getIdx());
         Ymm ymm_dst = Ymm(vmm_dst.getIdx());
         // If there is post ops fusing, necessary rounding has ready been done, no need to do it again.
-        if (!post_ops_fusing && !isFloatCompatible(jcp_.src_dt)) {
+        if (!post_ops_fusing && jcp_.round_to_zero) {
             uni_vroundps(vmm_dst, vmm_dst, 3);
         }
         if (!isFloatCompatible(dst_dt)) {
@@ -1710,7 +1712,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
     }
 
     inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
-        if (!post_ops_fusing && !isFloatCompatible(jcp_.src_dt)) {
+        if (!post_ops_fusing && jcp_.round_to_zero) {
             uni_vroundps(xmm_dst, xmm_dst, 3);
         }
         if (!isFloatCompatible(dst_dt)) {
@@ -1913,6 +1915,7 @@ Reduce::Reduce(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr con
         }
         set_use_aux_kernel = false;
         fuse_low_precision = false;
+        round_to_zero = false;
         vec_reduceDH_prc.clear();
         vec_reduceCDW_prc.clear();
         setJITBeyond5D();
@@ -1950,6 +1953,11 @@ void Reduce::initSupportedPrimitiveDescriptors() {
     input_prec = getOriginalInputPrecisionAtPort(REDUCE_DATA);
     output_prec = getOriginalOutputPrecisionAtPort(0);
 
+    if (!isFloatCompatible(DnnlExtensionUtils::ElementTypeToDataType(input_prec)) &&
+        !isFloatCompatible(DnnlExtensionUtils::ElementTypeToDataType(output_prec))) {
+        round_to_zero = true;
+    }
+
     jit_mode = canApplyJIT(input_prec, output_prec);
 
     auto is_precision_sensitive_reduce = [](const Algorithm &algorithm) {
@@ -2194,6 +2202,7 @@ void Reduce::createPrimitive() {
     jcp.layout = layout;
     jcp.reduce_mode = getAlgorithm();
     jcp.fuse_low_precision = fuse_low_precision;
+    jcp.round_to_zero = round_to_zero;
 
 #if defined(OPENVINO_ARCH_X86_64)
     compile_post_kernel = true;
diff --git a/src/plugins/intel_cpu/src/nodes/reduce.h b/src/plugins/intel_cpu/src/nodes/reduce.h
@@ -22,6 +22,7 @@ struct jit_reduce_config_params {
     Algorithm reduce_mode;
     bool fuse_low_precision;
     bool fuse_broadcast;    // if post ops fusion needs broadcast
+    bool round_to_zero;
     dnnl::memory::data_type src_dt;
     dnnl::memory::data_type dst_dt;
     int src_data_size;
@@ -138,6 +139,7 @@ class Reduce : public Node {
     bool jit_beyond_5D = false;
     bool jit_mode = true;
     bool keep_dims = true;
+    bool round_to_zero = false;
     bool is_hybrid_layout = false;
     bool compile_post_kernel = true;
     bool apply_post_kernel = true;
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/integer_reduce_mean.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/integer_reduce_mean.cpp
@@ -0,0 +1,38 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "subgraph_tests/integer_reduce_mean.hpp"
+
+#include <tuple>
+#include <vector>
+
+using namespace ov::test;
+namespace {
+
+const std::vector<ov::element::Type> input_precision = {ov::element::f32};  // used by the quantized suite
+const std::vector<ov::element::Type> integer_input_precision = {ov::element::i32, ov::element::i8, ov::element::u8};
+const std::vector<std::vector<size_t>> input_shape = {{1, 2, 3, 3}};  // one static 4-D shape
+const std::vector<std::vector<size_t>> axes = {{2, 3}};  // reduce over the last two dimensions of input_shape
+
+INSTANTIATE_TEST_SUITE_P(smoke_ReduceMeanQuantized,  // f32 data goes through FakeQuantize before ReduceMean
+                         IntegerReduceMeanTest,
+                         testing::Combine(
+                            ::testing::ValuesIn(input_precision),
+                            ::testing::ValuesIn(input_shape),
+                            ::testing::ValuesIn(axes),
+                            ::testing::Values(true),  // quantized
+                            ::testing::Values(ov::test::utils::DEVICE_CPU)),
+                         IntegerReduceMeanTest::getTestCaseName);
+
+INSTANTIATE_TEST_SUITE_P(smoke_ReduceMeanIntegerInput,  // integer Parameter feeds ReduceMean directly
+                         IntegerReduceMeanTest,
+                         testing::Combine(
+                            ::testing::ValuesIn(integer_input_precision),
+                            ::testing::ValuesIn(input_shape),
+                            ::testing::ValuesIn(axes),
+                            ::testing::Values(false),  // quantized
+                            ::testing::Values(ov::test::utils::DEVICE_CPU)),
+                         IntegerReduceMeanTest::getTestCaseName);
+
+}  // namespace
diff --git a/src/plugins/template/backend/ops/ops_evaluates.hpp b/src/plugins/template/backend/ops/ops_evaluates.hpp
@@ -73,6 +73,10 @@ extern template bool evaluate_node<ov::op::v0::LSTMCell>(std::shared_ptr<ov::Nod
                                                          ov::TensorVector& outputs,
                                                          const ov::TensorVector& inputs);
 
+extern template bool evaluate_node<ov::op::v1::ReduceMean>(std::shared_ptr<ov::Node> node,
+                                                           ov::TensorVector& outputs,
+                                                           const ov::TensorVector& inputs);
+
 OPENVINO_SUPPRESS_DEPRECATED_START
 extern template bool evaluate_node<ov::op::v0::LSTMSequence>(std::shared_ptr<ov::Node> node,
                                                              ov::TensorVector& outputs,
diff --git a/src/plugins/template/backend/ops/reduce_mean.cpp b/src/plugins/template/backend/ops/reduce_mean.cpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/reference/reduce_mean.hpp"
+
+#include "evaluate_node.hpp"
+
+// Typed ReduceMean reference call: reduces inputs[0] over the op's reduction axes into outputs[0].
+template <ov::element::Type_t ET>
+bool evaluate(const std::shared_ptr<ov::op::v1::ReduceMean>& op,
+              ov::TensorVector& outputs,
+              const ov::TensorVector& inputs) {
+    using T = ov::fundamental_type_for<ET>;
+    const auto& data = inputs[0];
+    auto* const dst = outputs[0].data<T>();
+    ov::reference::reduce_mean(data.data<const T>(), dst, data.get_shape(), op->get_reduction_axes());
+    return true;
+}
+
+template <>
+bool evaluate_node<ov::op::v1::ReduceMean>(std::shared_ptr<ov::Node> node,
+                                           ov::TensorVector& outputs,
+                                           const ov::TensorVector& inputs) {
+    const auto reduce_mean = ov::as_type_ptr<ov::op::v1::ReduceMean>(node);  // cast once, shared by every case
+    const auto& element_type = node->get_output_element_type(0);
+    switch (element_type) {  // pick the typed evaluator that matches the output element type
+    case ov::element::bf16:
+        return evaluate<ov::element::bf16>(reduce_mean, outputs, inputs);
+    case ov::element::f16:
+        return evaluate<ov::element::f16>(reduce_mean, outputs, inputs);
+    case ov::element::f32:
+        return evaluate<ov::element::f32>(reduce_mean, outputs, inputs);
+    case ov::element::i8:
+        return evaluate<ov::element::i8>(reduce_mean, outputs, inputs);
+    case ov::element::u8:
+        return evaluate<ov::element::u8>(reduce_mean, outputs, inputs);
+    default:
+        OPENVINO_THROW("Unhandled data type ", element_type, " in evaluate_node()");
+    }
+}
diff --git a/src/plugins/template/backend/opset_int_tbl.hpp b/src/plugins/template/backend/opset_int_tbl.hpp
@@ -62,6 +62,7 @@ _OPENVINO_OP_REG(Multiply, op::v1)
 _OPENVINO_OP_REG(NonMaxSuppression, op::v1)
 _OPENVINO_OP_REG(OneHot, op::v1)
 _OPENVINO_OP_REG(Pad, op::v1)
+_OPENVINO_OP_REG(ReduceMean, op::v1)
 _OPENVINO_OP_REG(Split, op::v1)
 _OPENVINO_OP_REG(Reshape, op::v1)
 _OPENVINO_OP_REG(Select, op::v1)
diff --git a/src/tests/functional/plugin/shared/include/subgraph_tests/integer_reduce_mean.hpp b/src/tests/functional/plugin/shared/include/subgraph_tests/integer_reduce_mean.hpp
@@ -0,0 +1,17 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "shared_test_classes/subgraph/integer_reduce_mean.hpp"
+
+namespace ov {
+namespace test {
+
+TEST_P(IntegerReduceMeanTest, CompareWithRefs) {
+    run();  // not declared by IntegerReduceMeanTest itself, so it comes from one of the fixture's base classes
+}
+
+}  // namespace test
+}  // namespace ov
diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/integer_reduce_mean.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/integer_reduce_mean.hpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "shared_test_classes/base/ov_subgraph.hpp"
+
+namespace ov {
+namespace test {
+
+using IntegerReduceMeanParams = std::tuple<ov::element::Type,    // input precision
+                                           std::vector<size_t>,  // input shape
+                                           std::vector<size_t>,  // axes
+                                           bool,                 // quantized
+                                           const char*           // plugin
+                                           >;
+
+// IntegerReduceMeanTest covers the two rounding scenarios of ReduceMean with integer inputs.
+// Scenario 1: Both the input and output precisions of ReduceMean are integers in the original model, so the
+//             intermediate floating point value is rounded towards zero before it is converted to integer.
+//             Covered by test suite smoke_ReduceMeanIntegerInput.
+// Scenario 2: The integer inputs of ReduceMean result from quantization, so that rounding is not done, in order
+//             to maintain accuracy. Covered by test suite smoke_ReduceMeanQuantized.
+class IntegerReduceMeanTest : public testing::WithParamInterface<IntegerReduceMeanParams>,
+                              public ov::test::SubgraphBaseStaticTest {
+public:
+    // Builds the gtest case name from the parameter tuple.
+    static std::string getTestCaseName(const testing::TestParamInfo<IntegerReduceMeanParams>& obj);
+
+protected:
+    // Builds the model under test from the parameter tuple.
+    void SetUp() override;
+};
+
+}  // namespace test
+}  // namespace ov
diff --git a/src/tests/functional/shared_test_classes/src/subgraph/integer_reduce_mean.cpp b/src/tests/functional/shared_test_classes/src/subgraph/integer_reduce_mean.cpp
@@ -0,0 +1,68 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "shared_test_classes/subgraph/integer_reduce_mean.hpp"
+#include "common_test_utils/node_builders/fake_quantize.hpp"
+
+namespace ov {
+namespace test {
+
+// Composes the gtest case name from the parameter tuple as '_'-separated "key=value" fields:
+// inputPrecision, inputShape, axes, device, quantized.
+std::string IntegerReduceMeanTest::getTestCaseName(const testing::TestParamInfo<IntegerReduceMeanParams>& obj) {
+    ov::element::Type input_precision;
+    std::vector<size_t> input_shape;
+    std::vector<size_t> axes;
+    bool quantized;
+    const char* device;
+    std::tie(input_precision, input_shape, axes, quantized, device) = obj.param;
+    std::ostringstream result;
+    result << "inputPrecision=" << input_precision.to_string() << "_";
+    result << "inputShape=" << ov::test::utils::vec2str(input_shape) << "_";
+    result << "axes=" << ov::test::utils::vec2str(axes) << "_";
+    // Keep the trailing separator so the device name does not fuse with the next key.
+    result << "device=" << device << "_";
+    result << "quantized=" << (quantized ? "true" : "false");
+    return result.str();
+}
+
+// Builds the model under test and stores it in `function`:
+//   Parameter -> [FakeQuantize, only when quantized] -> ReduceMean (keep_dims = true) -> Result.
+// The (precision, shape, axes, quantized, device) tuple comes from GetParam().
+void IntegerReduceMeanTest::SetUp() {
+    ov::element::Type input_precision;
+    std::vector<size_t> input_shape;
+    std::vector<size_t> axes;
+    bool quantized;
+    // targetDevice is not a local: the device parameter is written straight into it.
+    std::tie(input_precision, input_shape, axes, quantized, targetDevice) = this->GetParam();
+
+    const auto dataNode = std::make_shared<ov::op::v0::Parameter>(input_precision, ov::Shape(input_shape));
+    // The reduction axes travel as a 1-D i64 constant with one entry per axis.
+    const auto axesNode = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{axes.size()}, axes);
+
+    // ReduceMean reads either the raw parameter or the parameter passed through FakeQuantize.
+    std::shared_ptr<ov::Node> reduce_input = dataNode;
+    if (quantized) {
+        // Every range constant has shape {1, ..., 1} (rank of the input), i.e. holds a single value.
+        std::vector<size_t> dataFqConstShapes(input_shape.size(), 1);
+        const size_t constDataSize = ov::shape_size(dataFqConstShapes);
+        // 256 levels, with input and output ranges both set to [0, 255].
+        std::vector<float> inputLowData(constDataSize, 0.f);
+        std::vector<float> inputHighData(constDataSize, 255.f);
+        std::vector<float> outputLowData(constDataSize, 0.f);
+        std::vector<float> outputHighData(constDataSize, 255.f);
+        reduce_input = ov::test::utils::make_fake_quantize(
+            dataNode, input_precision, 256, dataFqConstShapes, inputLowData, inputHighData, outputLowData, outputHighData);
+    }
+    const auto reduce_mean = std::make_shared<ov::op::v1::ReduceMean>(reduce_input, axesNode, true);
+
+    // Single-input, single-output model.
+    const ov::ParameterVector inputs{dataNode};
+    const ov::ResultVector outputs{std::make_shared<ov::op::v0::Result>(reduce_mean)};
+    function = std::make_shared<ov::Model>(outputs, inputs);
+}
+
+}  // namespace test
+}  // namespace ov