Merge branch 'master' into uk/changing-sub-byte-i4-element-order

ujjayant-kadian · web-flow · commit ea6731f8a75b · 2024-07-31T11:56:25.000+01:00
diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp
@@ -91,6 +91,7 @@ size_t ReduceKey::hash() const {
     seed = hash_combine(seed, jcp.reduce_mode);
     seed = hash_combine(seed, jcp.fuse_low_precision);
     seed = hash_combine(seed, jcp.fuse_broadcast);
+    seed = hash_combine(seed, jcp.round_to_zero);
     seed = hash_combine(seed, jcp.src_dt);
     seed = hash_combine(seed, jcp.dst_dt);
     seed = get_post_op_hash(seed, *postOps.get());
@@ -101,17 +102,18 @@ size_t ReduceKey::hash() const {
 bool ReduceKey::operator==(const ReduceKey &rhs) const {
     return jcp.layout == rhs.jcp.layout && jcp.reduce_mode == rhs.jcp.reduce_mode &&
            jcp.fuse_low_precision == rhs.jcp.fuse_low_precision &&
+           jcp.fuse_broadcast == rhs.jcp.fuse_broadcast && jcp.round_to_zero == rhs.jcp.round_to_zero &&
            jcp.src_dt == rhs.jcp.src_dt && jcp.dst_dt == rhs.jcp.dst_dt && *postOps.get() == *rhs.postOps.get();
 }
 } // namespace
 
-#if defined(OPENVINO_ARCH_X86_64)
-
 // some utility functions
 static inline bool isFloatCompatible(memory::data_type type) {
     return memory::data_type::f32 == type || memory::data_type::bf16 == type || memory::data_type::f16 == type;
 }
 
+#if defined(OPENVINO_ARCH_X86_64)
+
 template <cpu_isa_t isa>
 struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_generator {
     DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_kernel_f32)
@@ -966,7 +968,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
     inline void store_vector(const Xbyak::Address &op, Vmm vmm_dst, memory::data_type dst_dt) {
         Xmm xmm_dst = Xmm(vmm_dst.getIdx());
         Ymm ymm_dst = Ymm(vmm_dst.getIdx());
-        if (!isFloatCompatible(jcp_.src_dt) && !support_intermediate_int) {
+        if (jcp_.round_to_zero && !support_intermediate_int) {
             uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero
         }
         if (convert_f32_to_i32(dst_dt)) {
@@ -1020,7 +1022,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
     }
 
     inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
-        if (!isFloatCompatible(jcp_.src_dt) && !support_intermediate_int) {
+        if (jcp_.round_to_zero && !support_intermediate_int) {
             uni_vroundps(xmm_dst, xmm_dst, 3);
         }
         if (convert_f32_to_i32(dst_dt)) {
@@ -1522,7 +1524,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
         int depthwise_inj_idx = 0;
         int quantization_inj_idx = 0;
         int post_ops_data_offset = 0;
-        if (!isFloatCompatible(jcp_.src_dt)) {
+        if (jcp_.round_to_zero) {
             uni_vroundps(vmm_dst, vmm_dst, 3); // rounding to zero
         }
 
@@ -1656,7 +1658,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
         Xmm xmm_dst = Xmm(vmm_dst.getIdx());
         Ymm ymm_dst = Ymm(vmm_dst.getIdx());
         // If there is post ops fusing, necessary rounding has ready been done, no need to do it again.
-        if (!post_ops_fusing && !isFloatCompatible(jcp_.src_dt)) {
+        if (!post_ops_fusing && jcp_.round_to_zero) {
             uni_vroundps(vmm_dst, vmm_dst, 3);
         }
         if (!isFloatCompatible(dst_dt)) {
@@ -1710,7 +1712,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
     }
 
     inline void store_scalar(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
-        if (!post_ops_fusing && !isFloatCompatible(jcp_.src_dt)) {
+        if (!post_ops_fusing && jcp_.round_to_zero) {
             uni_vroundps(xmm_dst, xmm_dst, 3);
         }
         if (!isFloatCompatible(dst_dt)) {
@@ -1913,6 +1915,7 @@ Reduce::Reduce(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr con
         }
         set_use_aux_kernel = false;
         fuse_low_precision = false;
+        round_to_zero = false;
         vec_reduceDH_prc.clear();
         vec_reduceCDW_prc.clear();
         setJITBeyond5D();
@@ -1950,6 +1953,11 @@ void Reduce::initSupportedPrimitiveDescriptors() {
     input_prec = getOriginalInputPrecisionAtPort(REDUCE_DATA);
     output_prec = getOriginalOutputPrecisionAtPort(0);
 
+    if (!isFloatCompatible(DnnlExtensionUtils::ElementTypeToDataType(input_prec)) &&
+        !isFloatCompatible(DnnlExtensionUtils::ElementTypeToDataType(output_prec))) {
+        round_to_zero = true;
+    }
+
     jit_mode = canApplyJIT(input_prec, output_prec);
 
     auto is_precision_sensitive_reduce = [](const Algorithm &algorithm) {
@@ -2194,6 +2202,7 @@ void Reduce::createPrimitive() {
     jcp.layout = layout;
     jcp.reduce_mode = getAlgorithm();
     jcp.fuse_low_precision = fuse_low_precision;
+    jcp.round_to_zero = round_to_zero;
 
 #if defined(OPENVINO_ARCH_X86_64)
     compile_post_kernel = true;
diff --git a/src/plugins/intel_cpu/src/nodes/reduce.h b/src/plugins/intel_cpu/src/nodes/reduce.h
@@ -22,6 +22,7 @@ struct jit_reduce_config_params {
     Algorithm reduce_mode;
     bool fuse_low_precision;
     bool fuse_broadcast;    // if post ops fusion needs broadcast
+    bool round_to_zero;
     dnnl::memory::data_type src_dt;
     dnnl::memory::data_type dst_dt;
     int src_data_size;
@@ -138,6 +139,7 @@ class Reduce : public Node {
     bool jit_beyond_5D = false;
     bool jit_mode = true;
     bool keep_dims = true;
+    bool round_to_zero = false;
     bool is_hybrid_layout = false;
     bool compile_post_kernel = true;
     bool apply_post_kernel = true;
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/integer_reduce_mean.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/subgraph_tests/integer_reduce_mean.cpp
@@ -0,0 +1,38 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "subgraph_tests/integer_reduce_mean.hpp"
+
+#include <tuple>
+#include <vector>
+
+using namespace ov::test;
+namespace {
+
+const std::vector<ov::element::Type> input_precision = {ov::element::f32};
+const std::vector<ov::element::Type> integer_input_precision = {ov::element::i32, ov::element::i8, ov::element::u8};
+const std::vector<std::vector<size_t>> input_shape = {{1, 2, 3, 3}};
+const std::vector<std::vector<size_t>> axes = {{2, 3}};
+
+// f32 network input passed through FakeQuantize ahead of ReduceMean (quantized = true).
+INSTANTIATE_TEST_SUITE_P(smoke_ReduceMeanQuantized,
+                         IntegerReduceMeanTest,
+                         ::testing::Combine(::testing::ValuesIn(input_precision),
+                                            ::testing::ValuesIn(input_shape),
+                                            ::testing::ValuesIn(axes),
+                                            ::testing::Values(true),
+                                            ::testing::Values(ov::test::utils::DEVICE_CPU)),
+                         IntegerReduceMeanTest::getTestCaseName);
+
+// Integer network inputs (i32 / i8 / u8) feeding ReduceMean directly (quantized = false).
+INSTANTIATE_TEST_SUITE_P(smoke_ReduceMeanIntegerInput,
+                         IntegerReduceMeanTest,
+                         ::testing::Combine(::testing::ValuesIn(integer_input_precision),
+                                            ::testing::ValuesIn(input_shape),
+                                            ::testing::ValuesIn(axes),
+                                            ::testing::Values(false),
+                                            ::testing::Values(ov::test::utils::DEVICE_CPU)),
+                         IntegerReduceMeanTest::getTestCaseName);
+
+}  // namespace
diff --git a/src/plugins/template/backend/ops/ops_evaluates.hpp b/src/plugins/template/backend/ops/ops_evaluates.hpp
@@ -73,6 +73,10 @@ extern template bool evaluate_node<ov::op::v0::LSTMCell>(std::shared_ptr<ov::Nod
                                                          ov::TensorVector& outputs,
                                                          const ov::TensorVector& inputs);
 
+extern template bool evaluate_node<ov::op::v1::ReduceMean>(std::shared_ptr<ov::Node> node,
+                                                           ov::TensorVector& outputs,
+                                                           const ov::TensorVector& inputs);
+
 OPENVINO_SUPPRESS_DEPRECATED_START
 extern template bool evaluate_node<ov::op::v0::LSTMSequence>(std::shared_ptr<ov::Node> node,
                                                              ov::TensorVector& outputs,
diff --git a/src/plugins/template/backend/ops/reduce_mean.cpp b/src/plugins/template/backend/ops/reduce_mean.cpp
@@ -0,0 +1,41 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "openvino/reference/reduce_mean.hpp"
+
+#include "evaluate_node.hpp"
+
+// Typed ReduceMean evaluation: passes the first input tensor and the op's reduction axes to
+// ov::reference::reduce_mean, which writes into the first output tensor. Always returns true.
+template <ov::element::Type_t ET>
+bool evaluate(const std::shared_ptr<ov::op::v1::ReduceMean>& op,
+              ov::TensorVector& outputs,
+              const ov::TensorVector& inputs) {
+    using T = ov::fundamental_type_for<ET>;
+    const auto& src = inputs[0];
+    ov::reference::reduce_mean(src.data<const T>(), outputs[0].data<T>(), src.get_shape(), op->get_reduction_axes());
+    return true;
+}
+
+template <>
+bool evaluate_node<ov::op::v1::ReduceMean>(std::shared_ptr<ov::Node> node,
+                                           ov::TensorVector& outputs,
+                                           const ov::TensorVector& inputs) {
+    // Pick the typed evaluate<ET>() instantiation that matches the node's first output element type.
+    const auto reduce_mean = ov::as_type_ptr<ov::op::v1::ReduceMean>(node);
+    switch (node->get_output_element_type(0)) {
+    case ov::element::bf16:
+        return evaluate<ov::element::bf16>(reduce_mean, outputs, inputs);
+    case ov::element::f16:
+        return evaluate<ov::element::f16>(reduce_mean, outputs, inputs);
+    case ov::element::f32:
+        return evaluate<ov::element::f32>(reduce_mean, outputs, inputs);
+    case ov::element::i8:
+        return evaluate<ov::element::i8>(reduce_mean, outputs, inputs);
+    case ov::element::u8:
+        return evaluate<ov::element::u8>(reduce_mean, outputs, inputs);
+    default:
+        OPENVINO_THROW("Unhandled data type ", node->get_output_element_type(0), " in evaluate_node()");
+    }
+}
diff --git a/src/plugins/template/backend/opset_int_tbl.hpp b/src/plugins/template/backend/opset_int_tbl.hpp
@@ -62,6 +62,7 @@ _OPENVINO_OP_REG(Multiply, op::v1)
 _OPENVINO_OP_REG(NonMaxSuppression, op::v1)
 _OPENVINO_OP_REG(OneHot, op::v1)
 _OPENVINO_OP_REG(Pad, op::v1)
+_OPENVINO_OP_REG(ReduceMean, op::v1)
 _OPENVINO_OP_REG(Split, op::v1)
 _OPENVINO_OP_REG(Reshape, op::v1)
 _OPENVINO_OP_REG(Select, op::v1)
diff --git a/src/tests/functional/plugin/shared/include/subgraph_tests/integer_reduce_mean.hpp b/src/tests/functional/plugin/shared/include/subgraph_tests/integer_reduce_mean.hpp
@@ -0,0 +1,17 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "shared_test_classes/subgraph/integer_reduce_mean.hpp"
+
+namespace ov {
+namespace test {
+
+TEST_P(IntegerReduceMeanTest, CompareWithRefs) {
+    run();
+}
+
+}  // namespace test
+}  // namespace ov
diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/integer_reduce_mean.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/integer_reduce_mean.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "shared_test_classes/base/ov_subgraph.hpp"
+
+namespace ov {
+namespace test {
+
+typedef std::tuple<ov::element::Type,    // input precision
+                   std::vector<size_t>,  // input shape
+                   std::vector<size_t>,  // axes
+                   bool,                 // quantized
+                   const char*           // plugin
+                   > IntegerReduceMeanParams;
+
+// IntegerReduceMeanTest covers the two rounding scenarios of ReduceMean with integer inputs.
+// Scenario 1: both the input and output precisions of ReduceMean are integer in the original model, so the intermediate
+//             floating-point value is rounded toward zero before conversion. Covered by suite smoke_ReduceMeanIntegerInput.
+// Scenario 2: the integer inputs of ReduceMean result from quantization, so that rounding must not be applied, in order
+//             to maintain accuracy. Covered by suite smoke_ReduceMeanQuantized.
+class IntegerReduceMeanTest : public testing::WithParamInterface<IntegerReduceMeanParams>,
+                       public ov::test::SubgraphBaseStaticTest {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<IntegerReduceMeanParams>& obj);
+
+protected:
+    void SetUp() override;
+};
+
+}  // namespace test
+}  // namespace ov
diff --git a/src/tests/functional/shared_test_classes/src/subgraph/integer_reduce_mean.cpp b/src/tests/functional/shared_test_classes/src/subgraph/integer_reduce_mean.cpp
@@ -0,0 +1,68 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "shared_test_classes/subgraph/integer_reduce_mean.hpp"
+#include "common_test_utils/node_builders/fake_quantize.hpp"
+
+namespace ov {
+namespace test {
+
+// Composes the gtest case name from the test parameters as "_"-separated key=value fields.
+std::string IntegerReduceMeanTest::getTestCaseName(const testing::TestParamInfo<IntegerReduceMeanParams>& obj) {
+    ov::element::Type input_precision;
+    std::vector<size_t> input_shape;
+    std::vector<size_t> axes;
+    bool quantized;
+    const char *device;
+    std::tie(input_precision, input_shape, axes, quantized, device) = obj.param;
+    std::ostringstream result;
+    result << "inputPrecision=" << input_precision.to_string() << "_";
+    result << "inputShape=" << ov::test::utils::vec2str(input_shape) << "_";
+    result << "axes=" << ov::test::utils::vec2str(axes) << "_";
+    // Terminate the device field with "_" so it does not run into the "quantized" field
+    // (without it the name reads "device=<dev>quantized=...").
+    result << "device=" << device << "_";
+    result << "quantized=" << (quantized ? "true" : "false");
+    return result.str();
+}
+
+void IntegerReduceMeanTest::SetUp() {
+    ov::element::Type input_precision;
+    std::vector<size_t> input_shape;
+    std::vector<size_t> axes;
+    std::vector<size_t> axes_shape;
+    bool quantized;
+    std::tie(input_precision, input_shape, axes, quantized, targetDevice) = this->GetParam();
+    axes_shape.push_back(axes.size());
+    // Model under test: Parameter -> (optional FakeQuantize) -> ReduceMean over `axes` -> Result.
+    auto dataNode = std::make_shared<ov::op::v0::Parameter>(input_precision, ov::Shape(input_shape));
+    auto axesNode = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape(axes_shape), axes);
+    // quantized == true routes the data through a FakeQuantize with input/output ranges [0, 255] first.
+    std::shared_ptr<ov::op::v1::ReduceMean> reduce_mean;
+    if (quantized) {
+        std::vector<size_t> dataFqConstShapes(input_shape.size(), 1);
+        size_t constDataSize = ov::shape_size(dataFqConstShapes);
+        std::vector<float> inputLowData(constDataSize), inputHighData(constDataSize), outputLowData(constDataSize), outputHighData(constDataSize);
+        for (size_t i = 0; i < constDataSize; i++) {
+            inputLowData[i] = 0;
+            inputHighData[i] = 255;
+            outputLowData[i] = 0;
+            outputHighData[i] = 255;
+        }
+        auto dataFqNode = ov::test::utils::make_fake_quantize(
+            dataNode, input_precision, 256, dataFqConstShapes, inputLowData, inputHighData, outputLowData, outputHighData);
+        reduce_mean = std::make_shared<ov::op::v1::ReduceMean>(dataFqNode, axesNode, true);
+    } else {
+        reduce_mean = std::make_shared<ov::op::v1::ReduceMean>(dataNode, axesNode, true);
+    }
+    // Expose dataNode as the only model input and the ReduceMean result as the only output.
+    ov::ParameterVector inputs;
+    inputs.push_back(dataNode);
+    ov::ResultVector outputs;
+    outputs.push_back(std::make_shared<ov::op::v0::Result>(reduce_mean));
+    function = std::make_shared<ov::Model>(outputs, inputs);
+}
+
+}  // namespace test
+}  // namespace ov