[CPU][ARM] Pooling: support i32 indices output and NHWC indices layout (#28626)

alvoron · web-flow · commit 4354147f769a · 2025-02-03T19:58:03.000Z
### Details:
- Allow both `u32` and `i32` as the data type of the 2nd `Pooling` output (indices), since this output can contain only non-negative values (according to the OpenVINO operation specifications)
- Support the `NHWC` layout for the 2nd `Pooling` output (indices)

### Tickets:
- CVS-159121
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.cpp
@@ -55,8 +55,22 @@ bool AclPoolingExecutor::isSupported(const TensorInfo& srcTensorInfo,
         DEBUG_LOG("NCHW + CEIL gives an accuracy problem in ACL AvgPool. ACL executor will not be created.");
         return false;
     }
-    DimensionRoundingType round =
-        (poolingAttrs.rounding == op::RoundingType::CEIL) ? DimensionRoundingType::CEIL : DimensionRoundingType::FLOOR;
+    DimensionRoundingType round;
+    switch (poolingAttrs.rounding) {
+    case op::RoundingType::FLOOR:
+        round = DimensionRoundingType::FLOOR;
+        break;
+    case op::RoundingType::CEIL:
+        round = DimensionRoundingType::CEIL;
+        break;
+    // CEIL_TORCH type is mapped to ACL CEIL type
+    case op::RoundingType::CEIL_TORCH:
+        round = DimensionRoundingType::CEIL;
+        break;
+    default:
+        DEBUG_LOG("Unknown rounding type: ", poolingAttrs.rounding);
+        return false;
+    }
 
     if (srcDimsSize == 5) {
         if (dstDescsSize > 1) {
@@ -89,7 +103,12 @@ bool AclPoolingExecutor::isSupported(const TensorInfo& srcTensorInfo,
         pool_info->pool_type = pool_type;
         pool_info->exclude_padding = exclude_padding;
         if (dstDescsSize > 1) {
-            TensorInfo indTensorInfo = TensorInfo(shapeCast(*indDims), 1, arm_compute::DataType::U32, dataLayout);
+            auto indShape = shapeCast(*indDims);
+            if (dataLayout == arm_compute::DataLayout::NHWC) {
+                changeLayoutToNH_C({&indShape});
+            }
+            // U32 is specified since this is the only data type supported by ACL
+            TensorInfo indTensorInfo = TensorInfo(indShape, 1, arm_compute::DataType::U32, dataLayout);
             arm_compute::Status s =
                 arm_compute::NEPoolingLayer::validate(&srcTensorInfo, &dstTensorInfo, *pool_info, &indTensorInfo);
             if (!s) {
@@ -178,10 +197,13 @@ bool AclPoolingExecutor::init(const PoolingAttrs& poolingAttrs,
                 return false;
             }
             auto indDims = dstDescs[1]->getShape().getStaticDims();
-            TensorInfo indTensorInfo = TensorInfo(shapeCast(indDims),
-                                                  1,
-                                                  precisionToAclDataType(dstDescs[1]->getPrecision()),
-                                                  getAclDataLayoutByMemoryDesc(dstDescs[1]));
+            auto indShape = shapeCast(indDims);
+            if (dstTensorInfo.data_layout() == arm_compute::DataLayout::NHWC) {
+                changeLayoutToNH_C({&indShape});
+            }
+            // U32 is specified since this is the only data type supported by ACL
+            TensorInfo indTensorInfo =
+                TensorInfo(indShape, 1, arm_compute::DataType::U32, getAclDataLayoutByMemoryDesc(dstDescs[1]));
             indTensor.allocator()->init(indTensorInfo);
             exec_func = [this, pool_info]() -> std::unique_ptr<IFunction> {
                 auto acl_op = std::make_unique<arm_compute::NEPoolingLayer>();
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_pooling.hpp
@@ -79,7 +79,7 @@ class AclPoolingExecutorBuilder : public PoolingExecutorBuilder {
             return false;
         }
 
-        if (dstDescs.size() == 2u && dstDescs[1]->getPrecision() != ov::element::u32) {
+        if (dstDescs.size() == 2u && !one_of(dstDescs[1]->getPrecision(), ov::element::u32, ov::element::i32)) {
             DEBUG_LOG("AclPoolingExecutor supports U32 as indices precisions only. ",
                       "Passed indices precision: ",
                       dstDescs[1]->getPrecision());
diff --git a/src/plugins/intel_cpu/src/nodes/pooling.cpp b/src/plugins/intel_cpu/src/nodes/pooling.cpp
@@ -158,6 +158,15 @@ bool Pooling::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, st
             errorMessage = "Supported ops are MaxPool-1, MaxPool-8, MaxPool-14, AvgPool-1 and AvgPool-14";
             return false;
         }
+#if defined(OV_CPU_WITH_ACL)
+        if (ov::as_type_ptr<const ov::op::v8::MaxPool>(op) ||
+            ov::as_type_ptr<const ov::op::v14::MaxPool>(op)) {
+            if (ov::as_type_ptr<const ov::op::util::MaxPoolBase>(op)->get_kernel() != ov::Shape(2,2)) {
+                errorMessage = "Pooling indices returning source tensor coordinates is only supported for pool size 2x2";
+                return false;
+            }
+        }
+#endif
     } catch (...) {
         return false;
     }
diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/pooling.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/arm/pooling.cpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "custom/single_layer_tests/classes/pooling.hpp"
+#include "utils/cpu_test_utils.hpp"
+#include "utils/fusing_test_utils.hpp"
+#include "utils/filter_cpu_info.hpp"
+
+using namespace CPUTestUtils;
+
+namespace ov {
+namespace test {
+namespace Pooling {
+
+const std::vector<maxPoolV8SpecificParams>& paramsMaxV144D_2x2kernel = {
+        maxPoolV8SpecificParams{ {2, 2}, {2, 2}, {1, 1}, {0, 0}, {0, 0},
+                                                          ov::element::Type_t::i32, 0,
+                                                          ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::SAME_UPPER },
+        maxPoolV8SpecificParams{ {2, 2}, {2, 2}, {1, 1}, {0, 0}, {0, 0},
+                                                          ov::element::Type_t::i32, 0,
+                                                          ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::SAME_LOWER }
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_4D_2x2Kernel, MaxPoolingV14LayerCPUTest,
+                         ::testing::Combine(
+                                 ::testing::ValuesIn(paramsMaxV144D_2x2kernel),
+                                 ::testing::ValuesIn(inputShapes4D()),
+                                 ::testing::ValuesIn((inpOutPrecision())),
+                                 ::testing::ValuesIn(filterCPUInfo(vecCpuConfigsFusing_4D())),
+                                 ::testing::Values(CPUTestUtils::empty_plugin_config)),
+                         MaxPoolingV14LayerCPUTest::getTestCaseName);
+
+const std::vector<maxPoolV8SpecificParams>& paramsMaxV144D_non2x2kernel = {
+            maxPoolV8SpecificParams{ {11, 7}, {2, 2}, {1, 1}, {2, 2}, {2, 2},
+                                                            ov::element::Type_t::i32, 0,
+                                                            ov::op::RoundingType::CEIL_TORCH, ov::op::PadType::EXPLICIT},
+};
+
+// The test checks that the fallback to the nGraph reference implementation works for cases that ACL does not support
+INSTANTIATE_TEST_SUITE_P(smoke_MaxPoolV14_CPU_4D_non2x2Kernel_ref, MaxPoolingV14LayerCPUTest,
+                         ::testing::Combine(
+                                 ::testing::ValuesIn(paramsMaxV144D_non2x2kernel),
+                                 ::testing::ValuesIn(inputShapes4D()),
+                                 ::testing::ValuesIn((inpOutPrecision())),
+                                 ::testing::Values(CPUSpecificParams{{}, {}, {"ref_any"}, "ref_any"}),
+                                 ::testing::Values(CPUTestUtils::empty_plugin_config)),
+                         MaxPoolingV14LayerCPUTest::getTestCaseName);
+
+}  // namespace Pooling
+}  // namespace test
+}  // namespace ov

Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ class AclPoolingExecutorBuilder : public PoolingExecutorBuilder {`
`79`	`79`	`return false;`
`80`	`80`	`}`
`81`	`81`
`82`		`- if (dstDescs.size() == 2u && dstDescs[1]->getPrecision() != ov::element::u32) {`
	`82`	`+ if (dstDescs.size() == 2u && !one_of(dstDescs[1]->getPrecision(), ov::element::u32, ov::element::i32)) {`
`83`	`83`	`DEBUG_LOG("AclPoolingExecutor supports U32 as indices precisions only. ",`
`84`	`84`	`"Passed indices precision: ",`
`85`	`85`	`dstDescs[1]->getPrecision());`