
Commit 507f31d

[CPU] The shape of the memory descriptor is considered in selectPreferPrimitiveDescriptor of Subgraph (#23971)
### Details:
- selectPreferPrimitiveDescriptor takes the shapes of the memory descriptors into account, because the reorder of a tensor with a scalar-like shape requires less computation
- The new logic is only used for Subgraph

### Tickets:
- 137307
- 139904

Signed-off-by: xipingya <xiping.yan@intel.com>
Signed-off-by: Yan <xiping.yan@intel.com>
1 parent aece8c6 commit 507f31d
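For a sense of the cost asymmetry the commit message refers to, here is a small illustration (added for this write-up, not part of the commit): the work a reorder does grows with the number of elements it has to move, so a near-scalar tensor is far cheaper to reorder than a full feature map. The shapes are taken from the diagram in the new test file below.

// Editor's illustration (not commit code): element counts a reorder must move.
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

static int64_t element_count(const std::vector<int64_t>& dims) {
    return std::accumulate(dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>());
}

int main() {
    // Near-scalar tensor vs. full feature map.
    std::cout << "{1,64,1,1}   -> " << element_count({1, 64, 1, 1}) << " elements\n";    // 64
    std::cout << "{1,64,32,32} -> " << element_count({1, 64, 32, 32}) << " elements\n";  // 65536
}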

4 files changed: +244 -2 lines changed


src/plugins/intel_cpu/src/node.cpp

+120 -1
@@ -281,7 +281,6 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs) {
                 auto parentDesc = parent_spd->getConfig().outConfs[inNum].getMemDesc();
 
                 const bool isCompatible = curDesc->isCompatible(*parentDesc);
-
                 if (isCompatible) {
                     equalsLocalFormatCount++;
                 }
@@ -316,6 +315,126 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs) {
     selectPrimitiveDescriptorByIndex(0);
 }
 
+bool Node::isOneDimShape(const ov::PartialShape& pshape) {
+    int value_1_num = 0;
+    int sz = static_cast<int>(pshape.size());
+    for (auto s : pshape) {
+        if (s.is_static() && s.get_length() == 1) {
+            value_1_num++;
+        }
+    }
+    return value_1_num >= sz - 1;
+}
+
+bool Node::isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu::MemoryDescPtr desc2) {
+    bool samePrec = desc1->getPrecision() == desc2->getPrecision();
+    bool isOneDimShape1 = isOneDimShape(desc1->getShape().toPartialShape());
+    bool isOneDimShape2 = isOneDimShape(desc2->getShape().toPartialShape());
+    return !(isOneDimShape1 && isOneDimShape2 && samePrec);
+}
+
+void Node::selectPreferPrimitiveDescriptorWithShape(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs) {
+    // Filter out dynamic shape.
+    if (isDynamic) {
+        return selectPreferPrimitiveDescriptor(priority, ignoreConstInputs);
+    }
+
+    auto estimateReorderOverhead = [&](const ov::intel_cpu::NodeDesc& supportedPrimitiveDesc, size_t i) {
+        int estimate = 0;
+        auto inputNodesNum = supportedPrimitiveDesc.getConfig().inConfs.size();
+        for (size_t j = 0; j < inputNodesNum; j++) {
+            auto parentEdge = getParentEdgeAt(j);
+            auto parentPtr = parentEdge->getParent();
+
+            // We don't take into account constant edges since reorders on them will be executed on load network
+            // stage
+            if (ignoreConstInputs && j > 0 && parentPtr->isConstant()) {
+                continue;
+            }
+
+            auto parent_spd = parentPtr->getSelectedPrimitiveDescriptor();
+            if (parent_spd != nullptr && !parent_spd->getConfig().outConfs.empty()) {
+                int inNum = parentEdge->getInputNum();
+                if (inNum < 0 || inNum >= static_cast<int>(parent_spd->getConfig().outConfs.size())) {
+                    inNum = 0;
+                }
+                auto curDesc = supportedPrimitiveDesc.getConfig().inConfs[j].getMemDesc();
+                auto parentDesc = parent_spd->getConfig().outConfs[inNum].getMemDesc();
+
+                const bool isCompatible = curDesc->isCompatible(*parentDesc);
+                if (!isCompatible) {
+                    if (!isReorderRequired(parentDesc, curDesc)) {
+                        estimate += 1;
+                    } else {
+                        estimate += ov::shape_size<ov::intel_cpu::VectorDims>(curDesc->getShape().getMinDims());
+                    }
+                }
+
+                DEBUG_LOG(getName(), " pd[", i, "].inConfs[", j, "]"
+                          " is ", (isCompatible ? "compatible" : "not compatible"),
+                          " shape is ", (isOneDimShape(curDesc->getShape().toPartialShape()) ? "one dim shape" : "not one dim shape"),
+                          " with parent ", parentPtr->getName(),
+                          " outConfs[", inNum, "], estimate add to ", estimate);
+            }
+        }
+        return estimate;
+    };
+
+    auto selectSPDwithType = [&](const impl_desc_type type) {
+        int selectedPrimitive = -1;
+        int bestEstimate = std::numeric_limits<int>::max();
+        for (size_t i = 0; i < getSupportedPrimitiveDescriptors().size(); i++) {
+            const auto& supportedPrimitiveDesc = getSupportedPrimitiveDescriptors()[i];
+            const impl_desc_type supportedType = supportedPrimitiveDesc.getImplementationType();
+            if (supportedType != type) {
+                continue;
+            }
+
+            const size_t descInConfSize = supportedPrimitiveDesc.getConfig().inConfs.size();
+
+            if (descInConfSize > getParentEdges().size()) {
+                OPENVINO_THROW(getName(),
+                               " Desc ",
+                               i,
+                               " with type: ",
+                               supportedType,
+                               " has more input ports than node: ",
+                               descInConfSize,
+                               " vs ",
+                               getParentEdges().size());
+                continue;
+            }
+
+            auto estimate = estimateReorderOverhead(supportedPrimitiveDesc, i);
+
+            if (estimate < bestEstimate) {
+                bestEstimate = estimate;
+                selectedPrimitive = static_cast<int>(i);
+                DEBUG_LOG(getName(), " Select primitive desc: ", i, " ", supportedPrimitiveDesc);
+            }
+        }
+        return selectedPrimitive;
+    };
+
+    // loop kernel priority
+    for (auto& type : priority) {
+        int selectedPrimitive = selectSPDwithType(type);
+        if (selectedPrimitive >= 0) {
+            selectPrimitiveDescriptorByIndex(selectedPrimitive);
+            return;
+        }
+    }
+
+    OPENVINO_ASSERT(!getSupportedPrimitiveDescriptors().empty(),
+                    "Supported primitive descriptors list is empty for node: ",
+                    getName(),
+                    " type: ",
+                    NameFromType(getType()));
+
+    // fallback. If there are no primitives from priority list just select a first
+    selectPrimitiveDescriptorByIndex(0);
+}
+
 bool Node::canBeInPlace() const {
     // TODO [DS]: enable inPlace for dynamic shapes
     if (isDynamicNode()) {
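
To make the new selection heuristic concrete, here is a minimal, self-contained sketch (an editor's illustration, not commit code). It stands in for estimateReorderOverhead above, replacing memory descriptors with plain layout strings and dim vectors and ignoring precision; the shapes follow the diagram in the new test file.

// Editor's sketch (not commit code): a simplified stand-in for the estimate
// computed by estimateReorderOverhead, using plain dim vectors instead of
// memory descriptors and ignoring precision.
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

using Dims = std::vector<int64_t>;

// Same rule as Node::isOneDimShape: at most one dimension differs from 1.
static bool is_one_dim_shape(const Dims& dims) {
    int ones = 0;
    for (int64_t d : dims)
        if (d == 1)
            ++ones;
    return ones >= static_cast<int>(dims.size()) - 1;
}

static int64_t shape_size(const Dims& dims) {
    return std::accumulate(dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>());
}

struct Input {
    std::string parent_layout;  // layout the parent's selected descriptor produces
    Dims dims;
};

// For a candidate input layout, every mismatching input needs a reorder:
// a near-scalar tensor adds a token cost of 1, anything else adds its element
// count (mirroring the isReorderRequired / shape_size branch above).
static int64_t estimate(const std::string& candidate_layout, const std::vector<Input>& inputs) {
    int64_t cost = 0;
    for (const auto& in : inputs) {
        if (in.parent_layout == candidate_layout)
            continue;  // compatible descriptors, no reorder needed
        cost += is_one_dim_shape(in.dims) ? 1 : shape_size(in.dims);
    }
    return cost;
}

int main() {
    // Subgraph inputs: one nhwc MVN output and two nchw VariadicSplit outputs.
    const std::vector<Input> inputs = {{"acdb", {1, 1, 30, 30}},
                                       {"abcd", {1, 64, 1, 1}},
                                       {"abcd", {1, 64, 1, 1}}};
    std::cout << "estimate(abcd) = " << estimate("abcd", inputs) << "\n";  // 900: reorder the MVN output
    std::cout << "estimate(acdb) = " << estimate("acdb", inputs) << "\n";  // 2: reorder the near-scalar splits
}

Under this simplified model the acdb (nhwc) candidate wins (estimate 2 vs. 900), which matches the expectation encoded in the test below: the reorders land on the two near-scalar VariadicSplit outputs rather than on the MVN output.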

src/plugins/intel_cpu/src/node.h

+3
@@ -715,6 +715,9 @@ class Node {
     friend class GraphOptimizer;
 
     void selectPreferPrimitiveDescriptor(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs);
+    void selectPreferPrimitiveDescriptorWithShape(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs);
+    bool isOneDimShape(const ov::PartialShape& pshape);
+    bool isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu::MemoryDescPtr desc2);
     bool isConfigDefined(const NodeConfig &config) const;
     virtual bool canBeInPlace() const;

src/plugins/intel_cpu/src/nodes/subgraph.cpp

+1 -1
@@ -508,7 +508,7 @@ void Subgraph::initSupportedPrimitiveDescriptors() {
 }
 
 void Subgraph::selectOptimalPrimitiveDescriptor() {
-    selectPreferPrimitiveDescriptor(getImplPriority(), true);
+    selectPreferPrimitiveDescriptorWithShape(getImplPriority(), true);
 }
 
 ov::element::Type Subgraph::getRuntimePrecision() const {
@@ -0,0 +1,120 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "common_test_utils/node_builders/constant.hpp"
#include "openvino/opsets/opset8.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"
#include "utils/cpu_test_utils.hpp"

namespace ov {
namespace test {

/*
    input1(f32_abcd_{1,64,32,32})             input2(f16_abcd_{1,128,1,1})
           |                                              |
    Reorder(f32_acdb_{1,64,32,32})    const       Convert(f32_abcd_{1,128,1,1})
           |                         /                    |
    Convolution(f32_acdb_{1,1,30,30})   Range_1520   VariadicSplit(f32_abcd_{1,64,1,1}, f32_abcd_{1,64,1,1})
           |                       /                     /                \
    MVN(f32_acdb_{1,1,30,30})            Reorder1(f32_acdb_{1,64,1,1})   Reorder2(f32_acdb_{1,64,1,1})
                \                                  |                      /
                 \                                 |                     /
                  Subgraph(f32_acdb_{1,64,30,30})
                            |
                  Convolution(f32_acdb_{1,1,28,28})
                            |
                          Result

    The Subgraph node has 3 inputs, and they do not share the same layout.
    Expected: Reorders are inserted after VariadicSplit[0] and VariadicSplit[1], not after MVN,
    because the VariadicSplit outputs have a scalar-like shape ([1,64,1,1]), so their reorders
    require less computation.
*/

class SubgraphSelectPD : virtual public SubgraphBaseStaticTest {
protected:
    void SetUp() override {
        targetDevice = ov::test::utils::DEVICE_CPU;
        abs_threshold = 2e-2;

        auto type = element::f32;
        constexpr int const1 = 32;
        auto input1 = std::make_shared<ov::opset8::Parameter>(type, Shape{1, const1 / 2, 8, 8});
        input1->set_friendly_name("input1");
        auto input2 = std::make_shared<ov::opset8::Parameter>(type, Shape{1, const1, 1, 1});
        input2->set_friendly_name("input2");

        auto variadicSplit = std::make_shared<ov::op::v1::VariadicSplit>(
            input2,
            ov::opset8::Constant::create(element::i64, Shape{1}, {1}),
            ov::opset8::Constant::create(element::i64, Shape{2}, {const1 / 2, const1 / 2}));
        variadicSplit->set_friendly_name("variadicSplit");

        auto add1 = std::make_shared<ov::opset8::Add>(variadicSplit->output(0),
                                                      ov::opset8::Constant::create(type, Shape{1}, {0}));
        add1->set_friendly_name("add1");
        auto shapeof = std::make_shared<ov::opset8::ShapeOf>(input1);
        auto rankof = std::make_shared<ov::opset8::ShapeOf>(shapeof);
        auto squeeze =
            std::make_shared<ov::opset8::Squeeze>(rankof, ov::opset8::Constant::create(element::i64, Shape{1}, {0}));

        auto range = std::make_shared<ov::opset8::Range>(ov::opset8::Constant::create(element::i64, Shape{}, {2}),
                                                         squeeze,
                                                         ov::opset8::Constant::create(element::i64, Shape{}, {1}),
                                                         ov::element::i64);
        auto create_conv = [&](const std::shared_ptr<ov::Node>& input_node) {
            ov::test::utils::InputGenerateData in_gen_data(0, 1);
            auto conv = std::make_shared<ov::opset8::Convolution>(
                input_node,
                ov::test::utils::make_constant(type, Shape{1, const1 / 2u, 3, 3}, ov::test::utils::InputGenerateData(0, 1)),
                Strides{1, 1},
                CoordinateDiff{1, 1},
                CoordinateDiff{1, 1},
                Strides{1, 1});
            conv->get_rt_info() =
                CPUTestUtils::CPUTestsBase::makeCPUInfo({CPUTestUtils::nhwc}, {CPUTestUtils::nhwc}, {});
            return conv;
        };
        auto create_relu = [&](const std::shared_ptr<ov::Node>& input_node) {
            return std::make_shared<ov::opset8::PRelu>(input_node,
                                                       ov::opset8::Constant::create(element::f32, Shape{1}, {1}));
        };
        auto conv1 = create_conv(input1);
        auto mvn =
            std::make_shared<ov::opset8::MVN>(create_relu(conv1), range, false, 0.1, op::MVNEpsMode::INSIDE_SQRT);
        auto mul = std::make_shared<ov::opset8::Multiply>(create_relu(add1), mvn);
        auto add2 = std::make_shared<ov::opset8::Add>(variadicSplit->output(1), mul);
        auto conv2 = create_conv(create_relu(add2));
        conv2->set_friendly_name("conv2");

        function = std::make_shared<ov::Model>(conv2, ParameterVector{input1, input2});
    }

    void TearDown() override {
        auto runtime_function = compiledModel.get_runtime_model();
        int nodes_found = 0;
        for (const auto& n : runtime_function->get_ordered_ops()) {
            auto layer_type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
            if (layer_type == "Subgraph") {
                nodes_found++;
                auto output_layout = n->get_rt_info().at(ov::exec_model_info::OUTPUT_LAYOUTS).as<std::string>();
                // The optimal choice should be 'nhwc' (acdb).
                ASSERT_EQ(output_layout, "acdb");
            }
        }
        ASSERT_GT(nodes_found, 0);
    }
};

TEST_F(SubgraphSelectPD, smoke_CompareWithRefs) {
    run();
}

}  // namespace test
}  // namespace ov
