diff --git a/src/plugins/intel_cpu/src/node.cpp b/src/plugins/intel_cpu/src/node.cpp
index 31c4a0d2a5b54d..21f38743d32679 100644
--- a/src/plugins/intel_cpu/src/node.cpp
+++ b/src/plugins/intel_cpu/src/node.cpp
@@ -281,7 +281,6 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector<impl_desc_type>& pr
                 auto parentDesc = parent_spd->getConfig().outConfs[inNum].getMemDesc();
                 const bool isCompatible = curDesc->isCompatible(*parentDesc);
-
                 if (isCompatible) {
                     equalsLocalFormatCount++;
                 }
@@ -316,6 +315,126 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector<impl_desc_type>& pr
     selectPrimitiveDescriptorByIndex(0);
 }
 
+// Returns true when at most one dimension of the shape is greater than 1, i.e. the tensor is effectively 1D.
+bool Node::isOneDimShape(const ov::PartialShape& pshape) {
+    int value_1_num = 0;
+    int sz = static_cast<int>(pshape.size());
+    for (auto s : pshape) {
+        if (s.is_static() && s.get_length() == 1) {
+            value_1_num++;
+        }
+    }
+    return value_1_num >= sz - 1;
+}
+
+// A reorder between two effectively 1D tensors of the same precision degenerates to a plain copy,
+// so it is not counted as a real reorder.
+bool Node::isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu::MemoryDescPtr desc2) {
+    bool samePrec = desc1->getPrecision() == desc2->getPrecision();
+    bool isOneDimShape1 = isOneDimShape(desc1->getShape().toPartialShape());
+    bool isOneDimShape2 = isOneDimShape(desc2->getShape().toPartialShape());
+    return !(isOneDimShape1 && isOneDimShape2 && samePrec);
+}
+
+// Selects the primitive descriptor that minimizes the estimated reorder overhead on the input edges,
+// weighting each required reorder by the element count of the corresponding static shape.
+void Node::selectPreferPrimitiveDescriptorWithShape(const std::vector<impl_desc_type>& priority,
+                                                    bool ignoreConstInputs) {
+    // Dynamic shapes fall back to the default selection logic.
+    if (isDynamic) {
+        return selectPreferPrimitiveDescriptor(priority, ignoreConstInputs);
+    }
+
+    auto estimateReorderOverhead = [&](const ov::intel_cpu::NodeDesc& supportedPrimitiveDesc, size_t i) {
+        int estimate = 0;
+        auto inputNodesNum = supportedPrimitiveDesc.getConfig().inConfs.size();
+        for (size_t j = 0; j < inputNodesNum; j++) {
+            auto parentEdge = getParentEdgeAt(j);
+            auto parentPtr = parentEdge->getParent();
+
+            // Don't take constant edges into account: reorders on them are executed once at the
+            // load-network stage.
+            if (ignoreConstInputs && j > 0 && parentPtr->isConstant()) {
+                continue;
+            }
+
+            auto parent_spd = parentPtr->getSelectedPrimitiveDescriptor();
+            if (parent_spd != nullptr && !parent_spd->getConfig().outConfs.empty()) {
+                int inNum = parentEdge->getInputNum();
+                if (inNum < 0 || inNum >= static_cast<int>(parent_spd->getConfig().outConfs.size())) {
+                    inNum = 0;
+                }
+                auto curDesc = supportedPrimitiveDesc.getConfig().inConfs[j].getMemDesc();
+                auto parentDesc = parent_spd->getConfig().outConfs[inNum].getMemDesc();
+
+                const bool isCompatible = curDesc->isCompatible(*parentDesc);
+                if (!isCompatible) {
+                    if (!isReorderRequired(parentDesc, curDesc)) {
+                        estimate += 1;
+                    } else {
+                        estimate += ov::shape_size<ov::intel_cpu::VectorDims>(curDesc->getShape().getMinDims());
+                    }
+                }
+
+                DEBUG_LOG(getName(), " pd[", i, "].inConfs[", j, "]"
+                          " is ", (isCompatible ? "compatible" : "not compatible"),
+                          " shape is ", (isOneDimShape(curDesc->getShape().toPartialShape()) ?
"one dim shape" : "not one dim shape"), + " with parent ", parentPtr->getName(), + " outConfs[", inNum, "], estimate add to ", estimate); + } + } + return estimate; + }; + + auto selectSPDwithType = [&](const impl_desc_type type) { + int selectedPrimitive = -1; + int bestEstimate = std::numeric_limits<int>::max(); + for (size_t i = 0; i < getSupportedPrimitiveDescriptors().size(); i++) { + const auto& supportedPrimitiveDesc = getSupportedPrimitiveDescriptors()[i]; + const impl_desc_type supportedType = supportedPrimitiveDesc.getImplementationType(); + if (supportedType != type) { + continue; + } + + const size_t descInConfSize = supportedPrimitiveDesc.getConfig().inConfs.size(); + + if (descInConfSize > getParentEdges().size()) { + OPENVINO_THROW(getName(), + " Desc ", + i, + " with type: ", + supportedType, + " has more input ports than node: ", + descInConfSize, + " vs ", + getParentEdges().size()); + continue; + } + + auto estimate = estimateReorderOverhead(supportedPrimitiveDesc, i); + + if (estimate < bestEstimate) { + bestEstimate = estimate; + selectedPrimitive = static_cast<int>(i); + DEBUG_LOG(getName(), " Select primitive desc: ", i, " ", supportedPrimitiveDesc); + } + } + return selectedPrimitive; + }; + + // loop kernel priority + for (auto& type : priority) { + int selectedPrimitive = selectSPDwithType(type); + if (selectedPrimitive >= 0) { + selectPrimitiveDescriptorByIndex(selectedPrimitive); + return; + } + } + + OPENVINO_ASSERT(!getSupportedPrimitiveDescriptors().empty(), + "Supported primitive descriptors list is empty for node: ", + getName(), + " type: ", + NameFromType(getType())); + + // fallback. If there are no primitives from priority list just select a first + selectPrimitiveDescriptorByIndex(0); +} + bool Node::canBeInPlace() const { // TODO [DS]: enable inPlace for dynamic shapes if (isDynamicNode()) { diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index ff8bf87d993a74..4dafc11738c5c4 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -707,6 +707,9 @@ class Node { friend class GraphOptimizer; void selectPreferPrimitiveDescriptor(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs); + void selectPreferPrimitiveDescriptorWithShape(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs); + bool isOneDimShape(const ov::PartialShape& pshape); + bool isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu::MemoryDescPtr desc2); bool isConfigDefined(const NodeConfig &config) const; virtual bool canBeInPlace() const; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index e166fc8bf453e7..de76db1befeeb7 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -507,7 +507,7 @@ void Subgraph::initSupportedPrimitiveDescriptors() { } void Subgraph::selectOptimalPrimitiveDescriptor() { - selectPreferPrimitiveDescriptor(getImplPriority(), true); + selectPreferPrimitiveDescriptorWithShape(getImplPriority(), true); } ov::element::Type Subgraph::getRuntimePrecision() const { diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/subgraph_select_pd.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/subgraph_select_pd.cpp new file mode 100644 index 00000000000000..2d44492c13c106 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/subgraph_select_pd.cpp @@ -0,0 +1,120 @@ +// Copyright (C) 
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "common_test_utils/node_builders/constant.hpp"
+#include "openvino/opsets/opset8.hpp"
+#include "shared_test_classes/base/ov_subgraph.hpp"
+#include "utils/cpu_test_utils.hpp"
+
+namespace ov {
+namespace test {
+
+/*
+   input1(f32_abcd_{1,64,32,32})                                 input2(f16_abcd_{1,128,1,1})
+        |                                                                  |
+   Reorder(f32_acdb_{1,64,32,32})           const                     Convert(f32_abcd_{1,128,1,1})
+        |                                     /                            |
+        |                                    /                             |
+   Convolution(f32_acdb_{1,1,30,30})   Range_1520    VariadicSplit(f32_abcd_{1,64,1,1}, f32_abcd_{1,64,1,1})
+        |                              /                    \                         /
+        |                             /                      \                       /
+        |                            /                        \                     /
+        |                           /                          \                   /
+   MVN(f32_acdb_{1,1,30,30})     Reorder1(f32_acdb_{1,64,1,1})    Reorder2(f32_acdb_{1,64,1,1})
+        \                            /                               /
+         \                          /                               /
+          \                        /                               /
+           \                      /                               /
+            Subgraph(f32_acdb_{1,64,30,30})
+                       |
+                       |
+            Convolution(f32_acdb_{1,1,28,28})
+                       |
+                     Result
+
+   The Subgraph node has 3 inputs, and they do not all have the same layout.
+   Expected: a Reorder is inserted after VariadicSplit[0] and VariadicSplit[1], but not after MVN,
+   because the VariadicSplit outputs are effectively one-dimensional ([1,64,1,1], 64 elements), so
+   reordering them is far cheaper than reordering the MVN output ([1,1,30,30], 900 elements).
+*/
+
+class SubgraphSelectPD : virtual public SubgraphBaseStaticTest {
+protected:
+    void SetUp() override {
+        targetDevice = ov::test::utils::DEVICE_CPU;
+        abs_threshold = 2e-2;
+
+        auto type = element::f32;
+        constexpr int const1 = 32;
+        auto input1 = std::make_shared<ov::opset8::Parameter>(type, Shape{1, const1 / 2, 8, 8});
+        input1->set_friendly_name("input1");
+        auto input2 = std::make_shared<ov::opset8::Parameter>(type, Shape{1, const1, 1, 1});
+        input2->set_friendly_name("input2");
+
+        auto variadicSplit = std::make_shared<ov::op::v1::VariadicSplit>(
+            input2,
+            ov::opset8::Constant::create(element::i64, Shape{1}, {1}),
+            ov::opset8::Constant::create(element::i64, Shape{2}, {const1 / 2, const1 / 2}));
+        variadicSplit->set_friendly_name("variadicSplit");
+
+        auto add1 = std::make_shared<ov::opset8::Add>(variadicSplit->output(0),
+                                                      ov::opset8::Constant::create(type, Shape{1}, {0}));
+        add1->set_friendly_name("add1");
+        auto shapeof = std::make_shared<ov::opset8::ShapeOf>(input1);
+        auto rankof = std::make_shared<ov::opset8::ShapeOf>(shapeof);
+        auto squeeze =
+            std::make_shared<ov::opset8::Squeeze>(rankof, ov::opset8::Constant::create(element::i64, Shape{1}, {0}));
+
+        auto range = std::make_shared<ov::opset8::Range>(ov::opset8::Constant::create(element::i64, Shape{}, {2}),
+                                                         squeeze,
+                                                         ov::opset8::Constant::create(element::i64, Shape{}, {1}),
+                                                         ov::element::i64);
+        auto create_conv = [&](const std::shared_ptr<ov::Node>& input_node) {
+            auto conv = std::make_shared<ov::opset8::Convolution>(
+                input_node,
+                ov::test::utils::make_constant(type,
+                                               Shape{1, const1 / 2u, 3, 3},
+                                               ov::test::utils::InputGenerateData(0, 1)),
+                Strides{1, 1},
+                CoordinateDiff{1, 1},
+                CoordinateDiff{1, 1},
+                Strides{1, 1});
+            conv->get_rt_info() =
+                CPUTestUtils::CPUTestsBase::makeCPUInfo({CPUTestUtils::nhwc}, {CPUTestUtils::nhwc}, {});
+            return conv;
+        };
+        auto create_relu = [&](const std::shared_ptr<ov::Node>& input_node) {
+            return std::make_shared<ov::opset8::PRelu>(input_node,
+                                                       ov::opset8::Constant::create(element::f32, Shape{1}, {1}));
+        };
+        auto conv1 = create_conv(input1);
+        auto mvn =
+            std::make_shared<ov::opset8::MVN>(create_relu(conv1), range, false, 0.1, op::MVNEpsMode::INSIDE_SQRT);
+        auto mul = std::make_shared<ov::opset8::Multiply>(create_relu(add1), mvn);
+        auto add2 = std::make_shared<ov::opset8::Add>(variadicSplit->output(1), mul);
+        auto conv2 = create_conv(create_relu(add2));
+        conv2->set_friendly_name("conv2");
+
+        function = std::make_shared<ov::Model>(conv2, ParameterVector{input1, input2});
+    }
+
+    void TearDown() override {
+        auto runtime_function = compiledModel.get_runtime_model();
+        int nodes_found = 0;
+        for (const auto& n : runtime_function->get_ordered_ops()) {
+            auto layer_type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
+            if (layer_type == "Subgraph") {
+                nodes_found++;
+                auto output_layout = n->get_rt_info().at(ov::exec_model_info::OUTPUT_LAYOUTS).as<std::string>();
+                // The optimal choice is the 'nhwc' ('acdb') layout.
+                ASSERT_EQ(output_layout, "acdb");
+            }
+        }
+        ASSERT_GT(nodes_found, 0);
+    }
+};
+
+TEST_F(SubgraphSelectPD, smoke_CompareWithRefs) {
+    run();
+}
+
+} // namespace test
+} // namespace ov
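
The selection heuristic introduced above is easy to sanity-check outside the plugin. Below is a minimal, self-contained sketch of the same idea written with plain STL types; `Shape`, `PortDesc`, `reorderCost`, and `pickDescriptor` are illustrative names invented for this sketch and are not part of the intel_cpu API (the real code operates on `NodeDesc`/`MemoryDesc` objects as shown in the diff). On the shapes from the test graph it reproduces the expected decision: the acdb candidate pays 1 + 1 for the two effectively one-dimensional VariadicSplit inputs, while the abcd candidate would pay 1*1*30*30 = 900 to reorder the MVN output, so acdb wins.

// Illustrative sketch only: simplified stand-ins for the plugin's descriptor types.
#include <cstddef>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

using Shape = std::vector<size_t>;

// A tensor is "effectively one-dimensional" when at most one dimension is greater than 1.
bool isOneDimShape(const Shape& shape) {
    size_t ones = 0;
    for (size_t d : shape) {
        if (d == 1) {
            ones++;
        }
    }
    return ones + 1 >= shape.size();
}

// Simplified stand-in for a port's memory descriptor.
struct PortDesc {
    Shape shape;
    std::string layout;     // e.g. "abcd" (planar) or "acdb" (channels-last)
    std::string precision;  // e.g. "f32"
};

// Cost of feeding `produced` into a port that expects `expected`:
//   0             - already compatible, no reorder needed
//   1             - layouts differ, but both tensors are effectively 1D with the same precision,
//                   so the reorder degenerates to a cheap copy
//   element count - a real reorder, proportional to the tensor size
size_t reorderCost(const PortDesc& produced, const PortDesc& expected) {
    if (produced.layout == expected.layout && produced.precision == expected.precision) {
        return 0;
    }
    if (isOneDimShape(produced.shape) && isOneDimShape(expected.shape) && produced.precision == expected.precision) {
        return 1;
    }
    size_t elements = 1;
    for (size_t d : expected.shape) {
        elements *= d;
    }
    return elements;
}

// Pick the candidate descriptor whose input ports minimize the total estimated reorder cost.
size_t pickDescriptor(const std::vector<std::vector<PortDesc>>& candidates, const std::vector<PortDesc>& parents) {
    size_t best = 0;
    size_t bestCost = std::numeric_limits<size_t>::max();
    for (size_t i = 0; i < candidates.size(); i++) {
        size_t cost = 0;
        for (size_t j = 0; j < parents.size(); j++) {
            cost += reorderCost(parents[j], candidates[i][j]);
        }
        std::cout << "candidate " << i << " estimated reorder cost: " << cost << "\n";
        if (cost < bestCost) {
            bestCost = cost;
            best = i;
        }
    }
    return best;
}

int main() {
    // What the Subgraph in the test receives: the MVN output in acdb, two VariadicSplit outputs in abcd.
    const std::vector<PortDesc> parents = {{{1, 1, 30, 30}, "acdb", "f32"},
                                           {{1, 64, 1, 1}, "abcd", "f32"},
                                           {{1, 64, 1, 1}, "abcd", "f32"}};
    // Candidate 0 expects every input in acdb, candidate 1 expects every input in abcd.
    const std::vector<std::vector<PortDesc>> candidates = {
        {{{1, 1, 30, 30}, "acdb", "f32"}, {{1, 64, 1, 1}, "acdb", "f32"}, {{1, 64, 1, 1}, "acdb", "f32"}},
        {{{1, 1, 30, 30}, "abcd", "f32"}, {{1, 64, 1, 1}, "abcd", "f32"}, {{1, 64, 1, 1}, "abcd", "f32"}}};
    std::cout << "selected candidate: " << pickDescriptor(candidates, parents) << "\n";  // prints 0 (acdb)
    return 0;
}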