
Commit 507f31d

[CPU] The shape of the memory descriptor is considered in selectPreferPrimitiveDescriptor of Subgraph (#23971)
### Details:
- selectPreferPrimitiveDescriptor takes the shapes of the memory descriptors into account, because the reorder of a tensor with a scalar-like shape requires less computation
- The new logic is only used for Subgraph

### Tickets:
- 137307
- 139904

Signed-off-by: xipingya <xiping.yan@intel.com>
Signed-off-by: Yan <xiping.yan@intel.com>
1 parent aece8c6 commit 507f31d
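For a sense of the cost asymmetry the commit message refers to, here is a small illustration (added for this write-up, not part of the commit): the work a reorder does grows with the number of elements it has to move, so a near-scalar tensor is far cheaper to reorder than a full feature map. The shapes are taken from the diagram in the new test file below.

// Editor's illustration (not commit code): element counts a reorder must move.
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

static int64_t element_count(const std::vector<int64_t>& dims) {
    return std::accumulate(dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>());
}

int main() {
    // Near-scalar tensor vs. full feature map.
    std::cout << "{1,64,1,1}   -> " << element_count({1, 64, 1, 1}) << " elements\n";    // 64
    std::cout << "{1,64,32,32} -> " << element_count({1, 64, 32, 32}) << " elements\n";  // 65536
}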

4 files changed: +244 -2 lines changed


src/plugins/intel_cpu/src/node.cpp

+120 -1
@@ -281,7 +281,6 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs) {
                 auto parentDesc = parent_spd->getConfig().outConfs[inNum].getMemDesc();
 
                 const bool isCompatible = curDesc->isCompatible(*parentDesc);
-
                 if (isCompatible) {
                     equalsLocalFormatCount++;
                 }
@@ -316,6 +315,126 @@ void Node::selectPreferPrimitiveDescriptor(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs) {
     selectPrimitiveDescriptorByIndex(0);
 }
 
+bool Node::isOneDimShape(const ov::PartialShape& pshape) {
+    int value_1_num = 0;
+    int sz = static_cast<int>(pshape.size());
+    for (auto s : pshape) {
+        if (s.is_static() && s.get_length() == 1) {
+            value_1_num++;
+        }
+    }
+    return value_1_num >= sz - 1;
+}
+
+bool Node::isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu::MemoryDescPtr desc2) {
+    bool samePrec = desc1->getPrecision() == desc2->getPrecision();
+    bool isOneDimShape1 = isOneDimShape(desc1->getShape().toPartialShape());
+    bool isOneDimShape2 = isOneDimShape(desc2->getShape().toPartialShape());
+    return !(isOneDimShape1 && isOneDimShape2 && samePrec);
+}
+
+void Node::selectPreferPrimitiveDescriptorWithShape(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs) {
+    // Filter out dynamic shape.
+    if (isDynamic) {
+        return selectPreferPrimitiveDescriptor(priority, ignoreConstInputs);
+    }
+
+    auto estimateReorderOverhead = [&](const ov::intel_cpu::NodeDesc& supportedPrimitiveDesc, size_t i) {
+        int estimate = 0;
+        auto inputNodesNum = supportedPrimitiveDesc.getConfig().inConfs.size();
+        for (size_t j = 0; j < inputNodesNum; j++) {
+            auto parentEdge = getParentEdgeAt(j);
+            auto parentPtr = parentEdge->getParent();
+
+            // We don't take into account constant edges since reorders on them will be executed on load network
+            // stage
+            if (ignoreConstInputs && j > 0 && parentPtr->isConstant()) {
+                continue;
+            }
+
+            auto parent_spd = parentPtr->getSelectedPrimitiveDescriptor();
+            if (parent_spd != nullptr && !parent_spd->getConfig().outConfs.empty()) {
+                int inNum = parentEdge->getInputNum();
+                if (inNum < 0 || inNum >= static_cast<int>(parent_spd->getConfig().outConfs.size())) {
+                    inNum = 0;
+                }
+                auto curDesc = supportedPrimitiveDesc.getConfig().inConfs[j].getMemDesc();
+                auto parentDesc = parent_spd->getConfig().outConfs[inNum].getMemDesc();
+
+                const bool isCompatible = curDesc->isCompatible(*parentDesc);
+                if (!isCompatible) {
+                    if (!isReorderRequired(parentDesc, curDesc)) {
+                        estimate += 1;
+                    } else {
+                        estimate += ov::shape_size<ov::intel_cpu::VectorDims>(curDesc->getShape().getMinDims());
+                    }
+                }
+
+                DEBUG_LOG(getName(), " pd[", i, "].inConfs[", j, "]"
+                          " is ", (isCompatible ? "compatible" : "not compatible"),
+                          " shape is ", (isOneDimShape(curDesc->getShape().toPartialShape()) ? "one dim shape" : "not one dim shape"),
+                          " with parent ", parentPtr->getName(),
+                          " outConfs[", inNum, "], estimate add to ", estimate);
+            }
+        }
+        return estimate;
+    };
+
+    auto selectSPDwithType = [&](const impl_desc_type type) {
+        int selectedPrimitive = -1;
+        int bestEstimate = std::numeric_limits<int>::max();
+        for (size_t i = 0; i < getSupportedPrimitiveDescriptors().size(); i++) {
+            const auto& supportedPrimitiveDesc = getSupportedPrimitiveDescriptors()[i];
+            const impl_desc_type supportedType = supportedPrimitiveDesc.getImplementationType();
+            if (supportedType != type) {
+                continue;
+            }
+
+            const size_t descInConfSize = supportedPrimitiveDesc.getConfig().inConfs.size();
+
+            if (descInConfSize > getParentEdges().size()) {
+                OPENVINO_THROW(getName(),
+                               " Desc ",
+                               i,
+                               " with type: ",
+                               supportedType,
+                               " has more input ports than node: ",
+                               descInConfSize,
+                               " vs ",
+                               getParentEdges().size());
+                continue;
+            }
+
+            auto estimate = estimateReorderOverhead(supportedPrimitiveDesc, i);
+
+            if (estimate < bestEstimate) {
+                bestEstimate = estimate;
+                selectedPrimitive = static_cast<int>(i);
+                DEBUG_LOG(getName(), " Select primitive desc: ", i, " ", supportedPrimitiveDesc);
+            }
+        }
+        return selectedPrimitive;
+    };
+
+    // loop kernel priority
+    for (auto& type : priority) {
+        int selectedPrimitive = selectSPDwithType(type);
+        if (selectedPrimitive >= 0) {
+            selectPrimitiveDescriptorByIndex(selectedPrimitive);
+            return;
+        }
+    }
+
+    OPENVINO_ASSERT(!getSupportedPrimitiveDescriptors().empty(),
+                    "Supported primitive descriptors list is empty for node: ",
+                    getName(),
+                    " type: ",
+                    NameFromType(getType()));
+
+    // fallback. If there are no primitives from priority list just select a first
+    selectPrimitiveDescriptorByIndex(0);
+}
+
 bool Node::canBeInPlace() const {
     // TODO [DS]: enable inPlace for dynamic shapes
     if (isDynamicNode()) {
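
To make the new selection heuristic concrete, here is a minimal, self-contained sketch (an editor's illustration, not commit code). It stands in for estimateReorderOverhead above, replacing memory descriptors with plain layout strings and dim vectors and ignoring precision; the shapes follow the diagram in the new test file.

// Editor's sketch (not commit code): a simplified stand-in for the estimate
// computed by estimateReorderOverhead, using plain dim vectors instead of
// memory descriptors and ignoring precision.
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

using Dims = std::vector<int64_t>;

// Same rule as Node::isOneDimShape: at most one dimension differs from 1.
static bool is_one_dim_shape(const Dims& dims) {
    int ones = 0;
    for (int64_t d : dims)
        if (d == 1)
            ++ones;
    return ones >= static_cast<int>(dims.size()) - 1;
}

static int64_t shape_size(const Dims& dims) {
    return std::accumulate(dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>());
}

struct Input {
    std::string parent_layout;  // layout the parent's selected descriptor produces
    Dims dims;
};

// For a candidate input layout, every mismatching input needs a reorder:
// a near-scalar tensor adds a token cost of 1, anything else adds its element
// count (mirroring the isReorderRequired / shape_size branch above).
static int64_t estimate(const std::string& candidate_layout, const std::vector<Input>& inputs) {
    int64_t cost = 0;
    for (const auto& in : inputs) {
        if (in.parent_layout == candidate_layout)
            continue;  // compatible descriptors, no reorder needed
        cost += is_one_dim_shape(in.dims) ? 1 : shape_size(in.dims);
    }
    return cost;
}

int main() {
    // Subgraph inputs: one nhwc MVN output and two nchw VariadicSplit outputs.
    const std::vector<Input> inputs = {{"acdb", {1, 1, 30, 30}},
                                       {"abcd", {1, 64, 1, 1}},
                                       {"abcd", {1, 64, 1, 1}}};
    std::cout << "estimate(abcd) = " << estimate("abcd", inputs) << "\n";  // 900: reorder the MVN output
    std::cout << "estimate(acdb) = " << estimate("acdb", inputs) << "\n";  // 2: reorder the near-scalar splits
}

Under this simplified model the acdb (nhwc) candidate wins (estimate 2 vs. 900), which matches the expectation encoded in the test below: the reorders land on the two near-scalar VariadicSplit outputs rather than on the MVN output.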

src/plugins/intel_cpu/src/node.h

+3
@@ -715,6 +715,9 @@ class Node {
     friend class GraphOptimizer;
 
     void selectPreferPrimitiveDescriptor(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs);
+    void selectPreferPrimitiveDescriptorWithShape(const std::vector<impl_desc_type>& priority, bool ignoreConstInputs);
+    bool isOneDimShape(const ov::PartialShape& pshape);
+    bool isReorderRequired(ov::intel_cpu::MemoryDescPtr desc1, ov::intel_cpu::MemoryDescPtr desc2);
     bool isConfigDefined(const NodeConfig &config) const;
     virtual bool canBeInPlace() const;

src/plugins/intel_cpu/src/nodes/subgraph.cpp

+1 -1
@@ -508,7 +508,7 @@ void Subgraph::initSupportedPrimitiveDescriptors() {
 }
 
 void Subgraph::selectOptimalPrimitiveDescriptor() {
-    selectPreferPrimitiveDescriptor(getImplPriority(), true);
+    selectPreferPrimitiveDescriptorWithShape(getImplPriority(), true);
 }
 
 ov::element::Type Subgraph::getRuntimePrecision() const {
@@ -0,0 +1,120 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "common_test_utils/node_builders/constant.hpp"
#include "openvino/opsets/opset8.hpp"
#include "shared_test_classes/base/ov_subgraph.hpp"
#include "utils/cpu_test_utils.hpp"

namespace ov {
namespace test {

/*
    input1(f32_abcd_{1,64,32,32})             input2(f16_abcd_{1,128,1,1})
           |                                              |
    Reorder(f32_acdb_{1,64,32,32})    const       Convert(f32_abcd_{1,128,1,1})
           |                         /                    |
    Convolution(f32_acdb_{1,1,30,30})   Range_1520   VariadicSplit(f32_abcd_{1,64,1,1}, f32_abcd_{1,64,1,1})
           |                       /                     /                \
    MVN(f32_acdb_{1,1,30,30})            Reorder1(f32_acdb_{1,64,1,1})   Reorder2(f32_acdb_{1,64,1,1})
                \                                  |                      /
                 \                                 |                     /
                  Subgraph(f32_acdb_{1,64,30,30})
                            |
                  Convolution(f32_acdb_{1,1,28,28})
                            |
                          Result

    The Subgraph node has 3 inputs, and they do not share the same layout.
    Expected: Reorders are inserted after VariadicSplit[0] and VariadicSplit[1], not after MVN,
    because the VariadicSplit outputs have a scalar-like shape ([1,64,1,1]), so their reorders
    require less computation.
*/

class SubgraphSelectPD : virtual public SubgraphBaseStaticTest {
protected:
    void SetUp() override {
        targetDevice = ov::test::utils::DEVICE_CPU;
        abs_threshold = 2e-2;

        auto type = element::f32;
        constexpr int const1 = 32;
        auto input1 = std::make_shared<ov::opset8::Parameter>(type, Shape{1, const1 / 2, 8, 8});
        input1->set_friendly_name("input1");
        auto input2 = std::make_shared<ov::opset8::Parameter>(type, Shape{1, const1, 1, 1});
        input2->set_friendly_name("input2");

        auto variadicSplit = std::make_shared<ov::op::v1::VariadicSplit>(
            input2,
            ov::opset8::Constant::create(element::i64, Shape{1}, {1}),
            ov::opset8::Constant::create(element::i64, Shape{2}, {const1 / 2, const1 / 2}));
        variadicSplit->set_friendly_name("variadicSplit");

        auto add1 = std::make_shared<ov::opset8::Add>(variadicSplit->output(0),
                                                      ov::opset8::Constant::create(type, Shape{1}, {0}));
        add1->set_friendly_name("add1");
        auto shapeof = std::make_shared<ov::opset8::ShapeOf>(input1);
        auto rankof = std::make_shared<ov::opset8::ShapeOf>(shapeof);
        auto squeeze =
            std::make_shared<ov::opset8::Squeeze>(rankof, ov::opset8::Constant::create(element::i64, Shape{1}, {0}));

        auto range = std::make_shared<ov::opset8::Range>(ov::opset8::Constant::create(element::i64, Shape{}, {2}),
                                                         squeeze,
                                                         ov::opset8::Constant::create(element::i64, Shape{}, {1}),
                                                         ov::element::i64);
        auto create_conv = [&](const std::shared_ptr<ov::Node>& input_node) {
            ov::test::utils::InputGenerateData in_gen_data(0, 1);
            auto conv = std::make_shared<ov::opset8::Convolution>(
                input_node,
                ov::test::utils::make_constant(type, Shape{1, const1 / 2u, 3, 3}, ov::test::utils::InputGenerateData(0, 1)),
                Strides{1, 1},
                CoordinateDiff{1, 1},
                CoordinateDiff{1, 1},
                Strides{1, 1});
            conv->get_rt_info() =
                CPUTestUtils::CPUTestsBase::makeCPUInfo({CPUTestUtils::nhwc}, {CPUTestUtils::nhwc}, {});
            return conv;
        };
        auto create_relu = [&](const std::shared_ptr<ov::Node>& input_node) {
            return std::make_shared<ov::opset8::PRelu>(input_node,
                                                       ov::opset8::Constant::create(element::f32, Shape{1}, {1}));
        };
        auto conv1 = create_conv(input1);
        auto mvn =
            std::make_shared<ov::opset8::MVN>(create_relu(conv1), range, false, 0.1, op::MVNEpsMode::INSIDE_SQRT);
        auto mul = std::make_shared<ov::opset8::Multiply>(create_relu(add1), mvn);
        auto add2 = std::make_shared<ov::opset8::Add>(variadicSplit->output(1), mul);
        auto conv2 = create_conv(create_relu(add2));
        conv2->set_friendly_name("conv2");

        function = std::make_shared<ov::Model>(conv2, ParameterVector{input1, input2});
    }

    void TearDown() override {
        auto runtime_function = compiledModel.get_runtime_model();
        int nodes_found = 0;
        for (const auto& n : runtime_function->get_ordered_ops()) {
            auto layer_type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
            if (layer_type == "Subgraph") {
                nodes_found++;
                auto output_layout = n->get_rt_info().at(ov::exec_model_info::OUTPUT_LAYOUTS).as<std::string>();
                // The optimal choice should be 'nhwc' (acdb).
                ASSERT_EQ(output_layout, "acdb");
            }
        }
        ASSERT_GT(nodes_found, 0);
    }
};

TEST_F(SubgraphSelectPD, smoke_CompareWithRefs) {
    run();
}

}  // namespace test
}  // namespace ov
