Commit 25c216f

[CPU][ACL] LPT transformations are enabled + FQ decomposition (#28981)
### Details:
- Depends on #28870, so this PR should be merged after it
- LPT is enabled on ARM
- The ARM LPT transformation pipeline is separated from the common LPT transformation pipeline
- FQ is decomposed to avoid reference code

### Tickets:
- CVS-162444
1 parent ead1db4 commit 25c216f
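
For background on the "FQ is decomposed" item: FakeQuantize decomposition replaces a single FakeQuantize op with an explicit quantize/dequantize sequence that the plugin can execute with jit/ACL kernels instead of the slow reference implementation. A minimal scalar sketch of the arithmetic, assuming per-tensor ranges with input_low < input_high (the real ov::pass::FakeQuantizeDecomposition pass operates on tensors with broadcastable ranges):

```cpp
#include <algorithm>
#include <cmath>

// Scalar model of FakeQuantize and its quantize/dequantize decomposition
// (assumption: per-tensor ranges; illustrative only, not the pass itself).
float fake_quantize(float x, float in_lo, float in_hi, float out_lo, float out_hi, int levels) {
    // Quantize: clamp into the input range, then round onto `levels` discrete steps.
    const float clamped = std::min(std::max(x, in_lo), in_hi);
    const float q = std::round((clamped - in_lo) / (in_hi - in_lo) * (levels - 1));
    // Dequantize: map the discrete level back into the output range.
    return q / (levels - 1) * (out_hi - out_lo) + out_lo;
}
```

Once split, each half maps onto elementwise ops the ARM64 jit/ACL backends already support.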

File tree: 8 files changed (+177 -32 lines)


src/plugins/intel_cpu/src/config.h

+1 -1

@@ -89,7 +89,7 @@ struct Config {
     bool enableNodeSplit = false;
     bool enableHyperThreading = true;
     bool changedHyperThreading = false;
-#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
+#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64)
     LPTransformsMode lpTransformsMode = LPTransformsMode::On;
 #else
     // Currently INT8 mode is not optimized on ARM / RISCV or other non-x86 platforms, fallback to FP32 mode.
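
In practice this means ARM64 builds now default to low-precision transformations being on, matching x86/x86-64, while other non-x86 targets keep the FP32 fallback. A hypothetical check of the new default (illustrative only; the actual consumers of this flag live elsewhere in the plugin):

```cpp
// Hypothetical illustration: on an ARM64 build the default is now On.
ov::intel_cpu::Config config;
const bool lptEnabled = config.lpTransformsMode == ov::intel_cpu::Config::LPTransformsMode::On;
```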

src/plugins/intel_cpu/src/graph_optimizer.cpp

+57 -4

@@ -82,8 +82,8 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph& graph) {
     FuseMultiplyAndAdd(graph);
     graph.RemoveDroppedNodes();

-    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "MergeConvertAndScaleShift");
-    MergeConvertAndScaleShift(graph);
+    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "MergeConvertAndEltwise");
+    MergeConvertAndEltwise(graph);
     graph.RemoveDroppedNodes();

     OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "FuseFCAndConvertOnWeights");

@@ -166,6 +166,10 @@ void GraphOptimizer::ApplyCommonGraphOptimizations(Graph& graph) {
     FuseEltwiseAndSimple(graph);
     graph.RemoveDroppedNodes();

+    OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "MergeEltwiseAndConvert");
+    MergeEltwiseAndConvert(graph);
+    graph.RemoveDroppedNodes();
+
     OV_ITT_SCOPE_NEXT(FIRST_INFERENCE, taskChain, "reshapeRnnSeq");
     reshapeRnnSeq(graph);
     graph.RemoveDroppedNodes();

@@ -680,12 +684,61 @@ void GraphOptimizer::FuseMultiplyAndAdd(Graph& graph) {
     }
 }

-void GraphOptimizer::MergeConvertAndScaleShift(Graph& graph) {
+void GraphOptimizer::MergeEltwiseAndConvert(Graph& graph) {
+    // The pass is enabled on arm platforms only; however, it might be useful for other platforms as well.
+    // It requires additional perf validation. Ticket: 163388
+#if !defined(OPENVINO_ARCH_ARM64)
+    return;
+#endif
+    auto& graphNodes = graph.GetNodes();
+
+    auto parent = graphNodes.begin();
+    while (parent != graphNodes.end()) {
+        CPU_GRAPH_OPTIMIZER_SCOPE(MergeEltwiseAndConvert);
+        auto parentNode = *parent;
+        if (parentNode->getType() != Type::Eltwise) {
+            parent++;
+            continue;
+        }
+
+        const auto& childEdges = parentNode->getChildEdges();
+        if (childEdges.size() != 1) {
+            parent++;
+            continue;
+        }
+
+        const auto edge = childEdges[0].lock();
+        auto childNode = edge->getChild();
+        if (childNode->getType() != Type::Convert) {
+            parent++;
+            continue;
+        }
+
+        const auto eltwise = dynamic_cast<ov::intel_cpu::node::Eltwise*>(parentNode.get());
+        if (!eltwise->canFuseConvert(childNode)) {
+            parent++;
+            continue;
+        }
+
+        // WA: Eltwise node uses precision of last fused node as output precision
+        auto fusedOps = parentNode->getFusedWith();
+        if (!fusedOps.empty()) {
+            fusedOps[fusedOps.size() - 1]->setOriginalOutputPrecisionAtPort(
+                0,
+                childNode->getOriginalOutputPrecisionAtPort(0));
+        }
+        parentNode->setOriginalOutputPrecisionAtPort(0, childNode->getOriginalOutputPrecisionAtPort(0));
+        parentNode->addOriginalLayer(childNode->getOriginalLayers());
+        graph.DropNode(childNode);
+    }
+}
+
+void GraphOptimizer::MergeConvertAndEltwise(Graph& graph) {
     auto& graphNodes = graph.GetNodes();

     auto parent = graphNodes.begin();
     while (parent != graphNodes.end()) {
-        CPU_GRAPH_OPTIMIZER_SCOPE(MergeConvertAndScaleShift);
+        CPU_GRAPH_OPTIMIZER_SCOPE(MergeConvertAndEltwise);
         auto parentNode = *parent;
         if (parentNode->getType() != Type::Convert) {
             parent++;
src/plugins/intel_cpu/src/graph_optimizer.h

+2 -1

@@ -23,7 +23,8 @@ class GraphOptimizer {
     void FuseConvolutionMatMulDeconvAndBias(Graph& graph);
     void FuseDeconvolutionAndSimpleOperation(Graph& graph);
     void FuseMultiplyAndAdd(Graph& graph);
-    void MergeConvertAndScaleShift(Graph& graph);
+    void MergeEltwiseAndConvert(Graph& graph);
+    void MergeConvertAndEltwise(Graph& graph);
     void FuseFCAndConvertOnWeights(Graph& graph);
     void FuseFCAndTransposeOnWeights(Graph& graph);
     void FuseFullyConnectedAndSimpleOperation(Graph& graph);

src/plugins/intel_cpu/src/nodes/eltwise.cpp

+17

@@ -2277,6 +2277,23 @@ bool Eltwise::canFuseParent(const NodePtr& parentNode) const {
     return true;
 }

+bool Eltwise::canFuseConvert(const NodePtr& convertNode) const {
+    if (!one_of(convertNode->getOriginalOutputPrecisionAtPort(0),
+                ov::element::i8,
+                ov::element::u8,
+                ov::element::f16,
+                ov::element::bf16,
+                ov::element::f32)) {
+        return false;
+    }
+    // Convert can be fused into Eltwise only if jit implementation is supported
+#if defined(OPENVINO_ARCH_ARM64)
+    return jitIsSupported(this, getAlpha(), getBeta(), getGamma());
+#else
+    return false;
+#endif
+}
+
 bool Eltwise::canFuse(const NodePtr& node) const {
     auto isIntegerComputeSupported = [](const Node* node) {
         if (!one_of(node->getAlgorithm(),

src/plugins/intel_cpu/src/nodes/eltwise.h

+1

@@ -43,6 +43,7 @@ class Eltwise : public Node {
     void execute(const dnnl::stream& strm) override;
     bool created() const override;
     bool canBeInPlace() const override;
+    bool canFuseConvert(const NodePtr& convertNode) const;
     bool canFuseParent(const NodePtr& parentNode) const;
     bool canFuse(const NodePtr& node) const override;
     void appendPostOps(dnnl::post_ops& ops,

src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp

+86 -25

@@ -115,15 +115,24 @@

 // LPT transformations
 #include "low_precision/add.hpp"
+#include "low_precision/avg_pool.hpp"
 #include "low_precision/convert_subtract_constant.hpp"
 #include "low_precision/convolution_backprop_data.hpp"
 #include "low_precision/fold_convert.hpp"
 #include "low_precision/fuse_convert.hpp"
 #include "low_precision/group_convolution.hpp"
+#include "low_precision/interpolate.hpp"
 #include "low_precision/mat_mul.hpp"
+#include "low_precision/max_pool.hpp"
 #include "low_precision/multiply_to_group_convolution.hpp"
+#include "low_precision/mvn.hpp"
 #include "low_precision/network_helper.hpp"
+#include "low_precision/normalize_l2.hpp"
 #include "low_precision/recurrent_cell.hpp"
+#include "low_precision/reduce_max.hpp"
+#include "low_precision/reduce_mean.hpp"
+#include "low_precision/reduce_min.hpp"
+#include "low_precision/reduce_sum.hpp"
 #include "low_precision/rt_info/bias_attribute.hpp"
 #include "transformations/low_precision/mark_dequantization_subgraph.hpp"

@@ -159,6 +168,7 @@
 #include "snippets/pass/explicit_transpose_matmul_inputs.hpp"
 #include "snippets/pass/extract_reshapes_from_mha.hpp"
 #include "snippets/pass/fc_tokenization.hpp"
+#include "snippets/pass/fq_decomposition.hpp"
 #include "snippets/pass/mha_tokenization.hpp"
 #include "snippets/pass/split_dimension_m.hpp"
 #include "snippets/pass/tokenization.hpp"

@@ -422,7 +432,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions) {
     if (config.inferencePrecision == ov::element::f16) {
         precisions_map fp_convert_precision_map = {{ov::element::f32, ov::element::f16}};
 #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
-        type_to_fuse_map fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), fuse_type_to_fq}};
+        type_to_fuse_map fuse_map = {};
 #else
         type_to_fuse_map fuse_map = {{ov::op::PagedAttentionExtension::get_type_info_static(), fuse_type_to_pa}};
 #endif

@@ -764,12 +774,58 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions) {
     manager.run_passes(model);
 }

-void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
-    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Lpt);
-
+void Transformations::runLptPasses(const std::vector<ov::element::Type>& defaultPrecisions) {
     using namespace ov::pass::low_precision;
-    CPU_LPT_SCOPE(LowPrecisionTransformations_Part4);
-    OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "LowPrecisionTransformations");
+    ov::pass::Manager lptManager("CPU:LPT");
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    auto supportedPrecisions = std::vector<PrecisionsRestriction>({
+        PrecisionsRestriction::create<ov::opset1::MatMul>({{{0, 1}, {ov::element::i8}}}),
+    });
+
+    auto quantizationRestrictions = std::vector<QuantizationGranularityRestriction>();
+
+    CPU_REGISTER_PASS_COMMON(lptManager,
+                             LowPrecision,
+                             supportedPrecisions,
+                             quantizationRestrictions,
+                             LayerTransformation::Params(true, ov::element::f32, defaultPrecisions));
+    CPU_DISABLE_PASS_COMMON(lptManager, AvgPoolTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ConvolutionTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ConvolutionBackpropDataTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, InterpolateTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, GroupConvolutionTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, MaxPoolTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, MVNTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, NormalizeL2Transformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, RecurrentCellTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceMaxTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceMeanTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceMinTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceSumTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, MultiplyToGroupConvolutionTransformation);
+
+    CPU_SET_CALLBACK_COMMON(
+        lptManager,
+        [](const_node_ptr& node) -> bool {
+            return ov::marked_as_bias(node);
+        },
+        AddTransformation);
+
+    // Enable MatMulTransformation against FC nodes only
+    // int8 MatMul is disabled because acl_lowp_matmul_t supports 2D case only
+    // most models have 3D/4D cases, so fallback to jit_gemm_i8 gives worse perf than gemm_acl_f16
+    // oneDNN ticket #2696
+    CPU_SET_CALLBACK_COMMON(
+        lptManager,
+        [&](const_node_ptr& node) -> bool {
+            if (NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) &&
+                one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3)) {
+                return false;
+            }
+            return true;
+        },
+        MatMulTransformation);
+#else
     // Only enable conv/group conv signed input on AMX and avx2_vnni_2 platform.
     std::vector<ov::element::Type> input0LowPrecisionList;
     if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) ||

@@ -807,7 +863,6 @@ void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
         {QuantizationGranularityRestriction::create<ov::opset1::Convolution>({0}),
          QuantizationGranularityRestriction::create<ov::opset1::ConvolutionBackpropData>({0})});

-    ov::pass::Manager lptManager("CPU:LPT");
     CPU_REGISTER_PASS_COMMON(lptManager,
                              LowPrecision,
                              supportedPrecisions,

@@ -857,27 +912,20 @@ void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
         },
         FuseConvertTransformation);

-    // Enable MatMulTransformation against FC nodes only
-    // int8 MatMul is disabled because acl_lowp_matmul_t supports 2D case only
-    // most models have 3D/4D cases, so fallback to jit_gemm_i8 gives worse perf than gemm_acl_f16
-    // oneDNN ticket #2696
-    CPU_SET_CALLBACK_ARM(
-        lptManager,
-        [&](const_node_ptr& node) -> bool {
-            if (NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) &&
-                one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3)) {
-                return false;
-            }
-            return true;
-        },
-        MatMulTransformation);
-
-    CPU_DISABLE_PASS_ARM(lptManager, RecurrentCellTransformation);
     CPU_DISABLE_PASS_COMMON(lptManager, MultiplyToGroupConvolutionTransformation);
-
+#endif
     lptManager.run_passes(model);
 }

+void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
+    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Lpt);
+
+    CPU_LPT_SCOPE(LowPrecisionTransformations_Part4);
+    OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "LowPrecisionTransformations");
+
+    runLptPasses(defaultPrecisions);
+}
+
 void Transformations::PostLpt() {
     CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, PostLpt);

@@ -997,6 +1045,19 @@
 }

 void Transformations::MainSnippets(void) {
+    // Disable MainSnippets for int8 models on arm platforms due to performance issues
+    // Ticket: 163408
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    using namespace ov::pass::low_precision;
+    static const std::set<levels>& supported_fq_levels = {levels::int4,
+                                                          levels::int4_narrow_range,
+                                                          levels::int8,
+                                                          levels::int8_narrow_range};
+    if (LowPrecision::isFunctionQuantized(model, supported_fq_levels)) {
+        return;
+    }
+#endif
+
     auto is_supported_isa = []() {
 #if defined(OPENVINO_ARCH_X86_64)
         return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2);

@@ -1342,7 +1403,7 @@ void Transformations::PostSnippets(void) {
     ov::pass::Manager postSnippetsManager("CPU:PostSnippets");
     postSnippetsManager.set_per_pass_validation(false);
     CPU_REGISTER_PASS_COMMON(postSnippetsManager, ov::pass::FakeQuantizeDecomposition);
-    CPU_SET_CALLBACK_COMMON(
+    CPU_SET_CALLBACK_X64(
         postSnippetsManager,
         [](const_node_ptr& node) -> bool {
             std::string errMsg;
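
A note for readers unfamiliar with the CPU_REGISTER_PASS_COMMON / CPU_DISABLE_PASS_COMMON / CPU_SET_CALLBACK_COMMON helpers: assuming they expand to the standard ov::pass::Manager and PassConfig calls (a simplification; the real macros also handle per-platform conditional compilation), the new ARM branch above reads roughly like the following sketch, reusing the variables from the diff:

```cpp
using namespace ov::pass::low_precision;

ov::pass::Manager lptManager("CPU:LPT");
// Register the umbrella LowPrecision pass with the ARM-specific restrictions.
lptManager.register_pass<LowPrecision>(
    supportedPrecisions,
    quantizationRestrictions,
    LayerTransformation::Params(true, ov::element::f32, defaultPrecisions));

// Disable sub-transformations that have no low-precision ARM kernels yet.
lptManager.get_pass_config()->disable<AvgPoolTransformation>();
lptManager.get_pass_config()->disable<ConvolutionTransformation>();
// ...same for the remaining CPU_DISABLE_PASS_COMMON entries above.

// A callback vetoes the transformation for a given node when it returns true.
lptManager.get_pass_config()->set_callback<MatMulTransformation>(
    [&](const std::shared_ptr<const ov::Node>& node) -> bool {
        // Keep int8 only for FC-like MatMuls: constant weights of rank 2 or 3.
        return !(NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) &&
                 one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3));
    });

lptManager.run_passes(model);
```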

src/plugins/intel_cpu/src/transformations/transformation_pipeline.h

+1

@@ -38,6 +38,7 @@ class Transformations {
     void PreLpt(const std::vector<ov::element::Type>& defaultPrecisions);

     void Lpt(const std::vector<ov::element::Type>& defaultPrecisions);
+    void runLptPasses(const std::vector<ov::element::Type>& defaultPrecisions);

     void MainSnippets(void);

src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/stateful_init_graph.cpp

+12 -1

@@ -217,7 +217,6 @@ class InitGraphStatefulDiffPrimitiveModel : public InitGraphStatefulModelBase {

         configuration.insert({"SNIPPETS_MODE", "DISABLE"});

-        bool directPair;
         std::tie(inputShapes, directPair) = this->GetParam();

         init_input_shapes(inputShapes);

@@ -250,12 +249,24 @@ class InitGraphStatefulDiffPrimitiveModel : public InitGraphStatefulModelBase {
     }

     void check_init_graph_node() override {
+#if defined(OPENVINO_ARCH_ARM64)
+        // Convert node is fused into Eltwise on arm platforms
+        if (directPair) {
+            CheckNumberOfNodesWithType(compiledModel, "Convert", 0);
+        } else {
+            CheckNumberOfNodesWithType(compiledModel, "Convert", 1);
+        }
+#else
         CheckNumberOfNodesWithType(compiledModel, "Convert", 1);
+#endif
     }

     ov::Shape get_state_shape(size_t i) override {
         return inputShapes[0].second[i];
     }
+
+private:
+    bool directPair;
 };

 TEST_P(InitGraphStatefulDiffPrimitiveModel, CompareWithRefs) {
