@@ -115,15 +115,24 @@
 
 // LPT transformations
 #include "low_precision/add.hpp"
+#include "low_precision/avg_pool.hpp"
 #include "low_precision/convert_subtract_constant.hpp"
 #include "low_precision/convolution_backprop_data.hpp"
 #include "low_precision/fold_convert.hpp"
 #include "low_precision/fuse_convert.hpp"
 #include "low_precision/group_convolution.hpp"
+#include "low_precision/interpolate.hpp"
 #include "low_precision/mat_mul.hpp"
+#include "low_precision/max_pool.hpp"
 #include "low_precision/multiply_to_group_convolution.hpp"
+#include "low_precision/mvn.hpp"
 #include "low_precision/network_helper.hpp"
+#include "low_precision/normalize_l2.hpp"
 #include "low_precision/recurrent_cell.hpp"
+#include "low_precision/reduce_max.hpp"
+#include "low_precision/reduce_mean.hpp"
+#include "low_precision/reduce_min.hpp"
+#include "low_precision/reduce_sum.hpp"
 #include "low_precision/rt_info/bias_attribute.hpp"
 #include "transformations/low_precision/mark_dequantization_subgraph.hpp"
 
@@ -159,6 +168,7 @@
 #include "snippets/pass/explicit_transpose_matmul_inputs.hpp"
 #include "snippets/pass/extract_reshapes_from_mha.hpp"
 #include "snippets/pass/fc_tokenization.hpp"
+#include "snippets/pass/fq_decomposition.hpp"
 #include "snippets/pass/mha_tokenization.hpp"
 #include "snippets/pass/split_dimension_m.hpp"
 #include "snippets/pass/tokenization.hpp"
@@ -422,7 +432,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions) {
     if (config.inferencePrecision == ov::element::f16) {
         precisions_map fp_convert_precision_map = {{ov::element::f32, ov::element::f16}};
 #if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
-        type_to_fuse_map fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), fuse_type_to_fq}};
+        type_to_fuse_map fuse_map = {};
 #else
         type_to_fuse_map fuse_map = {{ov::op::PagedAttentionExtension::get_type_info_static(), fuse_type_to_pa}};
 #endif
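For context: `type_to_fuse_map` maps an operation's `type_info` to a callback that `ov::pass::ConvertPrecision` invokes for matching nodes instead of its generic conversion logic; with the entry removed on ARM, FakeQuantize nodes now go through the default f32→f16 conversion path. A minimal sketch of the wiring, assuming the usual `ConvertPrecision(precisions, type_to_fuse)` overload; `handle_fq_node` and `convert_to_f16` are hypothetical names, not code from this patch:

```cpp
#include "openvino/core/model.hpp"
#include "openvino/opsets/opset1.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/convert_precision.hpp"

// Hypothetical handler: return true if the node was fully handled here,
// so ConvertPrecision's generic logic skips it; false falls through.
bool handle_fq_node(const std::shared_ptr<ov::Node>& node, const precisions_map& precisions) {
    return false;
}

void convert_to_f16(const std::shared_ptr<ov::Model>& model) {
    precisions_map precisions = {{ov::element::f32, ov::element::f16}};
    type_to_fuse_map fuse_map = {{ov::opset1::FakeQuantize::get_type_info_static(), handle_fq_node}};

    ov::pass::Manager manager;
    manager.register_pass<ov::pass::ConvertPrecision>(precisions, fuse_map);
    manager.run_passes(model);
}
```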
|
@@ -764,12 +774,58 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions) {
     manager.run_passes(model);
 }
 
-void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
-    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Lpt);
-
+void Transformations::runLptPasses(const std::vector<ov::element::Type>& defaultPrecisions) {
     using namespace ov::pass::low_precision;
-    CPU_LPT_SCOPE(LowPrecisionTransformations_Part4);
-    OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "LowPrecisionTransformations");
+    ov::pass::Manager lptManager("CPU:LPT");
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    auto supportedPrecisions = std::vector<PrecisionsRestriction>({
+        PrecisionsRestriction::create<ov::opset1::MatMul>({{{0, 1}, {ov::element::i8}}}),
+    });
+
+    auto quantizationRestrictions = std::vector<QuantizationGranularityRestriction>();
+
+    CPU_REGISTER_PASS_COMMON(lptManager,
+                             LowPrecision,
+                             supportedPrecisions,
+                             quantizationRestrictions,
+                             LayerTransformation::Params(true, ov::element::f32, defaultPrecisions));
+    CPU_DISABLE_PASS_COMMON(lptManager, AvgPoolTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ConvolutionTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ConvolutionBackpropDataTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, InterpolateTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, GroupConvolutionTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, MaxPoolTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, MVNTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, NormalizeL2Transformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, RecurrentCellTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceMaxTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceMeanTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceMinTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, ReduceSumTransformation);
+    CPU_DISABLE_PASS_COMMON(lptManager, MultiplyToGroupConvolutionTransformation);
+
+    CPU_SET_CALLBACK_COMMON(
+        lptManager,
+        [](const_node_ptr& node) -> bool {
+            return ov::marked_as_bias(node);
+        },
+        AddTransformation);
+
+    // Enable MatMulTransformation for FC nodes only.
+    // int8 MatMul is disabled because acl_lowp_matmul_t supports the 2D case only;
+    // most models have 3D/4D cases, so falling back to jit_gemm_i8 gives worse performance than gemm_acl_f16.
+    // oneDNN ticket #2696
+    CPU_SET_CALLBACK_COMMON(
+        lptManager,
+        [&](const_node_ptr& node) -> bool {
+            if (NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) &&
+                one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3)) {
+                return false;
+            }
+            return true;
+        },
+        MatMulTransformation);
+#else
     // Only enable conv/group conv signed input on AMX and avx2_vnni_2 platform.
     std::vector<ov::element::Type> input0LowPrecisionList;
     if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) ||
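The `CPU_REGISTER_PASS_COMMON`/`CPU_DISABLE_PASS_COMMON`/`CPU_SET_CALLBACK_COMMON` macros are the plugin's wrappers around `ov::pass::Manager` and its `PassConfig`. As a rough sketch of what the ARM branch above amounts to — an assumption about the macro expansion, reusing `supportedPrecisions`, `quantizationRestrictions`, and `defaultPrecisions` from the hunk:

```cpp
using namespace ov::pass::low_precision;

ov::pass::Manager lptManager("CPU:LPT");
lptManager.register_pass<LowPrecision>(
    supportedPrecisions,
    quantizationRestrictions,
    LayerTransformation::Params(true, ov::element::f32, defaultPrecisions));

// disable<T>() removes a transformation from the pipeline entirely.
lptManager.get_pass_config()->disable<AvgPoolTransformation>();

// A per-pass callback vetoes individual nodes: returning true skips the node.
// MatMul therefore stays quantized only in FC-like cases (constant 2D/3D weights).
lptManager.get_pass_config()->set_callback<MatMulTransformation>(
    [](const std::shared_ptr<const ov::Node>& node) -> bool {
        return !(NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) &&
                 one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3));
    });
```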
|
@@ -807,7 +863,6 @@ void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
         {QuantizationGranularityRestriction::create<ov::opset1::Convolution>({0}),
          QuantizationGranularityRestriction::create<ov::opset1::ConvolutionBackpropData>({0})});
 
-    ov::pass::Manager lptManager("CPU:LPT");
     CPU_REGISTER_PASS_COMMON(lptManager,
                              LowPrecision,
                              supportedPrecisions,
@@ -857,27 +912,20 @@ void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
         },
         FuseConvertTransformation);
 
-    // Enable MatMulTransformation against FC nodes only
-    // int8 MatMul is disabled because acl_lowp_matmul_t supports 2D case only
-    // most models have 3D/4D cases, so fallback to jit_gemm_i8 gives worse perf than gemm_acl_f16
-    // oneDNN ticket #2696
-    CPU_SET_CALLBACK_ARM(
-        lptManager,
-        [&](const_node_ptr& node) -> bool {
-            if (NetworkHelper::isConstantPath(node->get_input_node_shared_ptr(1)) &&
-                one_of(node->input_value(1).get_partial_shape().rank().get_length(), 2, 3)) {
-                return false;
-            }
-            return true;
-        },
-        MatMulTransformation);
-
-    CPU_DISABLE_PASS_ARM(lptManager, RecurrentCellTransformation);
     CPU_DISABLE_PASS_COMMON(lptManager, MultiplyToGroupConvolutionTransformation);
-
+#endif
     lptManager.run_passes(model);
 }
 
+void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
+    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Lpt);
+
+    CPU_LPT_SCOPE(LowPrecisionTransformations_Part4);
+    OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "LowPrecisionTransformations");
+
+    runLptPasses(defaultPrecisions);
+}
+
 void Transformations::PostLpt() {
     CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, PostLpt);
 
|
@@ -997,6 +1045,19 @@ void Transformations::PostLpt() {
 }
 
 void Transformations::MainSnippets(void) {
+// Disable MainSnippets for int8 models on ARM platforms due to performance issues.
+// Ticket: 163408
+#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
+    using namespace ov::pass::low_precision;
+    static const std::set<levels>& supported_fq_levels = {levels::int4,
+                                                          levels::int4_narrow_range,
+                                                          levels::int8,
+                                                          levels::int8_narrow_range};
+    if (LowPrecision::isFunctionQuantized(model, supported_fq_levels)) {
+        return;
+    }
+#endif
+
     auto is_supported_isa = []() {
 #if defined(OPENVINO_ARCH_X86_64)
         return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2);
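`LowPrecision::isFunctionQuantized` reports whether the model contains FakeQuantize operations that LPT would handle; restricting it to the int4/int8 `levels` values keeps pure-float models eligible for snippets tokenization. A conceptual sketch of such a check — not the actual LPT implementation, and the numeric values assume LPT's usual mapping of the enum to the FakeQuantize `levels` attribute:

```cpp
#include <memory>
#include <set>

#include "openvino/core/model.hpp"
#include "openvino/opsets/opset1.hpp"

// Treat a model as quantized when any FakeQuantize carries a supported
// `levels` value (assumed: int4 = 16, int4_narrow_range = 15,
// int8 = 256, int8_narrow_range = 255).
bool looks_quantized(const std::shared_ptr<const ov::Model>& model) {
    static const std::set<size_t> supported_levels = {15, 16, 255, 256};
    for (const auto& op : model->get_ops()) {
        if (auto fq = ov::as_type_ptr<ov::opset1::FakeQuantize>(op)) {
            if (supported_levels.count(fq->get_levels()) != 0) {
                return true;
            }
        }
    }
    return false;
}
```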
|
@@ -1342,7 +1403,7 @@ void Transformations::PostSnippets(void) {
     ov::pass::Manager postSnippetsManager("CPU:PostSnippets");
     postSnippetsManager.set_per_pass_validation(false);
    CPU_REGISTER_PASS_COMMON(postSnippetsManager, ov::pass::FakeQuantizeDecomposition);
-    CPU_SET_CALLBACK_COMMON(
+    CPU_SET_CALLBACK_X64(
         postSnippetsManager,
         [](const_node_ptr& node) -> bool {
             std::string errMsg;