Commit 2e6d061

[CPU] [LPT] CPU limitation (openvinotoolkit#22522)
* [CPU] [LPT] CPU limitation
* tests
* comments fixes
* tests fix
* tests refactoring
1 parent db4b33c commit 2e6d061

File tree

12 files changed: +101 −49 lines

src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp

+1 −10

@@ -41,16 +41,7 @@ namespace precision_set {
     LP_TRANSFORMATIONS_API const std::vector<element::Type>& get_int8_support();
     LP_TRANSFORMATIONS_API const std::vector<element::Type>& get_int8_int16_int32_support();
 } // namespace precision_set
-enum levels : size_t {
-    int4 = 16,
-    int4_narrow_range = 15,
-    int8 = 256,
-    int8_narrow_range = 255,
-    int16 = 65536,
-    int16_narrow_range = 65535,
-    int32 = size_t(4294967296), // for ARM and ia32 platforms where this number bigger than size_t but never used
-    int32_narrow_range = 4294967295
-};
+
 class LP_TRANSFORMATIONS_API DataPrecision {
 public:
     DataPrecision() : precision(element::undefined), min(0.f), max(0.f), hasZeroPoint(false) {}

src/common/low_precision_transformations/include/low_precision/low_precision.hpp

+5 −2

@@ -6,6 +6,7 @@

 #include <vector>
 #include <memory>
+#include <unordered_set>

 // one place to include all Low Precision Transformations from ov::pass::low_precision
 #include "low_precision/rt_info/intervals_alignment_attribute.hpp"
@@ -59,7 +60,7 @@ class ov::pass::low_precision::TypeRelaxedReplacer : public ov::pass::GraphRewrite {
     TypeRelaxedReplacer();
 };

-class ov::pass::low_precision::LowPrecision : public ov::pass::ModelPass {
+class LP_TRANSFORMATIONS_API ov::pass::low_precision::LowPrecision : public ov::pass::ModelPass {
 public:
     OPENVINO_RTTI("LowPrecision", "0");
     LowPrecision(
@@ -68,7 +69,9 @@ class ov::pass::low_precision::LowPrecision : public ov::pass::ModelPass {
         const LayerTransformation::Params = LayerTransformation::Params());
     bool run_on_model(const std::shared_ptr<ov::Model>& m) override;

-    static bool isFunctionQuantized(const std::shared_ptr<const ov::Model>& model);
+    static bool isFunctionQuantized(
+        const std::shared_ptr<const ov::Model>& model,
+        const std::set<levels>& supported_levels = all_levels);
     static bool isFQLevelsPresent(const std::shared_ptr<const ov::Model>& model, const std::set<size_t>& levels);

     template <typename T, class... Args>
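
The new optional argument lets a plugin state which FakeQuantize level counts it is willing to treat as quantization. A minimal usage sketch (the model variable and the level choice are illustrative, not taken from this commit):

    // Treat the model as quantized only if it contains int8 FakeQuantize operations.
    using namespace ov::pass::low_precision;
    const std::set<levels> int8_only = {levels::int8, levels::int8_narrow_range};
    const bool quantized = LowPrecision::isFunctionQuantized(model, int8_only);
    // Without the second argument, all_levels is used, preserving the previous behavior.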

src/common/low_precision_transformations/include/low_precision/quantization_details.hpp

+22 −1

@@ -5,6 +5,7 @@
 #pragma once

 #include <memory>
+#include <unordered_set>
 #include <ostream>
 #include <vector>

@@ -15,6 +16,24 @@ namespace ov {
 namespace pass {
 namespace low_precision {

+enum levels : size_t {
+    int4 = 16,
+    int4_narrow_range = 15,
+    int8 = 256,
+    int8_narrow_range = 255,
+    int16 = 65536,
+    int16_narrow_range = 65535,
+    int32 = size_t(4294967296), // for ARM and ia32 platforms where this number bigger than size_t but never used
+    int32_narrow_range = 4294967295
+};
+
+static std::set<levels> all_levels = {
+    levels::int4, levels::int4_narrow_range,
+    levels::int8, levels::int8_narrow_range,
+    levels::int16, levels::int16_narrow_range,
+    levels::int32, levels::int32_narrow_range
+};
+
 class LP_TRANSFORMATIONS_API QuantizationDetails {
 public:
     QuantizationDetails();
@@ -50,7 +69,9 @@ class LP_TRANSFORMATIONS_API QuantizationDetails {

     bool empty() const noexcept;

-    static bool isSupportedLevel(const size_t level);
+    static bool isSupportedLevel(
+        const size_t level,
+        const std::set<levels>& supported_levels = all_levels);

     const size_t levels;
     const std::vector<float> inputLowValues;
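
The enum values encode level counts rather than bit widths: a full-range n-bit quantization has 2^n levels, and the narrow-range variant drops one level (2^n − 1). A quick sanity check of that relationship (a sketch, not part of the commit):

    #include "low_precision/quantization_details.hpp"
    static_assert(ov::pass::low_precision::int4 == (1u << 4), "int4: 2^4 = 16 levels");
    static_assert(ov::pass::low_precision::int8_narrow_range == (1u << 8) - 1, "narrow range: one level fewer");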

src/common/low_precision_transformations/src/low_precision.cpp

+4 −2

@@ -297,7 +297,9 @@ bool ov::pass::low_precision::LowPrecision::run_on_model(const std::shared_ptr<ov::Model>& m) {
     return false;
 }

-bool ov::pass::low_precision::LowPrecision::isFunctionQuantized(const std::shared_ptr<const ov::Model>& model) {
+bool ov::pass::low_precision::LowPrecision::isFunctionQuantized(
+    const std::shared_ptr<const ov::Model>& model,
+    const std::set<levels>& supported_levels) {
     std::set<std::shared_ptr<ov::Node>> handledNodes;
     std::deque<std::shared_ptr<ov::Node>> nodes;
     for (const auto& result : model->get_results()) {
@@ -316,7 +318,7 @@ bool ov::pass::low_precision::LowPrecision::isFunctionQuantized(const std::shared_ptr<const ov::Model>& model) {

         if (const auto fakeQuantize = ov::as_type_ptr<ov::opset1::FakeQuantize>(parent)) {
             if (QuantizationDetails::outputLayoutIsSupported(fakeQuantize, true) &&
-                QuantizationDetails::isSupportedLevel(fakeQuantize->get_levels())) {
+                QuantizationDetails::isSupportedLevel(fakeQuantize->get_levels(), supported_levels)) {
                 return true;
             }
         } else if (const auto multiSubGraph = ov::as_type_ptr<ov::op::util::MultiSubGraphOp>(parent)) {

src/common/low_precision_transformations/src/quantization_details.cpp

+4 −9

@@ -170,15 +170,10 @@ bool QuantizationDetails::empty() const noexcept {
     return (levels == 0ul) && inputLowValues.empty() && inputHighValues.empty() && outputLowValues.empty() && outputHighValues.empty();
 }

-bool QuantizationDetails::isSupportedLevel(const size_t level) {
-    using ov::pass::low_precision::levels;
-    static const std::unordered_set<size_t> supported_levels = {
-        levels::int4, levels::int4_narrow_range,
-        levels::int8, levels::int8_narrow_range,
-        levels::int16, levels::int16_narrow_range,
-        levels::int32, levels::int32_narrow_range
-    };
-    return supported_levels.find(level) != supported_levels.end();
+bool QuantizationDetails::isSupportedLevel(
+    const size_t quantization_level,
+    const std::set<ov::pass::low_precision::levels>& supported_levels) {
+    return supported_levels.find(static_cast<ov::pass::low_precision::levels>(quantization_level)) != supported_levels.end();
 }

 } // namespace low_precision
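
Design-wise, the hard-coded unordered_set gives way to a caller-supplied std::set<levels>, which requires casting the raw size_t level count to the enum before the lookup. A hedged usage sketch:

    using namespace ov::pass::low_precision;
    QuantizationDetails::isSupportedLevel(256);                    // true: int8 is in the default all_levels
    QuantizationDetails::isSupportedLevel(65536, {levels::int8});  // false: int16 is not in the given set

A level count that matches no enumerator casts to an out-of-range enum value and simply fails the set lookup, so unknown levels remain unsupported.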

src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp

+13 −24

@@ -210,28 +210,24 @@ bool Transformations::fuse_type_to_convert(const std::shared_ptr<ov::Node>& node) {
 }

 void Transformations::UpToLpt() {
+    using namespace ov::pass::low_precision;
+    static const std::set<levels>& supported_fq_levels = {
+        levels::int4,
+        levels::int4_narrow_range,
+        levels::int8,
+        levels::int8_narrow_range
+    };
+
     const bool useLpt = enableLpt &&
-        ov::pass::low_precision::LowPrecision::isFunctionQuantized(model) &&
+        LowPrecision::isFunctionQuantized(model, supported_fq_levels) &&
         CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Lpt);

-    auto defaultPrecisions = useLpt ? ov::pass::low_precision::precision_set::get_int8_support() : std::vector<ov::element::Type>{};
-    bool hasINT16orINT32Levels = false;
-
-    if (useLpt) {
-        CPU_LPT_SCOPE(LowPrecisionTransformations_Part1);
-        hasINT16orINT32Levels = ov::pass::low_precision::LowPrecision::isFQLevelsPresent(
-            model,
-            {ov::pass::low_precision::levels::int16, ov::pass::low_precision::levels::int16_narrow_range,
-             ov::pass::low_precision::levels::int32, ov::pass::low_precision::levels::int32_narrow_range});
-        if (hasINT16orINT32Levels) {
-            defaultPrecisions = ov::pass::low_precision::precision_set::get_int8_int16_int32_support();
-        }
-    }
+    const auto defaultPrecisions = useLpt ? precision_set::get_int8_support() : std::vector<ov::element::Type>{};

     PreLpt(defaultPrecisions, isLegacyApi);

     if (useLpt)
-        Lpt(hasINT16orINT32Levels, defaultPrecisions);
+        Lpt(defaultPrecisions);
 }

 void Transformations::CpuSpecificOpSet(void) {
@@ -512,7 +508,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions, const bool isLegacyApi) {
     manager.run_passes(model);
 }

-void Transformations::Lpt(const bool hasINT16orINT32Levels, const std::vector<ov::element::Type>& defaultPrecisions) {
+void Transformations::Lpt(const std::vector<ov::element::Type>& defaultPrecisions) {
     CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Lpt);

     using namespace ov::pass::low_precision;
@@ -571,18 +567,11 @@ void Transformations::Lpt(const bool hasINT16orINT32Levels, const std::vector<ov::element::Type>& defaultPrecisions) {
         QuantizationGranularityRestriction::create<ov::opset1::ConvolutionBackpropData>({0})
     });

-    // for GNA networks reference execution
-    bool updatePrecision = true;
-    if (hasINT16orINT32Levels) {
-        updatePrecision = false;
-        supportedPrecisions = std::vector<PrecisionsRestriction>({});
-    }
-
     ov::pass::Manager lptManager;
     CPU_REGISTER_PASS_COMMON(lptManager, LowPrecision,
         supportedPrecisions,
         quantizationRestrictions,
-        LayerTransformation::Params(updatePrecision, ov::element::f32, defaultPrecisions));
+        LayerTransformation::Params(true, ov::element::f32, defaultPrecisions));

     CPU_SET_CALLBACK_COMMON(lptManager, [](const_node_ptr& node) -> bool {
         return ov::is_type<ov::opset1::Multiply>(node) &&
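
The net effect on the CPU plugin: models whose FakeQuantize operations use only int16/int32 levels are no longer considered quantized, so LPT is skipped entirely and the GNA-oriented special case (updatePrecision = false with cleared precision restrictions) becomes dead code. An illustrative trace under that assumption:

    using namespace ov::pass::low_precision;
    // A model whose only FakeQuantize has 65536 (int16) levels:
    const bool quantized = LowPrecision::isFunctionQuantized(model, supported_fq_levels);
    // quantized == false, so useLpt is false, defaultPrecisions stays empty,
    // and Lpt() is never invoked; the model runs in its original precision.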

src/plugins/intel_cpu/src/transformations/transformation_pipeline.h

+1 −1

@@ -50,7 +50,7 @@ class Transformations {

     void PreLpt(const std::vector<ov::element::Type>& defaultPrecisions, const bool isLegacyApi);

-    void Lpt(const bool hasINT16orINT32Levels, const std::vector<ov::element::Type>& defaultPrecisions);
+    void Lpt(const std::vector<ov::element::Type>& defaultPrecisions);

     void MainSnippets(void);
src/plugins/intel_cpu/tests/functional/shared_tests_instances/low_precision_transformations/reshape_transformation.cpp

+10 −0

@@ -163,6 +163,16 @@ const std::vector<ReshapeTransformationParam> params = {
         "Reshape",
         "f32"
     },
+
+    // int16 is not supported: no dequantization after Reshape: Reshape => Output
+    {
+        { 1, 3, 32 },
+        { 1, 3, 4, 8 },
+        { 65536ul, ov::Shape{ 1, 1, 1 }, { 0.f }, { 255.f }, { 0.f }, { 25.5f } },
+        "Reshape",
+        "f32",
+        { "Reshape", "Output" }
+    },
 };

 INSTANTIATE_TEST_SUITE_P(smoke_LPT, ReshapeTransformation,
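
In the added test case, 65536ul selects int16 levels, which the CPU plugin no longer handles, so LPT leaves the FakeQuantize unquantized: Reshape executes in f32 and is followed directly by Output, with no dequantization Multiply in between. The { "Reshape", "Output" } entry encodes that expected execution order, verified by check_execution_order (added below).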

src/tests/functional/plugin/shared/include/low_precision_transformations/reshape_transformation.hpp

+1 −0

@@ -19,6 +19,7 @@ class ReshapeTransformationParam {
     ov::builder::subgraph::FakeQuantizeOnData fakeQuantize;
     std::string layerType;
     std::string expectedKernelType;
+    std::vector<std::string> executionOrder;
 };

 typedef std::tuple<

src/tests/functional/plugin/shared/src/low_precision_transformations/reshape_transformation.cpp

+3 −0

@@ -51,6 +51,9 @@ void ReshapeTransformation::run() {
     LayerTransformation::run();

     const auto params = std::get<3>(GetParam());
+
+    EXPECT_TRUE(check_execution_order(params.executionOrder));
+
     auto actualPrecision = get_runtime_precision_by_type(params.layerType);
     const auto expectedPrecision = params.expectedKernelType;
     if ((expectedPrecision == "FP32") && (actualPrecision == "FP16")) {
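
Because check_execution_order returns true for an empty sequence, the new EXPECT_TRUE is a no-op for every existing parameter set that leaves executionOrder default-constructed; only tests that specify an order opt in to the check.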

src/tests/functional/shared_test_classes/include/shared_test_classes/base/low_precision_transformations/layer_transformation.hpp

+4 −0

@@ -53,6 +53,10 @@ class LayerTransformation : virtual public ov::test::SubgraphBaseTest {
     // get runtime precision by operation friendly name which can be fused
     std::string get_runtime_precision_by_fused_name(const std::string& layerName);

+    // check that orderedOpsTypes occurs as an operation sequence in the execution graph;
+    // orderedOpsTypes may list only the operations of interest (fewer than exist in the execution graph)
+    bool check_execution_order(const std::vector<std::string>& orderedOpsTypes);
+
     std::map<std::string, ov::Node::RTMap> get_runtime_info();

     void init_input_shapes(const ov::PartialShape& shape);

src/tests/functional/shared_test_classes/src/base/low_precision_transformations/layer_transformation.cpp

+33 −0

@@ -127,6 +127,39 @@ std::string LayerTransformation::get_runtime_precision_by_fused_name(const std::string& layerName) {
     return find_node_by_runtime_precision(compiledModel, is_node_f);
 }

+bool LayerTransformation::check_execution_order(const std::vector<std::string>& orderedOpsTypes) {
+    if (orderedOpsTypes.empty()) {
+        return true;
+    }
+
+    size_t comparisonIndex = 0;
+    const std::shared_ptr<const ov::Model>& execFunction = compiledModel.get_runtime_model();
+    for (const auto& op : execFunction->get_ordered_ops()) {
+        const auto& rtInfo = op->get_rt_info();
+        const auto& typeIt = rtInfo.find("layerType");
+        OPENVINO_ASSERT(typeIt != rtInfo.end(), "layerType is not found");
+
+        const auto layerType = typeIt->second.as<std::string>();
+        if (orderedOpsTypes[comparisonIndex] == layerType) {
+            // if comparisonIndex == 0, the comparison starts here;
+            // otherwise it is in progress: check the next operation type in the sequence
+            comparisonIndex++;
+
+            if (comparisonIndex >= orderedOpsTypes.size()) {
+                // all operation types in the sequence were matched; the comparison is finished
+                return true;
+            }
+        } else if (comparisonIndex != 0) {
+            // the comparison has started and the operation type does not match: exit
+            return false;
+        }
+    }
+
+    // we can get here only if the operation sequence is too long
+    // (the execution graph is missing some operations from the sequence)
+    return comparisonIndex == orderedOpsTypes.size();
+}
+
 std::map<std::string, ov::Node::RTMap> LayerTransformation::get_runtime_info() {
     const ov::CompiledModel& execNet = compiledModel;
     const std::shared_ptr<const ov::Model>& function = execNet.get_runtime_model();
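
A minimal usage sketch of the matching semantics (hypothetical layer types): the checker looks for orderedOpsTypes as one contiguous run starting at the first occurrence of its first element.

    // Suppose the execution graph's ordered ops are: Input -> Reshape -> Output
    check_execution_order({});                      // true: nothing to verify
    check_execution_order({"Reshape", "Output"});   // true: contiguous run found
    check_execution_order({"Input", "Output"});     // false: "Reshape" interrupts the run after "Input"
    check_execution_order({"Output", "Extra"});     // false: the graph ends before the run completes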
