[intel-npu] Adding NPU_QDQ_OPTIMIZATION property (#29377)

csoka · web-flow · commit b274da878a6e · 2025-03-11T10:19:49.000Z
### Details: - Adds a new property, NPU_QDQ_OPTIMIZATION, for enabling/disabling additional optimizations that balance performance and accuracy for QDQ format models quantized using ONNX Runtime - Intended to map to "NPU_COMPILATION_MODE_PARAMS":"enable-adaptive-stripping=true" ### Tickets: - *EISW-159283*
diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst
@@ -144,6 +144,7 @@ offer a limited set of supported OpenVINO features.
          ov::workload_type
          ov::intel_npu::compilation_mode_params
          ov::intel_npu::compiler_dynamic_quantization
+         ov::intel_npu::qdq_optimization
          ov::intel_npu::turbo
          ov::intel_npu::tiles
          ov::intel_npu::max_tiles
diff --git a/src/bindings/python/src/pyopenvino/core/properties/properties.cpp b/src/bindings/python/src/pyopenvino/core/properties/properties.cpp
@@ -337,4 +337,5 @@ void regmodule_properties(py::module m) {
     wrap_property_RW(m_intel_npu, ov::intel_npu::bypass_umd_caching, "bypass_umd_caching");
     wrap_property_RW(m_intel_npu, ov::intel_npu::defer_weights_load, "defer_weights_load");
     wrap_property_RW(m_intel_npu, ov::intel_npu::compiler_dynamic_quantization, "compiler_dynamic_quantization");
+    wrap_property_RW(m_intel_npu, ov::intel_npu::qdq_optimization, "qdq_optimization");
 }
diff --git a/src/inference/include/openvino/runtime/intel_npu/properties.hpp b/src/inference/include/openvino/runtime/intel_npu/properties.hpp
@@ -77,6 +77,15 @@ static constexpr ov::Property<std::string> compilation_mode_params{"NPU_COMPILAT
  */
 static constexpr ov::Property<bool> compiler_dynamic_quantization{"NPU_COMPILER_DYNAMIC_QUANTIZATION"};
 
+/**
+ * @brief [Only for NPU compiler]
+ * Type: boolean
+ * This option enables additional optimizations that balance performance and accuracy for QDQ format models quantized
+ * using ONNX Runtime
+ * @ingroup ov_runtime_npu_prop_cpp_api
+ */
+static constexpr ov::Property<bool> qdq_optimization{"NPU_QDQ_OPTIMIZATION"};
+
 /**
  * @brief [Only for NPU plugin]
  * Type: std::bool
diff --git a/src/plugins/intel_npu/README.md b/src/plugins/intel_npu/README.md
@@ -174,6 +174,7 @@ The following properties are supported:
 | `ov::intel_npu::compiler_version`/</br>`NPU_COMPILER_VERSION` | RO | NPU compiler version. MSB 16 bits are Major version, LSB 16 bits are Minor version | `N/A` | `N/A` |
 | `ov::intel_npu::compilation_mode_params`/</br>`NPU_COMPILATION_MODE_PARAMS` | RW | Set various parameters supported by the NPU compiler. (See bellow) | `<std::string>`| `N/A` |
 | `ov::intel_npu::compiler_dynamic_quantization`/</br>`NPU_COMPILER_DYNAMIC_QUANTIZATION` | RW | Enable/Disable dynamic quantization by NPU compiler | `YES` / `NO` | `N/A` |
+| `ov::intel_npu::qdq_optimization`/</br>`NPU_QDQ_OPTIMIZATION` | RW | Enable/Disable additional optimizations that balance performance and accuracy for QDQ format models quantized using ONNX Runtime | `YES` / `NO` | `NO` |
 | `ov::intel_npu::turbo`/</br>`NPU_TURBO` | RW | Set Turbo mode on/off | `YES`/ `NO`| `NO` |
 | `ov::intel_npu::tiles`/</br>`NPU_TILES` | RW | Sets the number of npu tiles to compile the model for | `[0-]` | `-1` |
 | `ov::intel_npu::max_tiles`/</br>`NPU_MAX_TILES` | RW | Maximum number of tiles supported by the device we compile for. Can be set for offline compilation. If not set, it will be populated by driver.| `[0-]` | `[1-6] depends on npu platform` |
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/compiler.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/compiler.hpp
@@ -379,4 +379,26 @@ struct COMPILER_DYNAMIC_QUANTIZATION final : OptionBase<COMPILER_DYNAMIC_QUANTIZ
     }
 };
 
+//
+// NPU_QDQ_OPTIMIZATION - boolean compiler option behind ov::intel_npu::qdq_optimization
+//
+
+struct QDQ_OPTIMIZATION final : OptionBase<QDQ_OPTIMIZATION, bool> {
+    static std::string_view key() {
+        return ov::intel_npu::qdq_optimization.name();  // option key is the public property's name
+    }
+
+    static bool defaultValue() {
+        return false;  // optimization is off unless explicitly enabled
+    }
+
+    static OptionMode mode() {
+        return OptionMode::CompileTime;  // declared as a compile-time option
+    }
+
+    static bool isPublic() {
+        return true;  // reported as a public option
+    }
+};
+
 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/al/src/config/compiler.cpp b/src/plugins/intel_npu/src/al/src/config/compiler.cpp
@@ -25,6 +25,7 @@ void intel_npu::registerCompilerOptions(OptionsDesc& desc) {
     desc.add<DYNAMIC_SHAPE_TO_STATIC>();
     desc.add<EXECUTION_MODE_HINT>();
     desc.add<COMPILER_DYNAMIC_QUANTIZATION>();
+    desc.add<QDQ_OPTIMIZATION>();
 }
 
 //
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp
@@ -552,6 +552,16 @@ std::string DriverCompilerAdapter::serializeConfig(const Config& config,
         content = std::regex_replace(content, std::regex(dqstr.str()), "");
     }
 
+    // QDQ_OPTIMIZATION is not supported in compiler versions < 7.5 - need to remove it from the serialized config
+    if ((compilerVersion.major < 7) || (compilerVersion.major == 7 && compilerVersion.minor < 5)) {
+        std::ostringstream qdqstr;  // regex pattern: <property name><separator><delimiter><non-space value><delimiter>
+        qdqstr << ov::intel_npu::qdq_optimization.name() << KEY_VALUE_SEPARATOR << VALUE_DELIMITER << "\\S+"
+               << VALUE_DELIMITER;
+        logger.warning("NPU_QDQ_OPTIMIZATION property is not supported by this compiler version. Removing from "
+                       "parameters");  // NOTE(review): logged for every older compiler, even if the key is absent
+        content = std::regex_replace(content, std::regex(qdqstr.str()), "");  // leaves content unchanged if no match
+    }
+
     // NPU_DEFER_WEIGHTS_LOAD is needed at runtime only
     {
         std::ostringstream batchstr;
diff --git a/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/src/compiled_model.cpp
@@ -271,6 +271,12 @@ void CompiledModel::initialize_properties() {
           [](const Config& config) {
               return config.get<COMPILER_DYNAMIC_QUANTIZATION>();
           }}},
+        {ov::intel_npu::qdq_optimization.name(),
+         {true,
+          ov::PropertyMutability::RO,
+          [](const Config& config) {
+              return config.get<QDQ_OPTIMIZATION>();
+          }}},
         {ov::intel_npu::turbo.name(),
          {isPropertySupported(ov::intel_npu::turbo.name()),
           ov::PropertyMutability::RO,
diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp
@@ -462,6 +462,12 @@ Plugin::Plugin()
           [](const Config& config) {
               return config.get<COMPILER_DYNAMIC_QUANTIZATION>();
           }}},
+        {ov::intel_npu::qdq_optimization.name(),
+         {false,
+          ov::PropertyMutability::RW,
+          [](const Config& config) {
+              return config.get<QDQ_OPTIMIZATION>();
+          }}},
         {ov::intel_npu::turbo.name(),
          {_backends->isCommandQueueExtSupported(),
           ov::PropertyMutability::RW,
@@ -613,6 +619,15 @@ void Plugin::reset_compiler_dependent_properties() const {
             std::get<0>(_properties[ov::intel_npu::compiler_dynamic_quantization.name()]) = false;  // mark unsupported
         }
     }
+    // NPU_QDQ_OPTIMIZATION
+    // publish only if the active compiler version is at least 7.5, unpublish otherwise
+    if (_properties.find(ov::intel_npu::qdq_optimization.name()) != _properties.end()) {
+        if (active_compiler_version >= ICOMPILER_MAKE_VERSION(7, 5)) {
+            std::get<0>(_properties[ov::intel_npu::qdq_optimization.name()]) = true;  // mark supported
+        } else {
+            std::get<0>(_properties[ov::intel_npu::qdq_optimization.name()]) = false;  // mark unsupported
+        }
+    }
 }
 
 void Plugin::set_property(const ov::AnyMap& properties) {

Original file line number	Diff line number	Diff line change
`@@ -337,4 +337,5 @@ void regmodule_properties(py::module m) {`
`337`	`337`	`wrap_property_RW(m_intel_npu, ov::intel_npu::bypass_umd_caching, "bypass_umd_caching");`
`338`	`338`	`wrap_property_RW(m_intel_npu, ov::intel_npu::defer_weights_load, "defer_weights_load");`
`339`	`339`	`wrap_property_RW(m_intel_npu, ov::intel_npu::compiler_dynamic_quantization, "compiler_dynamic_quantization");`
	`340`	`+ wrap_property_RW(m_intel_npu, ov::intel_npu::qdq_optimization, "qdq_optimization");`
`340`	`341`	`}`
Original file line number	Diff line number	Diff line change
`@@ -25,6 +25,7 @@ void intel_npu::registerCompilerOptions(OptionsDesc& desc) {`
`25`	`25`	`desc.add<DYNAMIC_SHAPE_TO_STATIC>();`
`26`	`26`	`desc.add<EXECUTION_MODE_HINT>();`
`27`	`27`	`desc.add<COMPILER_DYNAMIC_QUANTIZATION>();`
	`28`	`+ desc.add<QDQ_OPTIMIZATION>();`
`28`	`29`	`}`
`29`	`30`
`30`	`31`	`//`