[CPU]Check runtime_options from IR model (#27765)

zhangYiIntel · web-flow · commit 09d1e50b131e · 2024-11-30T11:38:37.000Z
### Details: - *Check `runtime_options` from IR model* - *Set `KV_CACHE_PRECISION` & `DYNAMIC_QUANTIZATION_GROUP_SIZE` from `runtime_options` of IR model* - Example IR model with `runtime_options` - #27778 to releases/2024/5 ``` <rt_info> <runtime_options> <KV_CACHE_PRECISION value="f16" /> </runtime_options> </rt_info> ``` ### Tickets: - *CVS-157571*
diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
@@ -460,5 +460,15 @@ void Config::updateProperties() {
     _config.insert({ov::hint::num_requests.name(), std::to_string(hintNumRequests)});
 }
 
+void Config::applyRtInfo(const std::shared_ptr<const ov::Model>& model) {  // apply rt_info "runtime_options" overrides
+    if (model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) {  // only if the key exists
+        this->kvCachePrecision = model->get_rt_info<ov::element::Type>({"runtime_options", ov::hint::kv_cache_precision.name()});
+    }  // key absent: kvCachePrecision is left unchanged
+    if (model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) {
+        this->fcDynamicQuantizationGroupSize =  // rt_info entry is read as uint64_t
+            model->get_rt_info<uint64_t>({"runtime_options", ov::hint::dynamic_quantization_group_size.name()});
+    }  // key absent: fcDynamicQuantizationGroupSize is left unchanged
+}
+
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
@@ -106,6 +106,8 @@ struct Config {
 
     void updateProperties();
 
+    void applyRtInfo(const std::shared_ptr<const ov::Model>& model);
+
     std::map<std::string, std::string> _config;
 
     int modelPreferThreads = -1;
diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
@@ -247,6 +247,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
     // update the props after the perf mode translated to configs
     // TODO: Clarify the behavior of SetConfig method. Skip eng_config or not?
     Config conf = engConfig;
+    conf.applyRtInfo(cloned_model);
     conf.readProperties(config, modelType);
 
     Transformations transformations(cloned_model, conf);
@@ -520,6 +521,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr<const ov::Model>&
 
     Config conf = engConfig;
     Config::ModelType modelType = getModelType(model);
+    conf.applyRtInfo(model);
     conf.readProperties(config, modelType);
 
     auto context = std::make_shared<GraphContext>(conf, fake_w_cache, false);
@@ -575,7 +577,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& model_str
 
     Config conf = engConfig;
     Config::ModelType modelType = getModelType(model);
-
+    conf.applyRtInfo(model);
     // check ov::loaded_from_cache property and erase it to avoid exception in readProperties.
     auto _config = config;
     const auto& it = _config.find(ov::loaded_from_cache.name());
diff --git a/src/plugins/intel_cpu/src/plugin.h b/src/plugins/intel_cpu/src/plugin.h
@@ -50,7 +50,6 @@ class Plugin : public ov::IPlugin {
 
     void get_performance_streams(Config& config, const std::shared_ptr<ov::Model>& model) const;
     void calculate_streams(Config& conf, const std::shared_ptr<ov::Model>& model, bool imported = false) const;
-
     Config engConfig;
     /* Explicily configured streams have higher priority than performance hints.
        So track if streams is set explicitly (not auto-configured) */
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
@@ -327,4 +327,35 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPUExecutionDevice) {
     ASSERT_EQ(value.as<std::string>(), "CPU");
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptions) {  // rt_info options reach the compiled model
+    ov::Core ie;
+    ov::Any type;  // receives the kv_cache_precision property
+    ov::Any size;  // receives the dynamic_quantization_group_size property
+    ov::CompiledModel compiledModel;
+    model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());  // rt_info: f16 kv cache
+    model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
+    OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName));  // no compile-time config passed
+    OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
+    OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
+    ASSERT_EQ(type.as<ov::element::Type>(), ov::element::f16);  // matches the rt_info value "f16"
+    ASSERT_EQ(size.as<uint64_t>(), 0);  // matches the rt_info value "0"
+}
+
+TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCompileConfig) {
+    ov::Core ie;
+    ov::Any type;  // receives the kv_cache_precision property
+    ov::Any size;  // receives the dynamic_quantization_group_size property
+    ov::CompiledModel compiledModel;
+    model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());  // rt_info: f16 kv cache
+    model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
+    ov::AnyMap config;  // compile-time properties that conflict with the rt_info values set above
+    config[ov::hint::kv_cache_precision.name()] = "u8";
+    config[ov::hint::dynamic_quantization_group_size.name()] = "16";
+    OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName, config));
+    OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
+    OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
+    ASSERT_EQ(type.as<ov::element::Type>(), ov::element::u8);  // compile config value expected, not rt_info "f16"
+    ASSERT_EQ(size.as<uint64_t>(), 16);  // compile config value expected, not rt_info "0"
+}
+
 } // namespace