Skip to content

Commit 09d1e50

Browse files
authored
[CPU]Check runtime_options from IR model (#27765)
### Details: - *Check `runtime_options` from IR model* - *Set `KV_CACHE_PRECISION` & `DYNAMIC_QUANTIZATION_GROUP_SIZE` from `runtime_options` of IR model* - Example IR model with `runtime_options` - #27778 to releases/2024/5 ``` <rt_info> <runtime_options> <KV_CACHE_PRECISION value="f16" /> </runtime_options> </rt_info> ``` ### Tickets: - *CVS-157571*
1 parent f89b8de commit 09d1e50

File tree

5 files changed

+46
-2
lines changed

5 files changed

+46
-2
lines changed

src/plugins/intel_cpu/src/config.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -460,5 +460,15 @@ void Config::updateProperties() {
460460
_config.insert({ov::hint::num_requests.name(), std::to_string(hintNumRequests)});
461461
}
462462

463+
// Reads optional overrides from the model's rt_info section "runtime_options" and stores them in this Config.
// Two keys are looked up, each only if present in the model:
//   - ov::hint::kv_cache_precision.name()              -> kvCachePrecision (read as ov::element::Type)
//   - ov::hint::dynamic_quantization_group_size.name() -> fcDynamicQuantizationGroupSize (read as uint64_t)
// Keys that are absent leave the corresponding member untouched.
// The Plugin call sites shown in this change invoke this before readProperties(), so properties passed
// explicitly by the caller are processed after the rt_info values.
void Config::applyRtInfo(const std::shared_ptr<const ov::Model>& model) {
464+
if (model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) {
465+
this->kvCachePrecision = model->get_rt_info<ov::element::Type>({"runtime_options", ov::hint::kv_cache_precision.name()});
466+
}
467+
if (model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) {
468+
this->fcDynamicQuantizationGroupSize =
469+
model->get_rt_info<uint64_t>({"runtime_options", ov::hint::dynamic_quantization_group_size.name()});
470+
}
471+
}
472+
463473
} // namespace intel_cpu
464474
} // namespace ov

src/plugins/intel_cpu/src/config.h

+2
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ struct Config {
106106

107107
void updateProperties();
108108

109+
void applyRtInfo(const std::shared_ptr<const ov::Model>& model);
110+
109111
std::map<std::string, std::string> _config;
110112

111113
int modelPreferThreads = -1;

src/plugins/intel_cpu/src/plugin.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
247247
// update the props after the perf mode translated to configs
248248
// TODO: Clarify the behavior of SetConfig method. Skip eng_config or not?
249249
Config conf = engConfig;
250+
conf.applyRtInfo(cloned_model);
250251
conf.readProperties(config, modelType);
251252

252253
Transformations transformations(cloned_model, conf);
@@ -520,6 +521,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr<const ov::Model>&
520521

521522
Config conf = engConfig;
522523
Config::ModelType modelType = getModelType(model);
524+
conf.applyRtInfo(model);
523525
conf.readProperties(config, modelType);
524526

525527
auto context = std::make_shared<GraphContext>(conf, fake_w_cache, false);
@@ -575,7 +577,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& model_str
575577

576578
Config conf = engConfig;
577579
Config::ModelType modelType = getModelType(model);
578-
580+
conf.applyRtInfo(model);
579581
// check ov::loaded_from_cache property and erase it to avoid exception in readProperties.
580582
auto _config = config;
581583
const auto& it = _config.find(ov::loaded_from_cache.name());

src/plugins/intel_cpu/src/plugin.h

-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ class Plugin : public ov::IPlugin {
5050

5151
void get_performance_streams(Config& config, const std::shared_ptr<ov::Model>& model) const;
5252
void calculate_streams(Config& conf, const std::shared_ptr<ov::Model>& model, bool imported = false) const;
53-
5453
Config engConfig;
5554
/* Explicitly configured streams have higher priority than performance hints.
5655
So track if streams is set explicitly (not auto-configured) */

src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp

+31
Original file line numberDiff line numberDiff line change
@@ -327,4 +327,35 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPUExecutionDevice) {
327327
ASSERT_EQ(value.as<std::string>(), "CPU");
328328
}
329329

330+
// Checks that values stored in the model's "runtime_options" rt_info (kv_cache_precision = "f16",
// dynamic_quantization_group_size = "0") are reported by the compiled model's properties when
// compile_model() is called without an explicit config.
TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptions) {
331+
ov::Core ie;
332+
ov::Any type;
333+
ov::Any size;
334+
ov::CompiledModel compiledModel;
335+
model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
336+
model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
337+
OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName));
338+
OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
339+
OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
340+
ASSERT_EQ(type.as<ov::element::Type>(), ov::element::f16);
341+
ASSERT_EQ(size.as<uint64_t>(), 0);
342+
}
343+
344+
// Checks precedence: the model carries "runtime_options" rt_info (kv_cache_precision = "f16",
// dynamic_quantization_group_size = "0"), but compile_model() is also given an explicit config
// ("u8" and "16"). The compiled model is expected to report the explicitly configured values.
TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCompileConfig) {
345+
ov::Core ie;
346+
ov::Any type;
347+
ov::Any size;
348+
ov::CompiledModel compiledModel;
349+
model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
350+
model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
351+
ov::AnyMap config;
352+
config[ov::hint::kv_cache_precision.name()] = "u8";
353+
config[ov::hint::dynamic_quantization_group_size.name()] = "16";
354+
OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName, config));
355+
OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
356+
OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
357+
ASSERT_EQ(type.as<ov::element::Type>(), ov::element::u8);
358+
ASSERT_EQ(size.as<uint64_t>(), 16);
359+
}
360+
330361
} // namespace

0 commit comments

Comments
 (0)