Skip to content

Commit f332cb4

Browse files
[NPUW] Dynamic stateful model support (#27651)
### Details: - *item1* - *...* ### Related PRs: - GenAI: *openvinotoolkit/openvino.genai#1240 ### Tickets: - *ticket-id* --------- Co-authored-by: TolyaTalamanov <anatoliy.talamanov@intel.com>
1 parent 09d1e50 commit f332cb4

13 files changed

+882
-5
lines changed

src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp

+24
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ struct OptionParser<int32_t> final {
7474
static int32_t parse(std::string_view val);
7575
};
7676

77+
template <>
78+
struct OptionParser<uint32_t> final {
79+
static uint32_t parse(std::string_view val);
80+
};
81+
7782
template <>
7883
struct OptionParser<int64_t> final {
7984
static int64_t parse(std::string_view val);
@@ -167,6 +172,25 @@ struct OptionPrinter final {
167172
}
168173
};
169174

175+
template <typename K, typename V>
176+
struct OptionPrinter<std::map<K, V>> final {
177+
static std::string toString(const std::map<K, V>& val) {
178+
std::stringstream ss;
179+
std::size_t counter = 0;
180+
std::size_t size = val.size();
181+
for (auto& [key, value] : val) {
182+
std::string key_str = OptionPrinter<K>::toString(key);
183+
std::string value_str = OptionPrinter<V>::toString(value);
184+
ss << key_str << ":" << value_str;
185+
if (counter < size - 1) {
186+
ss << ",";
187+
}
188+
++counter;
189+
}
190+
return ss.str();
191+
}
192+
};
193+
170194
// NB: boolean config option has values YES for true, NO for false
171195
template <>
172196
struct OptionPrinter<bool> final {

src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp

+107
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ namespace intel_npu {
1717
//
1818

1919
void registerNPUWOptions(OptionsDesc& desc);
20+
void registerNPUWLLMOptions(OptionsDesc& desc);
2021

2122
#define DEFINE_OPT(Name, Type, DefaultValue, PropertyKey, Mode) \
2223
struct Name final : OptionBase<Name, Type> { \
@@ -66,4 +67,110 @@ DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, CompileTime);
6667
DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, CompileTime);
6768
DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime);
6869
DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
70+
DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
71+
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
72+
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
73+
74+
namespace npuw {
75+
namespace llm {
76+
// Description of an LLM model passed through the NPUW_LLM_MODEL_DESC option.
struct ModelDesc {
    std::string type;          // value of the "type" field of the option
    std::string name_or_path;  // value of the "name_or_path" field of the option
    // Zero-initialized so that a default-initialized ModelDesc never exposes
    // an indeterminate value; the struct remains an aggregate.
    int num_key_value_heads = 0;
};
81+
enum class GenerateHint { FAST_COMPILE, BEST_PERF };
82+
} // namespace llm
83+
} // namespace npuw
84+
85+
// Compile-time option carrying the LLM model description.
// The textual form is "type:<type>,name_or_path:<name_or_path>,num_key_value_heads:<number>".
struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::llm::ModelDesc> {
    static std::string_view key() {
        return ov::intel_npu::npuw::llm::model_desc.name();
    }

    static constexpr std::string_view getTypeName() {
        return "::intel_npu::npuw::llm::ModelDesc";
    }

    static ::intel_npu::npuw::llm::ModelDesc defaultValue() {
        return {};
    }

    // Parses the comma-separated "key:value" list into a ModelDesc.
    // Missing "type" / "name_or_path" fields yield empty strings.
    // Throws (via OPENVINO_THROW) when "num_key_value_heads" is missing or is
    // not a valid integer, instead of leaking a bare std::stoi exception.
    static ::intel_npu::npuw::llm::ModelDesc parse(std::string_view val) {
        const std::map<std::string, std::string> fields =
            OptionParser<std::map<std::string, std::string>>::parse(val);

        // Read-only lookup: an absent key maps to an empty string without
        // inserting anything into the map.
        const auto field = [&fields](const std::string& name) {
            const auto it = fields.find(name);
            return it == fields.end() ? std::string{} : it->second;
        };

        ::intel_npu::npuw::llm::ModelDesc res;
        res.type = field("type");
        res.name_or_path = field("name_or_path");

        const std::string heads = field("num_key_value_heads");
        try {
            res.num_key_value_heads = std::stoi(heads);
        } catch (...) {
            OPENVINO_THROW("Invalid \"num_key_value_heads\" value in \"NPUW_LLM_MODEL_DESC\": '",
                           heads,
                           "'. An integer value is expected.");
        }
        return res;
    }

    // Serializes a ModelDesc back to the "key:value,..." form accepted by parse().
    static std::string toString(const ::intel_npu::npuw::llm::ModelDesc& val) {
        const std::map<std::string, std::string> fields{
            {"type", val.type},
            {"name_or_path", val.name_or_path},
            {"num_key_value_heads", std::to_string(val.num_key_value_heads)}};
        return OptionPrinter<std::map<std::string, std::string>>::toString(fields);
    }

    static OptionMode mode() {
        return OptionMode::CompileTime;
    }

    static bool isPublic() {
        return true;
    }
};
124+
125+
// Compile-time option selecting the hint for the generation stage.
// Accepted textual values are "FAST_COMPILE" and "BEST_PERF".
struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
    static std::string_view key() {
        return ov::intel_npu::npuw::llm::generate_hint.name();
    }

    static constexpr std::string_view getTypeName() {
        return "::intel_npu::npuw::llm::GenerateHint";
    }

    static ::intel_npu::npuw::llm::GenerateHint defaultValue() {
        return ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE;
    }

    // Converts the textual hint to the enum; any other text is rejected.
    static ::intel_npu::npuw::llm::GenerateHint parse(std::string_view val) {
        using Hint = ::intel_npu::npuw::llm::GenerateHint;

        const bool is_fast_compile = (val == "FAST_COMPILE");
        const bool is_best_perf = (val == "BEST_PERF");
        if (!is_fast_compile && !is_best_perf) {
            OPENVINO_THROW("Unsupported \"GENERATE_HINT\" provided: ",
                           val,
                           ". Please select either \"FAST_COMPILE\" or \"BEST_PERF\".");
        }
        return is_fast_compile ? Hint::FAST_COMPILE : Hint::BEST_PERF;
    }

    // Converts the enum back to its textual form; values outside the enum are rejected.
    static std::string toString(const ::intel_npu::npuw::llm::GenerateHint& val) {
        using Hint = ::intel_npu::npuw::llm::GenerateHint;

        std::string text;
        if (val == Hint::FAST_COMPILE) {
            text = "FAST_COMPILE";
        } else if (val == Hint::BEST_PERF) {
            text = "BEST_PERF";
        } else {
            OPENVINO_THROW("Can't convert provided \"GENERATE_HINT\" : ", int(val), " to string.");
        }
        return text;
    }

    static OptionMode mode() {
        return OptionMode::CompileTime;
    }

    static bool isPublic() {
        return true;
    }
};
69176
} // namespace intel_npu

src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp

+45
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,51 @@ static constexpr ov::Property<std::string> inputs_outputs{"NPUW_DUMP_IO"};
378378
static constexpr ov::Property<std::string> io_iters{"NPUW_DUMP_IO_ITERS"};
379379
} // namespace dump
380380

381+
namespace llm {
382+
/**
383+
* @brief
384+
* Type: bool.
385+
* Tell NPUW that you want to pass a dynamic stateful LLM model.
386+
* Default value: false.
387+
*/
388+
static constexpr ov::Property<bool> enabled{"NPUW_LLM"};
389+
390+
/**
391+
* @brief
392+
* Type: std::string, parsed into a std::map<std::string, std::string> of key:value pairs.
393+
* Tell NPUW about your LLM model. Use following structure for that:
394+
* "type:<type>,name_or_path:<name_or_path>,num_key_value_heads:<number>".
395+
* Default value: empty structure defined above.
396+
*/
397+
static constexpr ov::Property<std::string> model_desc{"NPUW_LLM_MODEL_DESC"};
398+
399+
/**
400+
* @brief
401+
* Type: uint32_t.
402+
* Tell NPUW your desirable max prompt length.
403+
* Default value: 1024.
404+
*/
405+
static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"};
406+
407+
/**
408+
* @brief
409+
* Type: uint32_t.
410+
* Tell NPUW your desirable min response length.
411+
* Default value: 128.
412+
*/
413+
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};
414+
415+
/**
416+
* @brief
417+
* Type: std::string.
418+
* Tell NPUW the preferable hint for the generation stage, which leads to the use of the optimal configuration for it.
419+
* Possible values: "FAST_COMPILE", "BEST_PERF".
420+
* Default value: "FAST_COMPILE".
421+
*/
422+
static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};
423+
424+
} // namespace llm
425+
381426
} // namespace npuw
382427
} // namespace intel_npu
383428
} // namespace ov

src/plugins/intel_npu/src/al/src/config/config.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,14 @@ int32_t OptionParser<int32_t>::parse(std::string_view val) {
5050
}
5151
}
5252

53+
// Parses an unsigned 32-bit option value.
// Throws (via OPENVINO_THROW) when the text is not a number, is negative,
// or does not fit into uint32_t.
uint32_t OptionParser<uint32_t>::parse(std::string_view val) {
    // A string_view is not guaranteed to be null-terminated, so make an owned
    // copy before handing it to std::stoul.
    const std::string text(val);

    // std::stoul accepts a leading minus sign and silently wraps the result,
    // so negative input has to be rejected explicitly.
    bool valid = text.find('-') == std::string::npos;

    unsigned long parsed = 0;
    if (valid) {
        try {
            parsed = std::stoul(text);
        } catch (...) {
            valid = false;
        }
    }

    // Where unsigned long is wider than 32 bits the value must survive a
    // round trip through uint32_t, otherwise it would be truncated silently.
    const uint32_t narrowed = static_cast<uint32_t>(parsed);
    if (!valid || static_cast<unsigned long>(narrowed) != parsed) {
        // OPENVINO_THROW concatenates its arguments; it does not expand "%s".
        OPENVINO_THROW("Value '", text, "' is not a valid UINT32 option");
    }
    return narrowed;
}
60+
5361
int64_t OptionParser<int64_t>::parse(std::string_view val) {
5462
try {
5563
return std::stoll(val.data());

src/plugins/intel_npu/src/al/src/config/npuw.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,11 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
5454
desc.add<NPUW_DUMP_IO_ITERS>();
5555
#endif
5656
}
57+
58+
// Registers the NPUW LLM options (enable flag, model description,
// prompt/response length limits and generation hint) in the given
// options description.
void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
    desc.add<NPUW_LLM>();

    desc.add<NPUW_LLM_MODEL_DESC>();

    desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
    desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();

    desc.add<NPUW_LLM_GENERATE_HINT>();
}

src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

+25-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "intel_npu/config/config.hpp"
2929
#include "intel_npu/config/npuw.hpp"
3030
#include "intel_npu/npuw_private_properties.hpp"
31+
#include "llm_compiled_model.hpp"
3132
#include "openvino/runtime/device_id_parser.hpp"
3233
#include "openvino/runtime/internal_properties.hpp"
3334
#include "openvino/runtime/properties.hpp"
@@ -85,10 +86,33 @@ ov::npuw::DeviceProperties get_properties_per_device(const std::shared_ptr<const
8586
} // namespace npuw
8687
} // namespace ov
8788

89+
std::shared_ptr<ov::npuw::ICompiledModel> ov::npuw::ICompiledModel::create(
90+
const std::shared_ptr<ov::Model>& model,
91+
const std::shared_ptr<const ov::IPlugin>& plugin,
92+
const ov::AnyMap& properties) {
93+
LOG_INFO("Choosing which NPUW CompiledModel to create");
94+
LOG_BLOCK();
95+
std::shared_ptr<ov::npuw::ICompiledModel> compiled_model;
96+
auto use_llm_key = ov::intel_npu::npuw::llm::enabled.name();
97+
if (properties.count(use_llm_key) && properties.at(use_llm_key).as<bool>() == true) {
98+
LOG_INFO("ov::npuw::LLMCompiledModel will be created.");
99+
compiled_model = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, properties);
100+
} else {
101+
LOG_INFO("ov::npuw::CompiledModel will be created.");
102+
compiled_model = std::make_shared<ov::npuw::CompiledModel>(model, plugin, properties);
103+
}
104+
LOG_INFO("Done");
105+
return compiled_model;
106+
}
107+
108+
// Forwards the model and the plugin to the ov::ICompiledModel base; no
// NPUW-specific state is set up here.
ov::npuw::ICompiledModel::ICompiledModel(const std::shared_ptr<ov::Model>& model,
                                         const std::shared_ptr<const ov::IPlugin>& plugin)
    : ov::ICompiledModel(model, plugin) {
}
111+
88112
ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
89113
const std::shared_ptr<const ov::IPlugin>& plugin,
90114
const ov::AnyMap& properties)
91-
: ov::ICompiledModel(model, plugin),
115+
: ov::npuw::ICompiledModel(model, plugin),
92116
m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
93117
m_cfg(m_options_desc),
94118
m_name(model->get_friendly_name()),

src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp

+8-2
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,16 @@ class Plugin;
2222

2323
namespace ov {
2424
namespace npuw {
25+
class ICompiledModel : public ov::ICompiledModel {
26+
public:
27+
static std::shared_ptr<ov::npuw::ICompiledModel> create(const std::shared_ptr<ov::Model>& model,
28+
const std::shared_ptr<const ov::IPlugin>& plugin,
29+
const ov::AnyMap& properties);
30+
ICompiledModel(const std::shared_ptr<ov::Model>& model, const std::shared_ptr<const ov::IPlugin>& plugin);
31+
};
2532

2633
class InferRequest;
27-
28-
class CompiledModel : public ov::ICompiledModel {
34+
class CompiledModel : public ov::npuw::ICompiledModel {
2935
using DevList = std::vector<std::string>;
3036
using GetPropertiesMap =
3137
std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>;

0 commit comments

Comments
 (0)