Skip to content

Commit 91ba886

Browse files
[NPUW] Weightless serialization (#28469)
Related to openvinotoolkit/openvino.genai#1635 --------- Co-authored-by: Dmitry Matveev <dmitry.matveev@intel.com>
1 parent 926e6a5 commit 91ba886

13 files changed

+630
-96
lines changed

src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp

+21
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,27 @@ struct NUM_STREAMS final : OptionBase<NUM_STREAMS, ov::streams::Num> {
198198
}
199199
};
200200

201+
//
202+
// WEIGHTS_PATH
203+
//
204+
struct WEIGHTS_PATH final : OptionBase<WEIGHTS_PATH, std::string> {
205+
static std::string_view key() {
206+
return ov::weights_path.name();
207+
}
208+
209+
static constexpr std::string_view getTypeName() {
210+
return "std::string";
211+
}
212+
213+
static std::string defaultValue() {
214+
return "";
215+
}
216+
217+
static OptionMode mode() {
218+
return OptionMode::CompileTime;
219+
}
220+
};
221+
201222
//
202223
// ENABLE_CPU_PINNING
203224
//

src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

+170-61
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
#include "openvino/util/common_util.hpp"
2222
#include "partitioning/patterns/opt.hpp"
2323
#include "plugin.hpp"
24-
#include "serialization.hpp"
2524
#include "unfold_sync_infer_request.hpp"
2625
#include "util.hpp"
2726

@@ -30,6 +29,7 @@
3029
#include "intel_npu/config/npuw.hpp"
3130
#include "intel_npu/npuw_private_properties.hpp"
3231
#include "llm_compiled_model.hpp"
32+
#include "openvino/core/rt_info/weightless_caching_attributes.hpp"
3333
#include "openvino/runtime/device_id_parser.hpp"
3434
#include "openvino/runtime/internal_properties.hpp"
3535
#include "openvino/runtime/properties.hpp"
@@ -206,6 +206,9 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
206206
}
207207
}
208208

209+
// Store original constants' offset for serialization purposes
210+
store_const_offsets(model);
211+
209212
auto partitioning = getPartitioning(model, m_cfg);
210213
m_total_stat.gflops = partitioning.total_gflops;
211214
m_total_stat.ops = partitioning.total_ops;
@@ -509,7 +512,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
509512
LOG_DEBUG("CompiledModel is being deserialized, skipping the full constructor flow...");
510513
}
511514

512-
void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream) const {
515+
void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream,
516+
const ov::npuw::s11n::Context& ctx) const {
513517
using namespace ov::npuw::s11n;
514518

515519
LOG_DEBUG("Serializing CompiledModelDesc...");
@@ -526,37 +530,58 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream)
526530

527531
write(stream, spatial);
528532

529-
write(stream, scales);
530-
write(stream, zerops);
531533
write(stream, is_remote);
532-
533-
// NOTE: for closure only serialize uids - full flow
534534
write(stream, closure_uid);
535535

536-
// Some tensors might be present in CPU closure already - need to serialize as is
537-
// FIXME: When weightless serialization is introduced, this should be handled differently
538-
write(stream, closure.size());
539-
std::vector<ov::Tensor> cpu_closures;
540-
std::vector<std::size_t> cpu_closure_ids;
541-
for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
542-
if (closure_uid[cidx] == -1) { // CPU closure, not in the bank
543-
cpu_closure_ids.push_back(cidx);
544-
cpu_closures.push_back(closure[cidx]);
536+
if (ctx.is_weightless) {
537+
write_weightless(stream, scales, ctx);
538+
write_weightless(stream, zerops, ctx);
539+
540+
write(stream, closure.size());
541+
std::vector<ov::Tensor> cpu_closures;
542+
std::vector<std::size_t> cpu_closure_ids;
543+
std::vector<ov::npuw::weights::LazyTensor> non_cpu_tensors;
544+
std::vector<std::size_t> non_cpu_tensors_ids;
545+
for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
546+
if (closure_uid[cidx] == -1) { // CPU closure
547+
cpu_closure_ids.push_back(cidx);
548+
cpu_closures.push_back(closure[cidx]);
549+
} else {
550+
non_cpu_tensors_ids.push_back(cidx);
551+
non_cpu_tensors.push_back(lazy_closure[cidx]); // must be there
552+
}
545553
}
546-
}
547554

548-
write(stream, cpu_closure_ids);
555+
write(stream, cpu_closure_ids);
556+
write_weightless(stream, cpu_closures, ctx);
557+
write(stream, non_cpu_tensors_ids);
558+
write(stream, non_cpu_tensors);
559+
} else {
560+
write(stream, scales);
561+
write(stream, zerops);
562+
563+
write(stream, closure.size());
564+
std::vector<ov::Tensor> cpu_closures;
565+
std::vector<std::size_t> cpu_closure_ids;
566+
for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
567+
if (closure_uid[cidx] == -1) { // CPU closure, not in the bank
568+
cpu_closure_ids.push_back(cidx);
569+
cpu_closures.push_back(closure[cidx]);
570+
}
571+
}
549572

550-
for (const auto& tensor : cpu_closures) {
551-
write(stream, tensor);
552-
}
573+
write(stream, cpu_closure_ids);
553574

554-
// FIXME: support weightless flow!
575+
for (const auto& tensor : cpu_closures) {
576+
write(stream, tensor);
577+
}
578+
}
555579

556580
LOG_DEBUG("DONE.");
557581
}
558582

559-
void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream) {
583+
void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream,
584+
const ov::npuw::s11n::Weights& weights) {
560585
using namespace ov::npuw::s11n;
561586

562587
LOG_DEBUG("Deserializing CompiledModelDesc...");
@@ -573,25 +598,57 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea
573598

574599
read(stream, spatial);
575600

576-
read(stream, scales);
577-
read(stream, zerops);
578601
read(stream, is_remote);
579-
580-
// NOTE: for closure only deserialize uids - full flow
581602
read(stream, closure_uid);
582603

583-
// Some tensors might be present in CPU closure already - need to deserialize as is
584-
// FIXME: When weightless serialization is introduced, this should be handled differently
585-
std::size_t closure_size = 0;
586-
read(stream, closure_size);
587-
std::vector<std::size_t> cpu_closure_ids;
588-
read(stream, cpu_closure_ids);
589-
closure.resize(closure_size);
590-
for (const auto& cidx : cpu_closure_ids) {
591-
read(stream, closure[cidx]);
592-
}
604+
if (weights) {
605+
read_weightless(stream, scales, weights);
606+
read_weightless(stream, zerops, weights);
607+
608+
std::size_t closure_size = 0;
609+
read(stream, closure_size);
610+
closure.resize(closure_size);
611+
lazy_closure.resize(closure_size);
612+
613+
std::vector<std::size_t> cpu_closure_ids;
614+
read(stream, cpu_closure_ids);
615+
616+
std::vector<ov::Tensor> cpu_closures;
617+
read_weightless(stream, cpu_closures, weights);
618+
std::size_t tidx = 0;
619+
for (const auto& idx : cpu_closure_ids) {
620+
closure[idx] = std::move(cpu_closures[tidx++]);
621+
}
622+
623+
std::vector<std::size_t> non_cpu_tensors_ids;
624+
read(stream, non_cpu_tensors_ids);
593625

594-
// FIXME: support weightless flow!
626+
std::vector<ov::npuw::weights::LazyTensor> non_cpu_tensors;
627+
read(stream, non_cpu_tensors);
628+
std::size_t ltidx = 0;
629+
for (const auto& idx : non_cpu_tensors_ids) {
630+
lazy_closure[idx] = std::move(non_cpu_tensors[ltidx++]);
631+
}
632+
633+
// Also read weights into LazyTensors
634+
for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
635+
if (closure_uid[cidx] != -1) { // previously registered before serialization
636+
lazy_closure[cidx].read_weight(weights);
637+
}
638+
}
639+
} else {
640+
read(stream, scales);
641+
read(stream, zerops);
642+
643+
std::size_t closure_size = 0;
644+
read(stream, closure_size);
645+
std::vector<std::size_t> cpu_closure_ids;
646+
read(stream, cpu_closure_ids);
647+
closure.resize(closure_size);
648+
for (const auto& cidx : cpu_closure_ids) {
649+
read(stream, closure[cidx]);
650+
}
651+
}
595652

596653
LOG_DEBUG("DONE.");
597654
}
@@ -627,6 +684,17 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
627684
write_any(stream, p.second);
628685
}
629686

687+
// Write flow identifier
688+
bool is_weightless = true;
689+
if (m_non_npuw_props.count(ov::cache_mode.name()) &&
690+
m_non_npuw_props.at(ov::cache_mode.name()).as<CacheMode>() == CacheMode::OPTIMIZE_SPEED) {
691+
is_weightless = false;
692+
}
693+
write(stream, is_weightless);
694+
695+
// Create weightless context
696+
Context ctx(is_weightless, m_const_to_offset);
697+
630698
// Serialize compiled submodels
631699
write(stream, m_compiled_submodels.size());
632700
for (const auto& subm : m_compiled_submodels) {
@@ -644,15 +712,16 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
644712
write(stream, false);
645713
}
646714
// Write the rest of the submodel desc
647-
subm.serialize(stream);
715+
subm.serialize(stream, ctx);
648716
}
649717

650718
LOG_INFO("Done.");
651719
}
652720

653721
std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
654722
std::istream& stream,
655-
const std::shared_ptr<const ov::IPlugin>& plugin) {
723+
const std::shared_ptr<const ov::IPlugin>& plugin,
724+
const ov::AnyMap& properties) {
656725
LOG_INFO("Deserializing CompiledModel...");
657726
LOG_BLOCK();
658727

@@ -699,6 +768,26 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
699768
}
700769
compiled->implement_properties();
701770

771+
// Read flow identifier
772+
bool is_weightless = false;
773+
read(stream, is_weightless);
774+
775+
// Initialize weights stream if weightless flow
776+
std::string weights_path;
777+
if (is_weightless) {
778+
NPUW_ASSERT(properties.find(ov::weights_path.name()) != properties.end() &&
779+
"There is no WEIGHTS_PATH set in properties but the blob was exported as weightless!");
780+
weights_path = properties.at(ov::weights_path.name()).as<std::string>();
781+
}
782+
783+
ov::npuw::s11n::Weights weights = nullptr;
784+
if (is_weightless) {
785+
auto mapped_memory = ov::load_mmap_object(weights_path);
786+
weights = std::make_shared<ov::SharedBuffer<std::shared_ptr<ov::MappedMemory>>>(mapped_memory->data(),
787+
mapped_memory->size(),
788+
mapped_memory);
789+
}
790+
702791
// Deserialize compiled submodels
703792
std::size_t subm_size = 0;
704793
read(stream, subm_size);
@@ -719,7 +808,7 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
719808
plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]);
720809
}
721810
compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx;
722-
compiled->m_compiled_submodels[i].deserialize(stream);
811+
compiled->m_compiled_submodels[i].deserialize(stream, weights);
723812
}
724813

725814
compiled->implement_properties();
@@ -730,6 +819,33 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
730819
return compiled;
731820
}
732821

822+
void ov::npuw::CompiledModel::reconstruct_closure() {
823+
for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
824+
auto& comp_model_desc = m_compiled_submodels[idx];
825+
826+
// Skip optimized out and non-functions
827+
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
828+
continue;
829+
}
830+
831+
const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
832+
auto& func_desc = m_compiled_submodels[real_idx];
833+
834+
// At this point closure size should have already been deserialized
835+
NPUW_ASSERT(!comp_model_desc.closure.empty() && "Closure shouldn't be empty at this point!");
836+
for (std::size_t cidx = 0; cidx < comp_model_desc.closure.size(); ++cidx) {
837+
if (comp_model_desc.closure[cidx]) {
838+
// host-side closure - already set, do nothing
839+
NPUW_ASSERT(!comp_model_desc.is_remote[cidx]);
840+
continue;
841+
}
842+
NPUW_ASSERT(comp_model_desc.closure_uid[cidx] != -1);
843+
comp_model_desc.closure[cidx] =
844+
m_weights_bank->get(comp_model_desc.closure_uid[cidx], *func_desc.device_it);
845+
}
846+
}
847+
}
848+
733849
void ov::npuw::CompiledModel::finalize_weights_bank() {
734850
LOG_INFO("Finalizing weights bank...");
735851
// Register lazy tensors
@@ -785,29 +901,22 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
785901
LOG_INFO("Done.");
786902
}
787903

788-
void ov::npuw::CompiledModel::reconstruct_closure() {
789-
for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
790-
auto& comp_model_desc = m_compiled_submodels[idx];
791-
792-
// Skip optimized out and non-functions
793-
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
794-
continue;
795-
}
796-
797-
const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
798-
auto& func_desc = m_compiled_submodels[real_idx];
799-
800-
// At this point closure size should have already been deserialized
801-
NPUW_ASSERT(!comp_model_desc.closure.empty() && "Closure shouldn't be empty at this point!");
802-
for (std::size_t cidx = 0; cidx < comp_model_desc.closure.size(); ++cidx) {
803-
if (comp_model_desc.closure[cidx]) {
804-
// host-side closure - already set, do nothing
805-
NPUW_ASSERT(!comp_model_desc.is_remote[cidx]);
904+
void ov::npuw::CompiledModel::store_const_offsets(const std::shared_ptr<ov::Model>& model) {
905+
for (auto&& node_ptr : model->get_ordered_ops()) {
906+
if (ov::op::util::is_constant(node_ptr)) {
907+
const auto& c = std::static_pointer_cast<ov::op::v0::Constant>(node_ptr);
908+
auto rt_info = c->get_rt_info();
909+
auto weightless_cache_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
910+
if (weightless_cache_attr == rt_info.end()) {
806911
continue;
807912
}
808-
NPUW_ASSERT(comp_model_desc.closure_uid[cidx] != -1);
809-
comp_model_desc.closure[cidx] =
810-
m_weights_bank->get(comp_model_desc.closure_uid[cidx], *func_desc.device_it);
913+
std::size_t offset = weightless_cache_attr->second.as<ov::WeightlessCacheAttribute>().bin_offset;
914+
auto data_ptr = c->get_data_ptr();
915+
auto inserted = m_const_to_offset.insert({data_ptr, offset});
916+
if (!inserted.second) {
917+
NPUW_ASSERT(inserted.first->second == offset &&
918+
"Model contains two constants with same pointer and different offset!");
919+
}
811920
}
812921
}
813922
}

0 commit comments

Comments (0)