 #include "openvino/util/common_util.hpp"
 #include "partitioning/patterns/opt.hpp"
 #include "plugin.hpp"
-#include "serialization.hpp"
 #include "unfold_sync_infer_request.hpp"
 #include "util.hpp"

 #include "intel_npu/config/npuw.hpp"
 #include "intel_npu/npuw_private_properties.hpp"
 #include "llm_compiled_model.hpp"
+#include "openvino/core/rt_info/weightless_caching_attributes.hpp"
 #include "openvino/runtime/device_id_parser.hpp"
 #include "openvino/runtime/internal_properties.hpp"
 #include "openvino/runtime/properties.hpp"
@@ -206,6 +206,9 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
         }
     }

+    // Store original constants' offset for serialization purposes
+    store_const_offsets(model);
+
     auto partitioning = getPartitioning(model, m_cfg);
     m_total_stat.gflops = partitioning.total_gflops;
     m_total_stat.ops = partitioning.total_ops;
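Note: store_const_offsets() runs on the original model, before getPartitioning() rewrites it, so the recorded Constant data pointers still match the tensors that later land in the submodel closures. Neither the method's declaration nor the map it fills appears in this diff; judging by the m_const_to_offset.insert({data_ptr, offset}) call at the bottom of the file, the header side would look roughly like this (an assumption, not the actual compiled_model.hpp):

```cpp
// Assumed additions to ov::npuw::CompiledModel in compiled_model.hpp (not part of this diff).
#include <cstddef>
#include <memory>
#include <unordered_map>

#include "openvino/core/model.hpp"

class CompiledModel /* : public ov::npuw::ICompiledModel */ {
private:
    // Records each original Constant's offset inside the weights file, keyed by its
    // data pointer, so the weightless serialization path can reference weights by offset.
    void store_const_offsets(const std::shared_ptr<ov::Model>& model);

    std::unordered_map<const void*, std::size_t> m_const_to_offset;
};
```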
@@ -509,7 +512,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
     LOG_DEBUG("CompiledModel is being deserialized, skipping the full constructor flow...");
 }

-void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream) const {
+void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream,
+                                                           const ov::npuw::s11n::Context& ctx) const {
     using namespace ov::npuw::s11n;

     LOG_DEBUG("Serializing CompiledModelDesc...");
@@ -526,37 +530,58 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream)

     write(stream, spatial);

-    write(stream, scales);
-    write(stream, zerops);
     write(stream, is_remote);
-
-    // NOTE: for closure only serialize uids - full flow
     write(stream, closure_uid);

-    // Some tensors might be present in CPU closure already - need to serialize as is
-    // FIXME: When weightless serialization is introduced, this should be handled differently
-    write(stream, closure.size());
-    std::vector<ov::Tensor> cpu_closures;
-    std::vector<std::size_t> cpu_closure_ids;
-    for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
-        if (closure_uid[cidx] == -1) {  // CPU closure, not in the bank
-            cpu_closure_ids.push_back(cidx);
-            cpu_closures.push_back(closure[cidx]);
+    if (ctx.is_weightless) {
+        write_weightless(stream, scales, ctx);
+        write_weightless(stream, zerops, ctx);
+
+        write(stream, closure.size());
+        std::vector<ov::Tensor> cpu_closures;
+        std::vector<std::size_t> cpu_closure_ids;
+        std::vector<ov::npuw::weights::LazyTensor> non_cpu_tensors;
+        std::vector<std::size_t> non_cpu_tensors_ids;
+        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
+            if (closure_uid[cidx] == -1) {  // CPU closure
+                cpu_closure_ids.push_back(cidx);
+                cpu_closures.push_back(closure[cidx]);
+            } else {
+                non_cpu_tensors_ids.push_back(cidx);
+                non_cpu_tensors.push_back(lazy_closure[cidx]);  // must be there
+            }
         }
-    }

-    write(stream, cpu_closure_ids);
+        write(stream, cpu_closure_ids);
+        write_weightless(stream, cpu_closures, ctx);
+        write(stream, non_cpu_tensors_ids);
+        write(stream, non_cpu_tensors);
+    } else {
+        write(stream, scales);
+        write(stream, zerops);
+
+        write(stream, closure.size());
+        std::vector<ov::Tensor> cpu_closures;
+        std::vector<std::size_t> cpu_closure_ids;
+        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
+            if (closure_uid[cidx] == -1) {  // CPU closure, not in the bank
+                cpu_closure_ids.push_back(cidx);
+                cpu_closures.push_back(closure[cidx]);
+            }
+        }

-    for (const auto& tensor : cpu_closures) {
-        write(stream, tensor);
-    }
+        write(stream, cpu_closure_ids);

-    // FIXME: support weightless flow!
+        for (const auto& tensor : cpu_closures) {
+            write(stream, tensor);
+        }
+    }

     LOG_DEBUG("DONE.");
 }

-void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream) {
+void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream,
+                                                             const ov::npuw::s11n::Weights& weights) {
     using namespace ov::npuw::s11n;

     LOG_DEBUG("Deserializing CompiledModelDesc...");
@@ -573,25 +598,57 @@ void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& strea

     read(stream, spatial);

-    read(stream, scales);
-    read(stream, zerops);
     read(stream, is_remote);
-
-    // NOTE: for closure only deserialize uids - full flow
     read(stream, closure_uid);

-    // Some tensors might be present in CPU closure already - need to deserialize as is
-    // FIXME: When weightless serialization is introduced, this should be handled differently
-    std::size_t closure_size = 0;
-    read(stream, closure_size);
-    std::vector<std::size_t> cpu_closure_ids;
-    read(stream, cpu_closure_ids);
-    closure.resize(closure_size);
-    for (const auto& cidx : cpu_closure_ids) {
-        read(stream, closure[cidx]);
-    }
+    if (weights) {
+        read_weightless(stream, scales, weights);
+        read_weightless(stream, zerops, weights);
+
+        std::size_t closure_size = 0;
+        read(stream, closure_size);
+        closure.resize(closure_size);
+        lazy_closure.resize(closure_size);
+
+        std::vector<std::size_t> cpu_closure_ids;
+        read(stream, cpu_closure_ids);
+
+        std::vector<ov::Tensor> cpu_closures;
+        read_weightless(stream, cpu_closures, weights);
+        std::size_t tidx = 0;
+        for (const auto& idx : cpu_closure_ids) {
+            closure[idx] = std::move(cpu_closures[tidx++]);
+        }
+
+        std::vector<std::size_t> non_cpu_tensors_ids;
+        read(stream, non_cpu_tensors_ids);

-    // FIXME: support weightless flow!
+        std::vector<ov::npuw::weights::LazyTensor> non_cpu_tensors;
+        read(stream, non_cpu_tensors);
+        std::size_t ltidx = 0;
+        for (const auto& idx : non_cpu_tensors_ids) {
+            lazy_closure[idx] = std::move(non_cpu_tensors[ltidx++]);
+        }
+
+        // Also read weights into LazyTensors
+        for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
+            if (closure_uid[cidx] != -1) {  // previously registered before serialization
+                lazy_closure[cidx].read_weight(weights);
+            }
+        }
+    } else {
+        read(stream, scales);
+        read(stream, zerops);
+
+        std::size_t closure_size = 0;
+        read(stream, closure_size);
+        std::vector<std::size_t> cpu_closure_ids;
+        read(stream, cpu_closure_ids);
+        closure.resize(closure_size);
+        for (const auto& cidx : cpu_closure_ids) {
+            read(stream, closure[cidx]);
+        }
+    }

     LOG_DEBUG("DONE.");
 }
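The read side mirrors this: with a weights buffer supplied, scales, zero points and CPU closures are re-materialized via read_weightless(), LazyTensors are restored for the bank-owned entries and re-attached to the mapped weights through LazyTensor::read_weight(); without a buffer, the old embedded-tensor path runs. The Weights type is not defined here; since deserialize() below assigns it a SharedBuffer over a MappedMemory, an alias along these lines would fit (an inference, not the actual declaration):

```cpp
// Assumed read-side counterparts; the real declarations are elsewhere in NPUW.
#include <istream>
#include <memory>
#include <vector>

#include "openvino/runtime/aligned_buffer.hpp"
#include "openvino/runtime/tensor.hpp"

namespace ov::npuw::s11n {

// CompiledModel::deserialize() stores a SharedBuffer<std::shared_ptr<MappedMemory>>
// in this, so a shared AlignedBuffer pointer is the natural fit.
using Weights = std::shared_ptr<ov::AlignedBuffer>;

// Reads the metadata written by write_weightless() and rebuilds each tensor from the
// corresponding range of the memory-mapped weights file.
void read_weightless(std::istream& stream, std::vector<ov::Tensor>& tensors, const Weights& weights);

}  // namespace ov::npuw::s11n
```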
@@ -627,6 +684,17 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
         write_any(stream, p.second);
     }

+    // Write flow identifier
+    bool is_weightless = true;
+    if (m_non_npuw_props.count(ov::cache_mode.name()) &&
+        m_non_npuw_props.at(ov::cache_mode.name()).as<CacheMode>() == CacheMode::OPTIMIZE_SPEED) {
+        is_weightless = false;
+    }
+    write(stream, is_weightless);
+
+    // Create weightless context
+    Context ctx(is_weightless, m_const_to_offset);
+
     // Serialize compiled submodels
     write(stream, m_compiled_submodels.size());
     for (const auto& subm : m_compiled_submodels) {
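With this, the exported blob is weightless by default and weights are embedded only when the user explicitly requested ov::CacheMode::OPTIMIZE_SPEED. From the application side the switch is the standard caching property; a minimal sketch (file names are placeholders, and whether a given model takes the NPUW path depends on the plugin configuration):

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    core.set_property(ov::cache_dir("npu_cache"));

    // Default: a weightless blob - constants are referenced by their offsets in the
    // original weights file instead of being copied into the cached blob.
    auto weightless = core.compile_model("model.xml", "NPU");

    // Opt out: embed the weights into the blob for a self-contained cache entry.
    auto self_contained =
        core.compile_model("model.xml", "NPU", ov::cache_mode(ov::CacheMode::OPTIMIZE_SPEED));
    return 0;
}
```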
@@ -644,15 +712,16 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
             write(stream, false);
         }
         // Write the rest of the submodel desc
-        subm.serialize(stream);
+        subm.serialize(stream, ctx);
     }

     LOG_INFO("Done.");
 }

 std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
     std::istream& stream,
-    const std::shared_ptr<const ov::IPlugin>& plugin) {
+    const std::shared_ptr<const ov::IPlugin>& plugin,
+    const ov::AnyMap& properties) {
     LOG_INFO("Deserializing CompiledModel...");
     LOG_BLOCK();
@@ -699,6 +768,26 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
     }
     compiled->implement_properties();

+    // Read flow identifier
+    bool is_weightless = false;
+    read(stream, is_weightless);
+
+    // Initialize weights stream if weightless flow
+    std::string weights_path;
+    if (is_weightless) {
+        NPUW_ASSERT(properties.find(ov::weights_path.name()) != properties.end() &&
+                    "There is no WEIGHTS_PATH set in properties but the blob was exported as weightless!");
+        weights_path = properties.at(ov::weights_path.name()).as<std::string>();
+    }
+
+    ov::npuw::s11n::Weights weights = nullptr;
+    if (is_weightless) {
+        auto mapped_memory = ov::load_mmap_object(weights_path);
+        weights = std::make_shared<ov::SharedBuffer<std::shared_ptr<ov::MappedMemory>>>(mapped_memory->data(),
+                                                                                        mapped_memory->size(),
+                                                                                        mapped_memory);
+    }
+
     // Deserialize compiled submodels
     std::size_t subm_size = 0;
     read(stream, subm_size);
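On import, a weightless blob needs the original weights file, which is what the new properties argument carries: ov::weights_path must reach deserialize(), otherwise the assert above fires. A hedged sketch of the calling side (paths are placeholders; the exact property plumbing through the NPU plugin's import_model is not shown in this diff):

```cpp
#include <fstream>

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    std::ifstream blob("model.blob", std::ios::binary);

    // For a weightless blob the original weights file must be supplied; without it the
    // "There is no WEIGHTS_PATH set in properties..." assert above is expected to fire.
    auto compiled = core.import_model(blob, "NPU", {{ov::weights_path.name(), "model.bin"}});
    return 0;
}
```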
@@ -719,7 +808,7 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
             plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]);
         }
         compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx;
-        compiled->m_compiled_submodels[i].deserialize(stream);
+        compiled->m_compiled_submodels[i].deserialize(stream, weights);
     }

     compiled->implement_properties();
@@ -730,6 +819,33 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
     return compiled;
 }

+void ov::npuw::CompiledModel::reconstruct_closure() {
+    for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
+        auto& comp_model_desc = m_compiled_submodels[idx];
+
+        // Skip optimized out and non-functions
+        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+            continue;
+        }
+
+        const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
+        auto& func_desc = m_compiled_submodels[real_idx];
+
+        // At this point closure size should have already been deserialized
+        NPUW_ASSERT(!comp_model_desc.closure.empty() && "Closure shouldn't be empty at this point!");
+        for (std::size_t cidx = 0; cidx < comp_model_desc.closure.size(); ++cidx) {
+            if (comp_model_desc.closure[cidx]) {
+                // host-side closure - already set, do nothing
+                NPUW_ASSERT(!comp_model_desc.is_remote[cidx]);
+                continue;
+            }
+            NPUW_ASSERT(comp_model_desc.closure_uid[cidx] != -1);
+            comp_model_desc.closure[cidx] =
+                m_weights_bank->get(comp_model_desc.closure_uid[cidx], *func_desc.device_it);
+        }
+    }
+}
+
 void ov::npuw::CompiledModel::finalize_weights_bank() {
     LOG_INFO("Finalizing weights bank...");
     // Register lazy tensors
@@ -785,29 +901,22 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
     LOG_INFO("Done.");
 }

-void ov::npuw::CompiledModel::reconstruct_closure() {
-    for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
-        auto& comp_model_desc = m_compiled_submodels[idx];
-
-        // Skip optimized out and non-functions
-        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
-            continue;
-        }
-
-        const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
-        auto& func_desc = m_compiled_submodels[real_idx];
-
-        // At this point closure size should have already been deserialized
-        NPUW_ASSERT(!comp_model_desc.closure.empty() && "Closure shouldn't be empty at this point!");
-        for (std::size_t cidx = 0; cidx < comp_model_desc.closure.size(); ++cidx) {
-            if (comp_model_desc.closure[cidx]) {
-                // host-side closure - already set, do nothing
-                NPUW_ASSERT(!comp_model_desc.is_remote[cidx]);
+void ov::npuw::CompiledModel::store_const_offsets(const std::shared_ptr<ov::Model>& model) {
+    for (auto&& node_ptr : model->get_ordered_ops()) {
+        if (ov::op::util::is_constant(node_ptr)) {
+            const auto& c = std::static_pointer_cast<ov::op::v0::Constant>(node_ptr);
+            auto rt_info = c->get_rt_info();
+            auto weightless_cache_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
+            if (weightless_cache_attr == rt_info.end()) {
                 continue;
             }
-            NPUW_ASSERT(comp_model_desc.closure_uid[cidx] != -1);
-            comp_model_desc.closure[cidx] =
-                m_weights_bank->get(comp_model_desc.closure_uid[cidx], *func_desc.device_it);
+            std::size_t offset = weightless_cache_attr->second.as<ov::WeightlessCacheAttribute>().bin_offset;
+            auto data_ptr = c->get_data_ptr();
+            auto inserted = m_const_to_offset.insert({data_ptr, offset});
+            if (!inserted.second) {
+                NPUW_ASSERT(inserted.first->second == offset &&
+                            "Model contains two constants with same pointer and different offset!");
+            }
         }
     }
 }
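store_const_offsets() relies on ov::WeightlessCacheAttribute being attached to the model's constants (typically when the model is read from IR), so every constant remembers the byte offset of its data in the .bin file. A self-contained illustration of the same lookup outside the plugin (the helper name is made up; an empty result simply means the model carries no such attributes):

```cpp
#include <cstddef>
#include <memory>
#include <unordered_map>

#include <openvino/core/model.hpp>
#include <openvino/core/rt_info/weightless_caching_attributes.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/util/op_types.hpp>

// Maps each Constant's data pointer to its offset in the original weights file,
// mirroring what CompiledModel::store_const_offsets() does above.
std::unordered_map<const void*, std::size_t> map_const_offsets(const std::shared_ptr<ov::Model>& model) {
    std::unordered_map<const void*, std::size_t> result;
    for (auto&& node : model->get_ordered_ops()) {
        if (!ov::op::util::is_constant(node)) {
            continue;
        }
        auto constant = std::static_pointer_cast<ov::op::v0::Constant>(node);
        const auto& rt_info = constant->get_rt_info();
        auto attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
        if (attr != rt_info.end()) {
            result.emplace(constant->get_data_ptr(), attr->second.as<ov::WeightlessCacheAttribute>().bin_offset);
        }
    }
    return result;
}
```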