@@ -115,7 +115,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
115
115
116
116
// Initialize weights bank
117
117
const std::string weights_bank_opt = m_cfg.get <::intel_npu::NPUW_WEIGHTS_BANK>();
118
- m_weights_bank = ov::npuw::weights::bank (weights_bank_opt, plugin->get_core ());
118
+ const std::string wbank_alloc = m_cfg.get <::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
119
+ m_weights_bank = ov::npuw::weights::bank (weights_bank_opt, plugin->get_core (), wbank_alloc);
119
120
120
121
LOG_VERB (" *** Original model ***" );
121
122
const auto & orig_parameters = model->get_parameters ();
@@ -235,6 +236,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
235
236
} // for(ordered_subgraphs)
236
237
// NOTE(dm): there's a better way to do it, like we do in G-API backends.
237
238
239
+ m_update_required = m_cfg.get <::intel_npu::NPUW_FOLD>();
240
+
238
241
// Store mapping between manually splitted inputs/outputs
239
242
// to connect tensors between compiled submodels
240
243
m_submodels_input_to_prev_output = partitioning.input_to_prev_output ;
@@ -302,10 +305,11 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
302
305
m_compiled_submodels[id].host_gather = subgraph._host_gather ;
303
306
m_compiled_submodels[id].param_base = fcn_template._param_offset ;
304
307
m_compiled_submodels[id].closure = subgraph._closure ;
308
+ m_compiled_submodels[id].lazy_closure = subgraph._lazy_closure ;
305
309
m_compiled_submodels[id].scales = subgraph._scales ;
306
310
m_compiled_submodels[id].zerops = subgraph._zerops ;
307
- m_compiled_submodels[id].update_required . resize ( subgraph._closure . size (), false ) ;
308
- fill_weights_bank (id );
311
+ m_compiled_submodels[id].forced_to_fcall = subgraph._forced_to_fcall ;
312
+ m_compiled_submodels[id]. is_remote . resize (subgraph. _lazy_closure . size (), false );
309
313
} // if(!funcall)
310
314
311
315
if (!m_compiled_submodels[id].model && !m_compiled_submodels[id].replaced_by ) {
@@ -421,6 +425,9 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
421
425
}
422
426
}
423
427
428
+ // Finalize memory in closures and weight banks
429
+ finalize_weights_bank ();
430
+
424
431
// Print stats report when possible
425
432
{
426
433
LOG_INFO (" Initial device distribution:" );
@@ -434,24 +441,54 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
434
441
reset_io ();
435
442
}
436
443
437
- void ov::npuw::CompiledModel::fill_weights_bank (const std::size_t idx) {
438
- LOG_VERB (" Filling weights bank for Subgraph[" << idx << " ]..." );
439
- LOG_BLOCK ();
444
+ void ov::npuw::CompiledModel::finalize_weights_bank () {
445
+ // Register lazy tensors
446
+ for (std::size_t idx = 0 ; idx < m_compiled_submodels.size (); ++idx) {
447
+ auto & comp_model_desc = m_compiled_submodels[idx];
440
448
441
- NPUW_ASSERT (m_compiled_submodels[idx].replaced_by );
449
+ // Skip optimized out and non-functions
450
+ if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by ) {
451
+ return ;
452
+ }
442
453
443
- auto & comp_model_desc = m_compiled_submodels[idx];
454
+ const auto real_idx = comp_model_desc.replaced_by .value_or (idx);
455
+ auto & func_desc = m_compiled_submodels[real_idx];
444
456
445
- for (std::size_t cidx = 0u ; cidx < comp_model_desc.closure .size (); cidx++) {
446
- comp_model_desc.closure [cidx] = m_weights_bank->update (comp_model_desc.closure [cidx]);
447
- if (m_cfg.get <::intel_npu::NPUW_FOLD>()) {
448
- comp_model_desc.update_required [cidx] = true ;
449
- } else {
450
- comp_model_desc.update_required [cidx] = false ;
457
+ for (std::size_t tidx = 0 ; tidx < comp_model_desc.lazy_closure .size (); ++tidx) {
458
+ if (comp_model_desc.closure [tidx]) {
459
+ continue ; // host-side closure
460
+ }
461
+ m_weights_bank->registerLT (comp_model_desc.lazy_closure [tidx], *func_desc.device_it );
451
462
}
452
463
}
453
464
454
- LOG_VERB (" DONE" );
465
+ // Evaluate and allocate all LazyTensors inside the bank
466
+ m_weights_bank->evaluate_and_allocate ();
467
+
468
+ // Set evaluated and allocated ov::Tensors to closures
469
+ for (size_t idx = 0 ; idx < m_compiled_submodels.size (); ++idx) {
470
+ auto & comp_model_desc = m_compiled_submodels[idx];
471
+
472
+ // Skip optimized out and non-functions
473
+ if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by ) {
474
+ continue ;
475
+ }
476
+
477
+ const auto real_idx = comp_model_desc.replaced_by .value_or (idx);
478
+ auto & func_desc = m_compiled_submodels[real_idx];
479
+
480
+ for (std::size_t tidx = 0 ; tidx < comp_model_desc.lazy_closure .size (); ++tidx) {
481
+ if (comp_model_desc.closure [tidx]) {
482
+ // host-side closure - already set, do nothing
483
+ comp_model_desc.is_remote [tidx] = false ;
484
+ continue ;
485
+ }
486
+ const auto & lt = comp_model_desc.lazy_closure [tidx];
487
+ comp_model_desc.closure [tidx] = m_weights_bank->get (lt, *func_desc.device_it );
488
+ // FIXME: find a more reliable way to do so
489
+ comp_model_desc.is_remote [tidx] = m_weights_bank->is_remote (lt);
490
+ }
491
+ }
455
492
}
456
493
457
494
void ov::npuw::CompiledModel::remove_long_output_names (const std::shared_ptr<ov::Model>& model) {
@@ -748,7 +785,6 @@ void ov::npuw::CompiledModel::implement_properties() {
748
785
749
786
// 1.
750
787
// OV Public
751
- // ===============================================
752
788
m_prop_to_opt = {{ov::supported_properties.name (),
753
789
{ov::PropertyMutability::RO,
754
790
[&](const ::intel_npu::Config&) -> std::vector<PropertyName>& {
@@ -785,7 +821,6 @@ void ov::npuw::CompiledModel::implement_properties() {
785
821
return m_loaded_from_cache;
786
822
}}},
787
823
// OV Public Hints
788
- // =====================================================
789
824
{ov::hint::performance_mode.name (),
790
825
{ov::PropertyMutability::RO,
791
826
[&](const ::intel_npu::Config&) {
@@ -856,6 +891,7 @@ void ov::npuw::CompiledModel::implement_properties() {
856
891
BIND (npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
857
892
BIND (npuw::funcall_async, NPUW_FUNCALL_ASYNC),
858
893
BIND (npuw::weights_bank, NPUW_WEIGHTS_BANK),
894
+ BIND (npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
859
895
BIND (npuw::cache_dir, NPUW_CACHE_DIR),
860
896
BIND (npuw::accuracy::check, NPUW_ACC_CHECK),
861
897
BIND (npuw::accuracy::threshold, NPUW_ACC_THRESH),
0 commit comments