
Commit aafb3fc

NPUW Hotfixes: Memory and L0 pipeline (#27826)
### Details:
- Keep tensors for the decompression cut-off in a host-side closure, not a lazy tensor, so they are not uploaded to the bank and detached after that
  - Otherwise this leads to 2x memory consumption and a subsequent crash
- Relaxed the requirements for enabling the unfolded execution, so it may still happen if there are single-call functions that require DCOFF (previously, having those would reject the unfolded path)

### Tickets:
- C-155523 (most likely related)

@smirnov-alexey please take care of the release branch cherry-pick
1 parent fc0b54e commit aafb3fc

2 files changed: +7 −8

src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

+3 −2
```diff
@@ -727,8 +727,9 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infe
     const auto num_submodels = m_compiled_submodels.size();
     for (std::size_t idx = 0u; idx < num_submodels; idx++) {
         const auto& comp_model_desc = m_compiled_submodels[idx];
-        if (!comp_model_desc.replaced_by.has_value()) {
-            // not a funcall, do nothing
+        if (!comp_model_desc.replaced_by.has_value() || comp_model_desc.forced_to_fcall) {
+            // not a funcall, do nothing, or a subgraph that was forced to funcall
+            // (a 1-call function) - skip
             continue;
         }
         const auto real_idx = comp_model_desc.replaced_by.value();
```
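Below is a minimal, self-contained sketch of the skip condition this hunk introduces (the struct and values are simplified stand-ins for illustration, not the actual ov::npuw types): subgraphs forced to a single-call funcall are now bypassed the same way plain, non-funcall subgraphs are.

```cpp
#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

// Simplified stand-in for the compiled-submodel descriptor used in the diff.
struct CompiledModelDesc {
    std::optional<std::size_t> replaced_by;  // set when the subgraph is a call to a function body
    bool forced_to_fcall = false;            // a 1-call function folded into funcall form
};

int main() {
    // Hypothetical contents of m_compiled_submodels, just to exercise the predicate.
    std::vector<CompiledModelDesc> submodels = {
        {std::nullopt, false},   // plain subgraph              -> skipped (as before)
        {std::size_t{0}, true},  // forced 1-call function      -> now also skipped
        {std::size_t{0}, false}, // regular multi-call function -> still processed
    };

    for (std::size_t idx = 0u; idx < submodels.size(); idx++) {
        const auto& comp_model_desc = submodels[idx];
        // Same predicate as the patched loop: skip non-funcalls and subgraphs
        // that were forced to a single-call funcall.
        if (!comp_model_desc.replaced_by.has_value() || comp_model_desc.forced_to_fcall) {
            continue;
        }
        std::cout << "setting up function " << comp_model_desc.replaced_by.value()
                  << " for submodel " << idx << "\n";
    }
    return 0;
}
```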

src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp

+4 −6
```diff
@@ -97,12 +97,10 @@ ClosureRemap build_remap(const Function& fbody, const DCOFFParams& params_to) {
             LOG_DEBUG("This is an OK parameter, will be kept");
             m.closure_remap.push_back(i - fbody._param_offset);
 
-            // Check if unpack is indeed required
-            const auto& type = param->get_element_type();
-            if (type == ov::element::i4 || type == ov::element::u4 || type == ov::element::i8 ||
-                type == ov::element::u8) {
-                m.weights_to_unpack.insert(i - fbody._param_offset);
-            }
+            // FIXME: type should be queried from a lazy tensor
+            // and compared against param->get_element_type()
+            // to decide 100%
+            m.weights_to_unpack.insert(i - fbody._param_offset);
         }
 
         // Process zero points for parameters
```
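As a minimal sketch of what this hunk changes (simplified stand-ins, not the real ov::element or DCOFF types): the parameter-type check that previously gated unpacking is dropped, every kept closure parameter is recorded in weights_to_unpack, and the FIXME notes that a precise decision should compare the lazy tensor's type against the parameter's type.

```cpp
#include <cstddef>
#include <set>

// Simplified stand-in for ov::element::Type_t, only for illustration.
enum class ElementType { i4, u4, i8, u8, f16, f32 };

// Old behavior: unpack only low-bit / quantized parameter types.
bool needs_unpack_old(ElementType type) {
    return type == ElementType::i4 || type == ElementType::u4 ||
           type == ElementType::i8 || type == ElementType::u8;
}

int main() {
    std::set<std::size_t> weights_to_unpack;
    const std::size_t param_offset = 1;  // stand-in for fbody._param_offset

    struct Param {
        std::size_t index;
        ElementType type;
    };
    const Param params[] = {{1, ElementType::i4}, {2, ElementType::f16}};

    for (const auto& p : params) {
        // New behavior: always record the parameter for unpacking; the FIXME in
        // the diff says the precise decision should compare the lazy tensor's
        // type with the parameter's type, which this sketch does not model.
        weights_to_unpack.insert(p.index - param_offset);
        (void)needs_unpack_old(p.type);  // old predicate, kept only for contrast
    }
    return static_cast<int>(weights_to_unpack.size());
}
```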
