NPUW: F16 interconnect (#27069)

dmatveev · web-flow · commit 49d078d42b9d · 2025-03-11T16:11:44.000Z
### Details:
- Introduce a new option that lowers cross-subgraph connections to f16 when
they are f32; this should reduce intermediate memory by 2x
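- Controlled by `NPUW_F16IC`; the default registered in this diff is `false`, so it has to be enabled explicitly (and can be turned off again if it causes issues)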

### Tickets:
 - E-142363
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -48,6 +48,7 @@ DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims
 DEFINE_OPT(NPUW_SLICE_OUT, bool, false, npuw::partitioning::slice_out, CompileTime);
 DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
 DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime);
+DEFINE_OPT(NPUW_F16IC, bool, false, npuw::partitioning::f16_interconnect, CompileTime);
 DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, CompileTime);
 DEFINE_OPT(NPUW_SPATIAL_DYN, bool, true, npuw::partitioning::spatial_dyn, CompileTime);
 DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -234,6 +234,14 @@ static constexpr ov::Property<std::size_t> spatial_nway{"NPUW_SPATIAL_NWAY"};
  */
 static constexpr ov::Property<bool> spatial_dyn{"NPUW_SPATIAL_DYN"};
 
+/**
+ * @brief
+ * Type: boolean
+ * Force subgraph interconnect tensors to f16 precision if those are in f32
+ * Default value: false
+ */
+static constexpr ov::Property<bool> f16_interconnect{"NPUW_F16IC"};
+
 /**
  * @brief
  * Type: boolean
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -34,6 +34,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_SPATIAL_NWAY>();
     desc.add<NPUW_SPATIAL_DYN>();
     desc.add<NPUW_HOST_GATHER>();
+    desc.add<NPUW_F16IC>();
     desc.add<NPUW_DCOFF_TYPE>();
     desc.add<NPUW_DCOFF_SCALE>();
     desc.add<NPUW_FUNCALL_FOR_ALL>();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -1427,6 +1427,7 @@ void ov::npuw::CompiledModel::implement_properties() {
                           BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
                           BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
                           BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
+                          BIND(npuw::partitioning::f16_interconnect, NPUW_F16IC),
                           BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
                           BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
                           BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -327,12 +327,38 @@ class Partitioner {
 private:
     FunctionPipelineType func_pipeline_type;
     ::intel_npu::Config& cfg;
+
+    std::size_t m_f16ic_counter = 0u;
+
+    std::shared_ptr<ov::Node> new_f16ic_cvt(ov::Output<ov::Node> out, ov::element::Type type);
 };
 
+std::shared_ptr<ov::Node> Partitioner::new_f16ic_cvt(ov::Output<ov::Node> out, ov::element::Type type) {
+    // Creates a Convert of `out` to `type` named "Convert_f16ic_<N>", where N is
+    // the running m_f16ic_counter value, so every Convert made here gets its own name.
+    //
+    // These Converts are added on activations (cross-subgraph connections) when
+    // the model is being cut, so they may end up in different individual submodels
+    // rather than in the one flat original model. This, in turn, may cause naming
+    // collisions between the newly added Converts and, for example, the Converts
+    // that were already present in the original model. A substantial part of the
+    // FOLDing algorithm still relies on operation names (Operation bank matching),
+    // and this is where it broke: on a clashed name, one Convert was mistaken for
+    // another, resulting in broken match banks and the failed "all_ok" assert.
+    //
+    // The code below works around the issue by forcing these Convert names to be unique.
+    // There is no guarantee such names never occur in the original model(s), but it is unlikely.
+    auto new_src = std::make_shared<ov::op::v0::Convert>(out, type);
+    new_src->set_friendly_name("Convert_f16ic_" + std::to_string(m_f16ic_counter++));
+    return new_src;
+}
+
 void Partitioner::identifySubgraphs() {
     LOG_INFO("Identifying subgraphs for model " << model->get_friendly_name() << "...");
     LOG_BLOCK();
 
+    const bool connect_in_f16 = cfg.get<::intel_npu::NPUW_F16IC>();
+
     using namespace ov::npuw;
     std::vector<ov::npuw::Group>& partitions = ens.groups;
 
@@ -407,7 +433,7 @@ void Partitioner::identifySubgraphs() {
             input_mapping[orig_node] = orig_node;
             return orig_node;
         };
-        auto parameter_from = [&input_mapping](ov::Output<ov::Node> output) {
+        auto parameter_from = [&input_mapping, connect_in_f16](ov::Output<ov::Node> output) {
             auto orig_node = output.get_node_shared_ptr();
             auto it = input_mapping.find(orig_node);
             if (it != input_mapping.end()) {
@@ -428,8 +454,14 @@ void Partitioner::identifySubgraphs() {
                 LOG_VERB("Found bound value in " << output << ", substituting it with " << new_const);
             } else {
                 // OK, actually introduce a parameter, cache it, and return.
-                auto new_param =
-                    std::make_shared<ov::op::v0::Parameter>(output.get_element_type(), output.get_partial_shape());
+                // Lower the parameter precision here, if required.
+                // Note: doing so REQUIRES a Convert node to be present here
+                // to maintain graph contracts. See handling where parameter_from is called.
+                auto otype = output.get_element_type();
+                if (otype == ov::element::f32 && connect_in_f16) {
+                    otype = ov::element::f16;
+                }
+                auto new_param = std::make_shared<ov::op::v0::Parameter>(otype, output.get_partial_shape());
                 result = std::static_pointer_cast<ov::Node>(new_param);
             }
             input_mapping[orig_node] = result;
@@ -495,8 +527,22 @@ void Partitioner::identifySubgraphs() {
                         // Can't use input_node here directly since parameter_from converts
                         // ov::Node to Output<Node> which some layers don't support by default.
                         auto new_param = parameter_from(input_desc.get_source_output());
-                        ov::copy_runtime_info(input_node, new_param);
-                        input_desc.replace_source_output(new_param);
+
+                        std::shared_ptr<ov::Node> new_src;
+                        if (new_param->get_element_type() != input_desc.get_element_type()) {
+                            // This is the only case where types may not match
+                            NPUW_ASSERT(input_desc.get_element_type() == ov::element::f32);
+                            NPUW_ASSERT(new_param->get_element_type() == ov::element::f16);
+                            NPUW_ASSERT(connect_in_f16);
+                            new_src = new_f16ic_cvt(new_param, ov::element::f32);
+                            LOG_DEBUG("Added F16IC Param Convert " << new_src << " on top of " << new_param << " for "
+                                                                   << input_desc);
+                        } else {
+                            new_src = new_param;
+                        }
+                        NPUW_ASSERT(new_src);
+                        ov::copy_runtime_info(input_node, new_src);  // NB: Still not sure why do this
+                        input_desc.replace_source_output(new_src);
                     }
                 }  // if (is..)
             }      // for (inputs)
@@ -654,7 +700,14 @@ void Partitioner::identifySubgraphs() {
                         num_optimized_out++;
                         LOG_VERB("Discarding " << output_desc << " -- optimized out!");
                     } else {
-                        auto new_result = std::make_shared<ov::op::v0::Result>(output_desc);
+                        // Register a new Result. Optionally, lower it to f16
+                        ov::Output<ov::Node> result_src = output_desc;
+                        if (output_desc.get_element_type() == ov::element::f32 && connect_in_f16) {
+                            auto new_cvt = new_f16ic_cvt(output_desc, ov::element::f16);
+                            LOG_DEBUG("Added F16IC Result Convert " << new_cvt << " on top of " << output_desc);
+                            result_src = new_cvt;
+                        }
+                        auto new_result = std::make_shared<ov::op::v0::Result>(result_src);
                         result_cache[output_layer_ptr] = LinkPtrFrom{this_group_idx, new_result};
 
                         ov::copy_runtime_info(output_desc.get_node_shared_ptr(), new_result);