Skip to content

Commit 49d078d

Browse files
authored
NPUW: F16 interconnect (#27069)
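### Details: - Introduce a new option (NPUW_F16IC) to lower cross-subgraph connections to f16 if those are f32; this should reduce interim memory by 2x - Enabled by default, can be disabled if it causes issues (note: in this diff the option is declared with default `false` and documented as "Default value: false" - to be confirmed) ### Tickets: - E-142363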
1 parent a05287f commit 49d078d

File tree

5 files changed

+70
-6
lines changed

5 files changed

+70
-6
lines changed

src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims
4848
DEFINE_OPT(NPUW_SLICE_OUT, bool, false, npuw::partitioning::slice_out, CompileTime);
4949
DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
5050
DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime);
51+
DEFINE_OPT(NPUW_F16IC, bool, false, npuw::partitioning::f16_interconnect, CompileTime);
5152
DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, CompileTime);
5253
DEFINE_OPT(NPUW_SPATIAL_DYN, bool, true, npuw::partitioning::spatial_dyn, CompileTime);
5354
DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);

src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp

+8
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,14 @@ static constexpr ov::Property<std::size_t> spatial_nway{"NPUW_SPATIAL_NWAY"};
234234
*/
235235
static constexpr ov::Property<bool> spatial_dyn{"NPUW_SPATIAL_DYN"};
236236

237+
/**
238+
* @brief
239+
* Type: boolean
240+
* Force subgraph interconnect tensors to f16 precision if those are in f32
241+
* Default value: false
242+
*/
243+
static constexpr ov::Property<bool> f16_interconnect{"NPUW_F16IC"};
244+
237245
/**
238246
* @brief
239247
* Type: boolean

src/plugins/intel_npu/src/al/src/config/npuw.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
3434
desc.add<NPUW_SPATIAL_NWAY>();
3535
desc.add<NPUW_SPATIAL_DYN>();
3636
desc.add<NPUW_HOST_GATHER>();
37+
desc.add<NPUW_F16IC>();
3738
desc.add<NPUW_DCOFF_TYPE>();
3839
desc.add<NPUW_DCOFF_SCALE>();
3940
desc.add<NPUW_FUNCALL_FOR_ALL>();

src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -1427,6 +1427,7 @@ void ov::npuw::CompiledModel::implement_properties() {
14271427
BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
14281428
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
14291429
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
1430+
BIND(npuw::partitioning::f16_interconnect, NPUW_F16IC),
14301431
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
14311432
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
14321433
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),

src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp

+59-6
Original file line numberDiff line numberDiff line change
@@ -327,12 +327,38 @@ class Partitioner {
327327
private:
328328
FunctionPipelineType func_pipeline_type;
329329
::intel_npu::Config& cfg;
330+
331+
std::size_t m_f16ic_counter = 0u;
332+
333+
std::shared_ptr<ov::Node> new_f16ic_cvt(ov::Output<ov::Node> out, ov::element::Type type);
330334
};
331335

336+
// Creates a Convert node which casts `out` to `type` and assigns it a
// friendly name of the form "Convert_f16ic_<N>", where N is taken from (and
// then increments) this Partitioner's m_f16ic_counter. Returns the new node.
std::shared_ptr<ov::Node> Partitioner::new_f16ic_cvt(ov::Output<ov::Node> out, ov::element::Type type) {
337+
// These Converts are added on activations (cross-subgraph connections) when
338+
// the model is being cut. This may end up in Converts added to different
339+
// individual submodels, rather than the one flat original model.
340+
// This, in turn, may cause naming collisions between the newly added Converts
341+
// and, for example, the Converts that were there in the original model.
342+
// Since a substantial part of the FOLDing algorithm still relies on
343+
// operation names (Operation bank matching), this is the point where
344+
// it did break - based on the clashed name match, one Convert was mistakenly
345+
// recognized as some other one, resulting in broken match banks and a failed
346+
// "all_ok" assert.
347+
//
348+
// The code below works around the issue by forcing these Convert names to be
349+
// unique. Again, there's no guarantee we won't see such Convert names in the
350+
// original model(s), but the probability is quite low here.
351+
auto new_src = std::make_shared<ov::op::v0::Convert>(out, type);
352+
new_src->set_friendly_name("Convert_f16ic_" + std::to_string(m_f16ic_counter++));
353+
return new_src;
354+
}
355+
332356
void Partitioner::identifySubgraphs() {
333357
LOG_INFO("Identifying subgraphs for model " << model->get_friendly_name() << "...");
334358
LOG_BLOCK();
335359

360+
const bool connect_in_f16 = cfg.get<::intel_npu::NPUW_F16IC>();
361+
336362
using namespace ov::npuw;
337363
std::vector<ov::npuw::Group>& partitions = ens.groups;
338364

@@ -407,7 +433,7 @@ void Partitioner::identifySubgraphs() {
407433
input_mapping[orig_node] = orig_node;
408434
return orig_node;
409435
};
410-
auto parameter_from = [&input_mapping](ov::Output<ov::Node> output) {
436+
auto parameter_from = [&input_mapping, connect_in_f16](ov::Output<ov::Node> output) {
411437
auto orig_node = output.get_node_shared_ptr();
412438
auto it = input_mapping.find(orig_node);
413439
if (it != input_mapping.end()) {
@@ -428,8 +454,14 @@ void Partitioner::identifySubgraphs() {
428454
LOG_VERB("Found bound value in " << output << ", substituting it with " << new_const);
429455
} else {
430456
// OK, actually introduce a parameter, cache it, and return.
431-
auto new_param =
432-
std::make_shared<ov::op::v0::Parameter>(output.get_element_type(), output.get_partial_shape());
457+
// Lower the parameter precision here, if required.
458+
// Note: doing so REQUIRES a Convert node to be present here
459+
// to maintain graph contracts. See handling where parameter_from is called.
460+
auto otype = output.get_element_type();
461+
if (otype == ov::element::f32 && connect_in_f16) {
462+
otype = ov::element::f16;
463+
}
464+
auto new_param = std::make_shared<ov::op::v0::Parameter>(otype, output.get_partial_shape());
433465
result = std::static_pointer_cast<ov::Node>(new_param);
434466
}
435467
input_mapping[orig_node] = result;
@@ -495,8 +527,22 @@ void Partitioner::identifySubgraphs() {
495527
// Can't use input_node here directly since parameter_from converts
496528
// ov::Node to Output<Node> which some layers don't support by default.
497529
auto new_param = parameter_from(input_desc.get_source_output());
498-
ov::copy_runtime_info(input_node, new_param);
499-
input_desc.replace_source_output(new_param);
530+
531+
std::shared_ptr<ov::Node> new_src;
532+
if (new_param->get_element_type() != input_desc.get_element_type()) {
533+
// This is the only case where types may not match
534+
NPUW_ASSERT(input_desc.get_element_type() == ov::element::f32);
535+
NPUW_ASSERT(new_param->get_element_type() == ov::element::f16);
536+
NPUW_ASSERT(connect_in_f16);
537+
new_src = new_f16ic_cvt(new_param, ov::element::f32);
538+
LOG_DEBUG("Added F16IC Param Convert " << new_src << " on top of " << new_param << " for "
539+
<< input_desc);
540+
} else {
541+
new_src = new_param;
542+
}
543+
NPUW_ASSERT(new_src);
544+
ov::copy_runtime_info(input_node, new_src); // NB: Still not sure why do this
545+
input_desc.replace_source_output(new_src);
500546
}
501547
} // if (is..)
502548
} // for (inputs)
@@ -654,7 +700,14 @@ void Partitioner::identifySubgraphs() {
654700
num_optimized_out++;
655701
LOG_VERB("Discarding " << output_desc << " -- optimized out!");
656702
} else {
657-
auto new_result = std::make_shared<ov::op::v0::Result>(output_desc);
703+
// Register a new Result. Optionally, lower it to f16
704+
ov::Output<ov::Node> result_src = output_desc;
705+
if (output_desc.get_element_type() == ov::element::f32 && connect_in_f16) {
706+
auto new_cvt = new_f16ic_cvt(output_desc, ov::element::f16);
707+
LOG_DEBUG("Added F16IC Result Convert " << new_cvt << " on top of " << output_desc);
708+
result_src = new_cvt;
709+
}
710+
auto new_result = std::make_shared<ov::op::v0::Result>(result_src);
658711
result_cache[output_layer_ptr] = LinkPtrFrom{this_group_idx, new_result};
659712

660713
ov::copy_runtime_info(output_desc.get_node_shared_ptr(), new_result);

0 commit comments

Comments
 (0)