@@ -327,12 +327,38 @@ class Partitioner {
327
327
private:
328
328
FunctionPipelineType func_pipeline_type;
329
329
::intel_npu::Config& cfg;
330
+
331
+ std::size_t m_f16ic_counter = 0u ;
332
+
333
+ std::shared_ptr<ov::Node> new_f16ic_cvt (ov::Output<ov::Node> out, ov::element::Type type);
330
334
};
331
335
336
+ std::shared_ptr<ov::Node> Partitioner::new_f16ic_cvt (ov::Output<ov::Node> out, ov::element::Type type) {
337
+ // These Converts are added on activations (cross-subgraph connections) when
338
+ // the model is being cut. This may end up in Converts added to different
339
+ // individual submodels, rather than the one flat original model.
340
+ // This, in turn, may cause naming collisions between the newly added Converts
341
+ // and, for example, the Converts that was there in the original model.
342
+ // Since the substantial part of the FOLDing algorithm still relies on
343
+ // operation names (Operation bank matching), this is the point where
344
+ // it did break - based on the clashed name match, one Convert was mistakenly
345
+ // recognized as some other, resulting in the broken match banks and the failed
346
+ // "all_ok" assert.
347
+ //
348
+ // The below code workarounds the issue by forcing these Convert names be
349
+ // unique. Again, there's no guarantee we won't see such Convert names in the
350
+ // original model(s), but the probability is quite low here.
351
+ auto new_src = std::make_shared<ov::op::v0::Convert>(out, type);
352
+ new_src->set_friendly_name (" Convert_f16ic_" + std::to_string (m_f16ic_counter++));
353
+ return new_src;
354
+ }
355
+
332
356
void Partitioner::identifySubgraphs () {
333
357
LOG_INFO (" Identifying subgraphs for model " << model->get_friendly_name () << " ..." );
334
358
LOG_BLOCK ();
335
359
360
+ const bool connect_in_f16 = cfg.get <::intel_npu::NPUW_F16IC>();
361
+
336
362
using namespace ov ::npuw;
337
363
std::vector<ov::npuw::Group>& partitions = ens.groups ;
338
364
@@ -407,7 +433,7 @@ void Partitioner::identifySubgraphs() {
407
433
input_mapping[orig_node] = orig_node;
408
434
return orig_node;
409
435
};
410
- auto parameter_from = [&input_mapping](ov::Output<ov::Node> output) {
436
+ auto parameter_from = [&input_mapping, connect_in_f16 ](ov::Output<ov::Node> output) {
411
437
auto orig_node = output.get_node_shared_ptr ();
412
438
auto it = input_mapping.find (orig_node);
413
439
if (it != input_mapping.end ()) {
@@ -428,8 +454,14 @@ void Partitioner::identifySubgraphs() {
428
454
LOG_VERB (" Found bound value in " << output << " , substituting it with " << new_const);
429
455
} else {
430
456
// OK, actually introduce a parameter, cache it, and return.
431
- auto new_param =
432
- std::make_shared<ov::op::v0::Parameter>(output.get_element_type (), output.get_partial_shape ());
457
+ // Lower the parameter precision here, if required.
458
+ // Note: doing so REQUIRES a Convert node to be present here
459
+ // to maintain graph contracts. See handling where parameter_from is called.
460
+ auto otype = output.get_element_type ();
461
+ if (otype == ov::element::f32 && connect_in_f16) {
462
+ otype = ov::element::f16;
463
+ }
464
+ auto new_param = std::make_shared<ov::op::v0::Parameter>(otype, output.get_partial_shape ());
433
465
result = std::static_pointer_cast<ov::Node>(new_param);
434
466
}
435
467
input_mapping[orig_node] = result;
@@ -495,8 +527,22 @@ void Partitioner::identifySubgraphs() {
495
527
// Can't use input_node here directly since parameter_from converts
496
528
// ov::Node to Output<Node> which some layers don't support by default.
497
529
auto new_param = parameter_from (input_desc.get_source_output ());
498
- ov::copy_runtime_info (input_node, new_param);
499
- input_desc.replace_source_output (new_param);
530
+
531
+ std::shared_ptr<ov::Node> new_src;
532
+ if (new_param->get_element_type () != input_desc.get_element_type ()) {
533
+ // This is the only case where types may not match
534
+ NPUW_ASSERT (input_desc.get_element_type () == ov::element::f32);
535
+ NPUW_ASSERT (new_param->get_element_type () == ov::element::f16);
536
+ NPUW_ASSERT (connect_in_f16);
537
+ new_src = new_f16ic_cvt (new_param, ov::element::f32);
538
+ LOG_DEBUG (" Added F16IC Param Convert " << new_src << " on top of " << new_param << " for "
539
+ << input_desc);
540
+ } else {
541
+ new_src = new_param;
542
+ }
543
+ NPUW_ASSERT (new_src);
544
+ ov::copy_runtime_info (input_node, new_src); // NB: Still not sure why do this
545
+ input_desc.replace_source_output (new_src);
500
546
}
501
547
} // if (is..)
502
548
} // for (inputs)
@@ -654,7 +700,14 @@ void Partitioner::identifySubgraphs() {
654
700
num_optimized_out++;
655
701
LOG_VERB (" Discarding " << output_desc << " -- optimized out!" );
656
702
} else {
657
- auto new_result = std::make_shared<ov::op::v0::Result>(output_desc);
703
+ // Register a new Result. Optionally, lower it to f16
704
+ ov::Output<ov::Node> result_src = output_desc;
705
+ if (output_desc.get_element_type () == ov::element::f32 && connect_in_f16) {
706
+ auto new_cvt = new_f16ic_cvt (output_desc, ov::element::f16);
707
+ LOG_DEBUG (" Added F16IC Result Convert " << new_cvt << " on top of " << output_desc);
708
+ result_src = new_cvt;
709
+ }
710
+ auto new_result = std::make_shared<ov::op::v0::Result>(result_src);
658
711
result_cache[output_layer_ptr] = LinkPtrFrom{this_group_idx, new_result};
659
712
660
713
ov::copy_runtime_info (output_desc.get_node_shared_ptr (), new_result);
0 commit comments