From 1a17147bb9daa2f21d074532edbd9fb4bff251d5 Mon Sep 17 00:00:00 2001
From: Kadian
Date: Tue, 6 Aug 2024 14:49:46 +0100
Subject: [PATCH] Adding a new dcoff pattern

---
// Reviewer notes (this area between "---" and the first "diff --git" is
// ignored when the patch is applied):
//
// What the patch does, as far as the hunks below show:
//  * dcoff.hpp: declares the MatcherPass subclass DCOFFPassCWAI3.
//  * dcoff.cpp: implements its constructor. The matcher is rooted at the
//    outer Convert of Convert(Multiply(Convert(paramA), paramC)). The callback
//    acts only when paramA is i4 and paramC is f16 or f32:
//      - paramA's element type is set to dcoff_type;
//      - in DCOffMode::CAST_SCALE (dcoff_type is asserted to be f16) the
//        mapping scales[paramC] = paramA is recorded through pref, every
//        consumer is detached from the outputs of the Multiply and of the
//        inner Convert, and input 0 of the root Convert is rewired to paramA.
//    The callback always returns false.
//  * partitioning.cpp: adds one more rewr.add_matcher(...) call under the
//    "Phi-3 4SymW16A/GPTQ" comment in Partitioner::decompressionCutOff
//    (presumably registering DCOFFPassCWAI3 - the template argument is not
//    visible in this copy, confirm against upstream).
//
// NOTE(review): this copy looks extraction-damaged. Template argument lists
// appear to be missing (e.g. "opp::wrap_type()", "std::make_shared(cvt, ...)",
// "std::static_pointer_cast(...)", "std::shared_ptr node",
// "rewr.add_matcher(...)"), as does the author's e-mail address. They were
// NOT reconstructed here - restore them from the upstream commit.
// NOTE(review): line breaks were re-flowed from the "+" markers and agree
// with the hunk headers (3 / 83 / 5 added lines). Indentation, including that
// of the unchanged context lines, is a best guess - verify before applying.
 .../plugin/npuw/partitioning/partitioning.cpp |  3 +
 .../npuw/partitioning/patterns/dcoff.cpp      | 83 +++++++++++++++++++
 .../npuw/partitioning/patterns/dcoff.hpp      |  5 ++
 3 files changed, 91 insertions(+)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 954c868e4bc887..c3819fabf79f28 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -1622,6 +1622,9 @@ void Partitioner::decompressionCutOff(const std::string& func_name) {
         // LLaMaGPTQ
         rewr.add_matcher(dcoff_mode, dcoff_type, std::ref(params_to));
 
+        // Phi-3 4SymW16A/GPTQ
+        rewr.add_matcher(dcoff_mode, dcoff_type, std::ref(params_to));
+
         rewr.run_on_model(f._model);
 
         ov::pass::Validate val;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
index 156f22e59514b4..99ff93a606697a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp
@@ -512,6 +512,89 @@ DCOFFPassReshape2::DCOFFPassReshape2(DCOffMode dcoff_mode, ov::element::Type dco
     register_matcher(std::make_shared(reshpe, "TagDCOFFReshape2"), std::move(callback));
 }
 
+// Pattern: Phi-3 4SymW16A/GPTQ
+//
+//
+//   "tensor"      "scale"   >   "tensor"
+//    Param:A      Param:C   >    Param:A
+//      i4         f16|f32   >      f16
+//       :           :       >       :
+//       V           :       >       V
+//    Convert        :       >    Convert
+//    f16|f32        :       >      f32
+//       :           :       >
+//       V           V       >
+//        Multiply           >
+//         f16|f32           >
+//            :              >
+//            :              >
+//            V              >
+//         Convert
+
+DCOFFPassCWAI3::DCOFFPassCWAI3(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref) {
+    auto paramA = opp::wrap_type();
+    auto paramC = opp::wrap_type();
+    auto cvtA = opp::wrap_type({paramA});
+    auto mulply = opp::wrap_type({cvtA, paramC});
+    auto cvt = opp::wrap_type({mulply});
+
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+        auto matched_nodeA = node_to_output.at(paramA).get_node_shared_ptr();
+        auto matched_nodeC = node_to_output.at(paramC).get_node_shared_ptr();
+
+        NPUW_ASSERT(ov::op::util::is_parameter(matched_nodeA));
+        NPUW_ASSERT(ov::op::util::is_parameter(matched_nodeC));
+
+        auto matched_paramA = std::static_pointer_cast(matched_nodeA);
+        auto matched_paramC = std::static_pointer_cast(matched_nodeC);
+
+        if (ov::element::i4 == matched_paramA->get_element_type() &&
+            (ov::element::f16 == matched_paramC->get_element_type() ||
+             ov::element::f32 == matched_paramC->get_element_type())) {
+            LOG_DEBUG("Matched: " << matched_paramA << ", set element type to " << dcoff_type);
+            matched_paramA->set_element_type(dcoff_type);
+
+            if (dcoff_mode == DCOffMode::CAST_SCALE) {
+                NPUW_ASSERT(dcoff_type == ov::element::f16);
+
+                LOG_DEBUG("Matched: " << matched_paramC << " - parameter to remove...");
+                LOG_BLOCK();
+
+                // Extra transformation here:
+                // - remove Multiply + Intermediate Convert
+                // - mark paramC for removal.
+                // Convert will be reconnected to paramA directly.
+
+                // Record mapping from the Scale coeff parameter to the Real weight parameter
+                pref.get().scales[matched_paramC] = matched_paramA;
+
+                // Disconnect Multiply and Convert from their outputs
+                auto matched_mulply = node_to_output.at(mulply).get_node_shared_ptr();
+                auto matched_convrt = node_to_output.at(cvtA).get_node_shared_ptr();
+                auto drop_outputs = [](std::shared_ptr node) {
+                    for (auto&& node_outputs : node->outputs()) {
+                        for (auto&& node_reader_port : node_outputs.get_target_inputs()) {
+                            node_outputs.remove_target_input(node_reader_port);
+                        }
+                    }
+                };
+                LOG_DEBUG("Dropping the connections...");
+                drop_outputs(matched_mulply);
+                drop_outputs(matched_convrt);
+
+                LOG_DEBUG("Reconnecting the Root...");
+                auto matched_cvt = node_to_output.at(cvt).get_node_shared_ptr();
+                matched_cvt->input(0).replace_source_output(matched_paramA);
+            }
+            LOG_DEBUG("Done");
+        }
+        return false;  // root node hasn't changed
+    };
+
+    register_matcher(std::make_shared(cvt, "TagDCOFFPassCWAI3"), std::move(callback));
+}
+
 //------------------------------------------------------------------------------
 // Pattern: 4SymW16A for CWAI
 //
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
index 385a63370655e5..83ed575f8afd41 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.hpp
@@ -129,6 +129,11 @@ class DCOFFPassReshape2 : public ov::pass::MatcherPass {
     DCOFFPassReshape2(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref);
 };
 
+class DCOFFPassCWAI3 : public ov::pass::MatcherPass {
+public:
+    DCOFFPassCWAI3(DCOffMode dcoff_mode, ov::element::Type dcoff_type, DCOFFParamRef pref);
+};
+
 class CWAI1 : public ov::pass::MatcherPass {
 public:
     using CPtr = std::shared_ptr;