Skip to content

Commit a8e776b

Browse files
authored
[Snippets][CPU] Added KVCacheMatcher check for LLM in MHATokenization (#28812)
### Details:
- Extracted the `is_LLM` check from the GPU plugin into the common transformations utilities so other plugins can reuse it
- Used the extracted `is_large_language_model` function in the LLM check inside MHATokenization in the CPU plugin

### Tickets:
- CVS-160999
1 parent ec9dfae commit a8e776b

File tree

4 files changed

+36
-38
lines changed

4 files changed

+36
-38
lines changed

src/common/transformations/include/transformations/utils/utils.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,8 @@ TRANSFORMATIONS_API bool constantIsEqualTo(const std::shared_ptr<ov::op::v0::Con
193193

194194
TRANSFORMATIONS_API bool has_f16_constants(const std::shared_ptr<const ov::Model>& function);
195195

196+
TRANSFORMATIONS_API bool is_large_language_model(const ov::Model& model);
197+
196198
/**
197199
* \brief Check if 'other_shape' can be broadcasted to 'ref_shape'
198200
*

src/common/transformations/src/transformations/utils/utils.cpp

+29
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,15 @@
1212
#include "openvino/core/validation_util.hpp"
1313
#include "openvino/op/add.hpp"
1414
#include "openvino/op/broadcast.hpp"
15+
#include "openvino/op/concat.hpp"
1516
#include "openvino/op/constant.hpp"
17+
#include "openvino/op/convert.hpp"
1618
#include "openvino/op/divide.hpp"
1719
#include "openvino/op/gather.hpp"
1820
#include "openvino/op/multiply.hpp"
21+
#include "openvino/op/paged_attention.hpp"
1922
#include "openvino/op/parameter.hpp"
23+
#include "openvino/op/read_value.hpp"
2024
#include "openvino/op/relu.hpp"
2125
#include "openvino/op/reshape.hpp"
2226
#include "openvino/op/shape_of.hpp"
@@ -25,6 +29,9 @@
2529
#include "openvino/op/tanh.hpp"
2630
#include "openvino/op/util/multi_subgraph_base.hpp"
2731
#include "openvino/op/util/shape_of_base.hpp"
32+
#include "openvino/pass/pattern/op/optional.hpp"
33+
#include "openvino/pass/pattern/op/or.hpp"
34+
#include "openvino/pass/pattern/op/wrap_type.hpp"
2835

2936
namespace ov {
3037
namespace op {
@@ -133,6 +140,28 @@ bool has_f16_constants(const std::shared_ptr<const ov::Model>& function) {
133140
return false;
134141
}
135142

143+
// Heuristic LLM detection: a model is treated as a large language model when it either
// contains a PagedAttentionExtension op or matches the stateful KV-cache pattern
//   ReadValue -> [Convert] -> [Gather(beam_idx)] -> [Convert] -> Concat -> [Convert] -> Assign
// (optional nodes in brackets). Returns true on the first op that matches either check.
bool is_large_language_model(const ov::Model& model) {
    using namespace ov::pass::pattern;

    // Past KV state is read from a stateful ReadValue, possibly converted to another precision.
    const auto kv_state = wrap_type<ov::op::v6::ReadValue>();
    const auto kv_state_cvt = ov::pass::pattern::optional<ov::op::v0::Convert>(kv_state);
    // Beam-search reordering: Gather over the state by a beam_idx Parameter with a constant axis.
    const auto beam_index = wrap_type<ov::op::v0::Parameter>();
    const auto reordered_state =
        wrap_type<ov::op::v8::Gather>({kv_state_cvt, beam_index, wrap_type<ov::op::v0::Constant>()});
    const auto reordered_state_cvt = ov::pass::pattern::optional<ov::op::v0::Convert>(reordered_state);
    // The Concat consumes either the (possibly converted) past state directly or its gathered form.
    const auto concat_input =
        std::make_shared<ov::pass::pattern::op::Or>(OutputVector{kv_state_cvt, reordered_state_cvt});
    const auto updated_state = wrap_type<ov::op::v0::Concat>({concat_input, any_input()});
    const auto updated_state_cvt = ov::pass::pattern::optional<ov::op::v0::Convert>(updated_state);
    // Present KV state is written back through Assign, closing the cache loop.
    const auto store_state = wrap_type<ov::op::v6::Assign>({updated_state_cvt});

    // Pattern is built once; the same matcher instance is reused for every op below.
    const auto kvcache_matcher = std::make_shared<ov::pass::pattern::Matcher>(store_state, "KVCacheMatcher");

    for (const auto& node : model.get_ops()) {
        if (kvcache_matcher->match(node->output(0)) || ov::is_type<ov::op::PagedAttentionExtension>(node)) {
            return true;
        }
    }
    return false;
}
164+
136165
bool check_for_broadcast(const ov::PartialShape& ref_shape, const ov::PartialShape& other_shape) {
137166
if (ref_shape.rank().is_dynamic() || other_shape.rank().is_dynamic()) {
138167
return false;

src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -1053,10 +1053,10 @@ void Transformations::MainSnippets(void) {
10531053
#if defined(OPENVINO_ARCH_X86_64)
10541054
// Currently, Snippets don't provide efficient execution for single token inference in LLM case.
10551055
// To avoid performance degradations, we disable MHA tokenization into Subgraphs in LLMs'.
1056-
// We consider the presence of `ScaledDotProductAttentionWithKVCache` and `PagedAttentionExtension` ops
1056+
// We consider the presence of `ScaledDotProductAttentionWithKVCache` ops
10571057
// in the model as a sign that this model is LLM.
1058-
const auto is_LLM = ov::op::util::has_op_with_type<intel_cpu::ScaledDotProductAttentionWithKVCache>(model) ||
1059-
ov::op::util::has_op_with_type<ov::op::PagedAttentionExtension>(model);
1058+
const auto is_LLM = ov::op::util::is_large_language_model(*model.get()) ||
1059+
ov::op::util::has_op_with_type<intel_cpu::ScaledDotProductAttentionWithKVCache>(model);
10601060

10611061
// CPU Plugin Subgraph supports f32, bf16, quantized and fp16(on avx_512_core_amx_fp16 target) BRGEMM
10621062
const auto is_infer_prc_supported_by_MHA =

src/plugins/intel_gpu/src/runtime/execution_config.cpp

+2-35
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,16 @@
66
#include "intel_gpu/plugin/remote_context.hpp"
77
#include "openvino/core/any.hpp"
88
#include "openvino/core/model.hpp"
9-
#include "openvino/op/concat.hpp"
10-
#include "openvino/op/convert.hpp"
11-
#include "openvino/op/gather.hpp"
129
#include "openvino/op/loop.hpp"
1310
#include "openvino/op/lstm_sequence.hpp"
14-
#include "openvino/op/paged_attention.hpp"
1511
#include "openvino/op/search_sorted.hpp"
1612
#include "openvino/op/stft.hpp"
17-
#include "openvino/pass/pattern/matcher.hpp"
18-
#include "openvino/pass/pattern/op/label.hpp"
19-
#include "openvino/pass/pattern/op/or.hpp"
20-
#include "openvino/pass/pattern/op/wrap_type.hpp"
2113
#include "ov_ops/dynamic_quantize.hpp"
2214
#include "openvino/runtime/internal_properties.hpp"
2315
#include "intel_gpu/runtime/internal_properties.hpp"
2416
#include "openvino/runtime/plugin_config.hpp"
2517
#include "openvino/runtime/properties.hpp"
18+
#include "transformations/utils/utils.hpp"
2619

2720

2821
namespace ov::intel_gpu {
@@ -86,32 +79,6 @@ bool requires_new_shape_infer(const std::shared_ptr<ov::Node>& op) {
8679
return false;
8780
}
8881

89-
bool is_llm(const ov::Model& model) {
90-
using namespace ov::pass::pattern;
91-
92-
auto past = wrap_type<ov::op::v6::ReadValue>();
93-
auto convert_past = wrap_type<ov::op::v0::Convert>({past});
94-
auto gather_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{past, convert_past});
95-
auto beam_idx = wrap_type<ov::op::v0::Parameter>();
96-
auto gather_past = wrap_type<ov::op::v8::Gather>({gather_input, beam_idx, wrap_type<ov::op::v0::Constant>()});
97-
auto gather_convert = wrap_type<ov::op::v0::Convert>({gather_past});
98-
auto concat_past_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{past, convert_past, gather_past, gather_convert});
99-
auto concat = wrap_type<ov::op::v0::Concat>({concat_past_input, any_input()});
100-
auto convert_present = wrap_type<ov::op::v0::Convert>({concat});
101-
auto present_input = std::make_shared<ov::pass::pattern::op::Or>(OutputVector{concat, convert_present});
102-
auto present = wrap_type<ov::op::v6::Assign>({present_input});
103-
104-
auto kvcache_matcher = std::make_shared<ov::pass::pattern::Matcher>(present, "KVCacheMatcher");
105-
106-
for (auto& op : model.get_ordered_ops()) {
107-
if (kvcache_matcher->match(op) || ov::is_type<ov::op::PagedAttentionExtension>(op)) {
108-
return true;
109-
}
110-
}
111-
112-
return false;
113-
}
114-
11582
} // namespace
11683

11784
ExecutionConfig::ExecutionConfig() : ov::PluginConfig() { }
@@ -163,7 +130,7 @@ void ExecutionConfig::apply_rt_info(const IRemoteContext* context, const ov::RTM
163130
}
164131

165132
void ExecutionConfig::apply_model_specific_options(const IRemoteContext* context, const ov::Model& model) {
166-
apply_rt_info(context, get_rt_info(model), is_llm(model));
133+
apply_rt_info(context, get_rt_info(model), ov::op::util::is_large_language_model(model));
167134

168135
const auto& ops = model.get_ops();
169136

0 commit comments

Comments
 (0)