openvinotoolkit
diff --git a/‎.github/workflows/mac.yml
+1-1 b/‎.github/workflows/mac.yml
+1-1
diff --git a/‎src/common/transformations/src/transformations/common_optimizations/sdpa_scale_fusion.cpp
+19-9 b/‎src/common/transformations/src/transformations/common_optimizations/sdpa_scale_fusion.cpp
+19-9
diff --git a/‎src/common/transformations/tests/common_optimizations/sdpa_scale_fusion_test.cpp
+47 b/‎src/common/transformations/tests/common_optimizations/sdpa_scale_fusion_test.cpp
+47
diff --git a/‎src/frontends/ir/src/ir_deserializer.cpp
+22-10 b/‎src/frontends/ir/src/ir_deserializer.cpp
+22-10
diff --git a/‎src/frontends/ir/tests/frontend_test_basic.cpp
+1-1 b/‎src/frontends/ir/tests/frontend_test_basic.cpp
+1-1
diff --git a/‎src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
+1-1 b/‎src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
+1-1
diff --git a/‎src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp
+25-2 b/‎src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp
+25-2
diff --git a/‎src/plugins/intel_gpu/src/graph/program_node.cpp
+1-1 b/‎src/plugins/intel_gpu/src/graph/program_node.cpp
+1-1
diff --git a/‎src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp
+93 b/‎src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp
+93
diff --git a/‎src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp
+20 b/‎src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp
+20
diff --git a/‎src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp
+7 b/‎src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp
+7
diff --git a/‎src/plugins/intel_npu/src/al/src/config/runtime.cpp
+1 b/‎src/plugins/intel_npu/src/al/src/config/runtime.cpp
+1
@@ -424,7 +424,7 @@ jobs:
     defaults:
       run:
         shell: bash
-    runs-on: aks-linux-small
+    runs-on: aks-linux-medium
     container:
       image: 'openvinogithubactions.azurecr.io/library/python:3.12-slim'
       volumes:
 
@@ -49,8 +49,8 @@ SDPAScaleFusion::SDPAScaleFusion() {
 
         auto sdpa = m.get_match_root();
 
-        const bool has_q_scale = pattern_map.count(scaled_q);
-        const bool has_k_scale = pattern_map.count(scaled_k);
+        bool has_q_scale = pattern_map.count(scaled_q);
+        bool has_k_scale = pattern_map.count(scaled_k);
 
         // Nothing to do
         if (!has_q_scale && !has_k_scale)
@@ -83,22 +83,32 @@ SDPAScaleFusion::SDPAScaleFusion() {
         // Extract scalar scale values for Q and K if those are constant and set new inputs for SDPA
         if (has_q_scale) {
             scale_q_node = pattern_map.at(scale_q).get_node_shared_ptr();
-            if (ov::is_type<ov::op::v0::Constant>(scale_q_node)) {
-                scale_q_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_q_node)->cast_vector<float>()[0];
-                q_input = pattern_map.at(q);
+            if (pattern_map.at(q).get_element_type() == q_input.get_element_type()) {
+                if (ov::is_type<ov::op::v0::Constant>(scale_q_node)) {
+                    scale_q_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_q_node)->cast_vector<float>()[0];
+                    q_input = pattern_map.at(q);
+                }
+            } else {
+                has_q_scale = false;
             }
         }
         if (has_k_scale) {
             scale_k_node = pattern_map.at(scale_k).get_node_shared_ptr();
-            if (ov::is_type<ov::op::v0::Constant>(scale_k_node)) {
-                scale_k_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_k_node)->cast_vector<float>()[0];
-                k_input = pattern_map.at(k);
+            if (pattern_map.at(k).get_element_type() == k_input.get_element_type()) {
+                if (ov::is_type<ov::op::v0::Constant>(scale_k_node)) {
+                    scale_k_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_k_node)->cast_vector<float>()[0];
+                    k_input = pattern_map.at(k);
+                }
+            } else {
+                has_k_scale = false;
             }
         }
 
+        if (!has_q_scale && !has_k_scale)
+            return false;
+
         Output<ov::Node> new_scale_node;
         auto new_scale_val = prev_scale_value * scale_q_value * scale_k_value;
-
         // If new scale is 1 and we have non-constant scale node for either Q or K, then we can make it a scale of SDPA
         if (new_scale_val == 1.0f) {
             if (has_q_scale && !ov::is_type<ov::op::v0::Constant>(scale_q_node)) {
 
@@ -15,6 +15,7 @@
 #include "openvino/op/constant.hpp"
 #include "openvino/op/multiply.hpp"
 #include "openvino/op/scaled_dot_product_attention.hpp"
+#include "ov_ops/type_relaxed.hpp"
 
 using namespace testing;
 using namespace ov::pass;
@@ -226,3 +227,49 @@ TEST_F(TransformationTestsF, SDPAScaleFusionTest5) {
     comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
     comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES);
 }
+
+// Mixed-precision case: `key` is i8 and is scaled through a TypeRelaxed Multiply that outputs f16,
+// so the element type of the unscaled K differs from the SDPA K input and the K scale must stay
+// in place. Only the Q scale (8) is expected to be folded into the SDPA scale as 8 / sqrt(32).
+TEST_F(TransformationTestsF, SDPAScaleFusionTest6) {
+    const PartialShape query_shape{1, 32, -1, 32};
+    const PartialShape key_shape{1, 32, -1, 32};
+    const PartialShape value_shape{1, 32, -1, 32};
+
+    const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape);
+    const auto key = std::make_shared<ov::op::v0::Parameter>(element::i8, key_shape);
+    const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape);
+    const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{8.0f});
+    const auto v_scaled = std::make_shared<ov::op::v1::Multiply>(value, scale_const);
+    const auto casual = false;
+    {
+        // Model under test: both Q and K are pre-multiplied by the same constant scale
+        const auto q_scaled = std::make_shared<ov::op::v1::Multiply>(query, scale_const);
+        const auto k_scaled = std::make_shared<ov::op::TypeRelaxed<ov::op::v1::Multiply>>(
+            std::vector<element::Type>{element::f16, element::f16},
+            std::vector<element::Type>{element::f16},
+            ov::op::TemporaryReplaceOutputType(key, element::f16).get(),
+            ov::op::TemporaryReplaceOutputType(scale_const, element::f16).get());
+        const auto sdpa =
+            std::make_shared<ov::op::v13::ScaledDotProductAttention>(q_scaled, k_scaled, v_scaled, casual);
+
+        model = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value});
+        manager.register_pass<ov::pass::SDPAScaleFusion>();
+    }
+
+    {
+        // Reference: the K Multiply is preserved, Q is consumed unscaled with an explicit SDPA scale
+        const auto k_scaled_ref = std::make_shared<ov::op::TypeRelaxed<ov::op::v1::Multiply>>(
+            std::vector<element::Type>{element::f16, element::f16},
+            std::vector<element::Type>{element::f16},
+            ov::op::TemporaryReplaceOutputType(key, element::f16).get(),
+            ov::op::TemporaryReplaceOutputType(scale_const, element::f16).get());
+        const auto new_mask_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{0.0f});
+        const auto new_scale_const =
+            ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{8.0f / std::sqrt(32.0f)});
+        const auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(query,
+                                                                                   k_scaled_ref,
+                                                                                   v_scaled,
+                                                                                   new_mask_const,
+                                                                                   new_scale_const,
+                                                                                   casual);
+        model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value});
+    }
+
+    comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
+}
@@ -44,18 +44,30 @@ namespace {
  * @return A set of unique tensor names.
  */
 std::unordered_set<std::string> deserialize_tensor_names(const std::string_view& tensor_names) {
-    // tensor names are separated by comma, but ignore escaped comma
-    static const auto splitter = std::regex(R"((?:[^\\,\n]|\\.)+)");
+    // Names are separated by ','. A comma directly preceded by a backslash belongs to the name
+    // and is unescaped (`\,` -> `,`) when the name is stored.
+    static const auto escaped_delim = std::regex(R"(\\,)");
+    constexpr auto delim = ",";
+    constexpr auto esc_char = '\\';
 
     auto output_names = std::unordered_set<std::string>();
-    std::transform(std::cregex_token_iterator{tensor_names.data(), tensor_names.data() + tensor_names.size(), splitter},
-                   std::cregex_token_iterator{},
-                   std::inserter(output_names, output_names.end()),
-                   [](const auto& token) {
-                       // If tensor name contains escaped comma, replace it with comma
-                       static const auto escaped_delim = std::regex(R"(\\,)");
-                       return std::regex_replace(token.str(), escaped_delim, ",");
-                   });
+    auto name_inserter = std::inserter(output_names, output_names.end());
+    // `start` is the first character of the name being scanned, `pos` the next comma candidate.
+    // The loop terminates once `start` is set to npos after the last name has been handled.
+    for (size_t pos = tensor_names.find(delim), start = 0; start != std::string::npos;
+         pos = tensor_names.find(delim, pos)) {
+        if (pos == std::string::npos) {
+            // No further comma: the remainder (if not empty) is the last name
+            if (auto name_view = tensor_names.substr(start); name_view.size() > 0) {
+                *name_inserter = std::regex_replace(std::string(name_view), escaped_delim, delim);
+            }
+            start = pos;
+        } else if (auto delim_pos = pos - 1; delim_pos != std::string::npos && tensor_names[delim_pos] == esc_char) {
+            // Escaped comma. `pos - 1` wraps to npos when pos == 0, which the first check rejects,
+            // so nothing is read before the beginning of the view. Keep searching after this comma.
+            ++pos;
+        } else {
+            // Real delimiter: store [start, pos) unless it is empty, then continue after the comma
+            if (auto length = pos - start; length > 0) {
+                *name_inserter =
+                    std::regex_replace(std::string(tensor_names.substr(start, length)), escaped_delim, delim);
+            }
+            start = ++pos;
+        }
+    }
+
     return output_names;
 }
 }  // namespace
 
@@ -1303,7 +1303,7 @@ TEST_F(IRFrontendTests, model_with_tensor_names_with_spaces) {
                 <layer id="0" name="input2" type="Parameter" version="opset1">
                     <data shape="1,4,512" element_type="f32"/>
                     <output>
-                        <port id="0" precision="FP32" names="input2">
+                        <port id="0" precision="FP32" names="model/bert/encoder/layer_0/attention/self/query/Tensordot/MatMul;model/bert/encoder/layer_0/attention/self/query/BiasAdd;model/bert/encoder/layer_0/attention/output/dense/Tensordot/shape;model/bert/encoder/layer_0/attention/self/query/Tensordot;model/bert/encoder/layer_0/attention/self/query/BiasAdd/ReadVariableOp_Gemm__32:0">
                             <dim>1</dim>
                             <dim>4</dim>
                             <dim>512</dim>
 
@@ -62,7 +62,7 @@ OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_profiling_data_path, "", "Save csv fi
 OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_graphs_path, "", "Save intermediate graph representations during model compilation pipeline to specified folder")
 OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_sources_path, "", "Save generated sources for each kernel to specified folder")
 OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_tensors_path, "", "Save intermediate in/out tensors of each primitive to specified folder")
-OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_tensors, ov::intel_gpu::DumpTensors::all, "Tensor types to dump. Supported values: all, inputs, outputs")
+OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_tensors, ov::intel_gpu::DumpTensors::all, "Tensor types to dump. Supported values: all, in, out")
 OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_tensors_format, ov::intel_gpu::DumpFormat::text, "Format of the tensors dump. Supported values: binary, text, text_raw")
 OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_layer_names, std::vector<std::string>{}, "Activate dump for specified layers only")
 OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_memory_pool_path, "", "Save csv file with memory pool info to specified folder")
 
@@ -10,14 +10,37 @@
 #include "select_inst.h"
 #include "strided_slice_inst.h"
 #include "gather_inst.h"
+#include "input_layout_inst.h"
+#include "paged_attention_inst.h"
 #include "pass_manager.h"
 
 #include "intel_gpu/graph/program.hpp"
 
 using namespace cldnn;
 
-void mark_shape_of_subgraphs::look_for_shape_of_subgraph(program_node& node) {
+// Returns true if `node` may start a shape_of subgraph: either a shape_of primitive or an
+// input_layout that feeds the 'max_context_len' input of a paged_attention user.
+static bool is_shape_of_subgraph_root(program_node& node) {
     if (node.is_type<shape_of>()) {
+        return true;
+    }
+
+    // Allow input_layout to be the root of the shape_of subgraph if it's 'max_context_len'
+    // input of PagedAttention, which can be used as a shape calculation flow source in some
+    // models like Qwen and Qwen2
+    if (node.is_type<input_layout>()) {
+        const auto& users = node.get_users();
+        for (const auto& user : users) {
+            // Position of 'max_context_len' among the paged_attention dependencies
+            const auto max_context_len_input_id = 12;
+            if (user->is_type<paged_attention>() && user->get_dependency_index(node) == max_context_len_input_id) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+void mark_shape_of_subgraphs::look_for_shape_of_subgraph(program_node& node) {
+    if (is_shape_of_subgraph_root(node)) {
         mark_node(node);
         return;
     }
@@ -102,7 +125,7 @@ void mark_shape_of_subgraphs::mark_node(program_node& node) {
 
     // If current node has shape_of type add it to dependant shape_of nodes for
     // correct dependency propagation for users
-    if (node.is_type<shape_of>())
+    if (is_shape_of_subgraph_root(node))
         node.add_dependant_shape_of_node(&node);
 
     // Add parent shape_of nodes from other dependencies if there are any
 
@@ -658,7 +658,7 @@ void program_node::select_preferred_formats(impl_types impl_type) {
 }
 
 void program_node::add_dependant_shape_of_node(const program_node* node) {
-    OPENVINO_ASSERT(node->is_type<shape_of>(), "[GPU] Expected node type is shape_of");
+    // input_layout is accepted as well, since it can act as the root of a shape_of subgraph;
+    // the message names both accepted types so a failure is not misleading
+    OPENVINO_ASSERT(node->is_type<shape_of>() || node->is_type<input_layout>(),
+                    "[GPU] Expected node type is shape_of or input_layout");
     dependant_shape_of_nodes.insert(node);
 }
 
 
@@ -18,6 +18,7 @@
 #include "select_inst.h"
 #include "strided_slice_inst.h"
 #include "broadcast_inst.h"
+#include "paged_attention_inst.h"
 #include "pass_manager.h"
 #include "to_string_utils.h"
 
@@ -31,6 +32,10 @@ static bool check_subgraph(const program_node& node, const program_node& last_no
     if (custom_dependant_nodes_count.find(node.id()) != custom_dependant_nodes_count.end())
         expected_dependant_nodes = custom_dependant_nodes_count[node.id()];
 
+    // Skip some custom nodes if they are not intended to be included into shape_of subgraph
+    if (expected_dependant_nodes == 0)
+        return true;
+
     if (!node.is_in_shape_of_subgraph() || node.get_dependant_shape_of_nodes().size() != expected_dependant_nodes)
         return false;
 
@@ -423,3 +428,91 @@ TEST(mark_shape_of_subgraphs, broadcast_w_direct_shapeof_and_data) {
 
     ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("broadcast")));
 }
+
+// The 'max_context_len' input of paged_attention (dependency #12) has to be recognized as a root of
+// a shape_of subgraph: the eltwise chain fed by it is expected to be marked together with the regular
+// shape_of branch, while paged_attention itself is excluded from the check (expected count 0).
+TEST(mark_shape_of_subgraphs, paged_attention_max_context_len_input) {
+    auto& engine = get_test_engine();
+    auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
+                                       data_types::f32, format::bfyx};
+    auto target_shape = engine.allocate_memory({ ov::PartialShape{4}, data_types::i32, format::bfyx });
+    set_values(target_shape, {4, 4, 1, 1});
+
+    // Fill the single-element constant itself; writing to target_shape here would clobber its first
+    // value and leave subtract_one uninitialized
+    auto subtract_one = engine.allocate_memory({ ov::PartialShape{1}, data_types::i32, format::bfyx });
+    set_values(subtract_one, {-1});
+
+    auto query_layout = layout{ov::PartialShape{ov::Dimension::dynamic(), 128},
+                               data_types::f32,
+                               format::bfyx};
+    auto key_layout = query_layout;
+    auto value_layout = query_layout;
+    auto key_cache_layout = layout{ov::PartialShape{ov::Dimension::dynamic(), 2, 64, 16},
+                                   data_types::f32,
+                                   format::bfyx};
+    auto dynamic_i32_layout = layout{ov::PartialShape::dynamic(1), data_types::i32, format::bfyx};
+    auto value_cache_layout = key_cache_layout;
+    auto past_lens_layout = dynamic_i32_layout;
+    auto subsequence_begins_layout = dynamic_i32_layout;
+    auto block_indices_layout = dynamic_i32_layout;
+    auto block_indices_begins_layout = dynamic_i32_layout;
+    auto scale_layout = layout{ov::PartialShape{1}, data_types::f32, format::bfyx};
+    auto sliding_window_layout = layout{ov::PartialShape{}, data_types::i32, format::bfyx};
+    auto alibi_layout = layout{ov::PartialShape{}, data_types::f32, format::bfyx};
+    auto max_context_len_layout = layout{ov::PartialShape{1}, data_types::i32, format::bfyx};
+
+    // Input order matters: 'max_context_len' has to be dependency #12 of paged_attention
+    std::vector<input_info> pa_inputs = {input_info("query"),
+                                         input_info("key"),
+                                         input_info("value"),
+                                         input_info("key_cache"),
+                                         input_info("value_cache"),
+                                         input_info("past_lens"),
+                                         input_info("subsequence_begins"),
+                                         input_info("block_indices"),
+                                         input_info("block_indices_begins"),
+                                         input_info("scale"),
+                                         input_info("sliding_window"),
+                                         input_info("alibi"),
+                                         input_info("max_context_len")};
+
+    auto pa_prim = paged_attention("paged_attention", pa_inputs);
+    pa_prim.head_size = 64;
+    pa_prim.kv_heads_num = 2;
+    pa_prim.heads_num = 2;
+    pa_prim.scale_val = 1.f;
+    pa_prim.has_alibi = false;
+    pa_prim.num_outputs = 1;
+    pa_prim.has_rotated_blocks = false;
+
+    topology topology;
+    topology.add(input_layout("query", query_layout));
+    topology.add(input_layout("key", key_layout));
+    topology.add(input_layout("value", value_layout));
+    topology.add(input_layout("key_cache", key_cache_layout));
+    topology.add(input_layout("value_cache", value_cache_layout));
+    topology.add(input_layout("past_lens", past_lens_layout));
+    topology.add(input_layout("subsequence_begins", subsequence_begins_layout));
+    topology.add(input_layout("block_indices", block_indices_layout));
+    topology.add(input_layout("block_indices_begins", block_indices_begins_layout));
+    topology.add(input_layout("scale", scale_layout));
+    topology.add(input_layout("sliding_window", sliding_window_layout));
+    topology.add(input_layout("alibi", alibi_layout));
+    topology.add(input_layout("max_context_len", max_context_len_layout));
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(data("target_shape", target_shape));
+    topology.add(data("subtract_one", subtract_one));
+    topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
+    topology.add(broadcast("broadcast", input_info("shape_of"), input_info("target_shape"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
+    topology.add(eltwise("subtract_one_max_context_len", input_info("max_context_len"), input_info("subtract_one"), eltwise_mode::sum));
+    topology.add(eltwise("updated_broadcast", input_info("broadcast"), input_info("subtract_one_max_context_len"), eltwise_mode::sum));
+    topology.add(reshape("reshape", input_info("input"), input_info("updated_broadcast"), false, ov::PartialShape::dynamic(4)));
+    topology.add(pa_prim);
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    auto prog = network.get_program();
+    ASSERT_NE(prog, nullptr);
+
+    // 'updated_broadcast' joins both flows, hence two dependant root nodes are expected for it
+    ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("updated_broadcast"), {{"updated_broadcast", 2}}));
+    ASSERT_TRUE(check_subgraph(prog->get_node("max_context_len"), prog->get_node("updated_broadcast"), {{"updated_broadcast", 2}, {"paged_attention", 0}}));
+}
@@ -309,4 +309,24 @@ struct RUN_INFERENCES_SEQUENTIALLY final : OptionBase<RUN_INFERENCES_SEQUENTIALL
     }
 };
 
+// Run-time option bound to the ov::intel_npu::disable_version_check property; disabled by default.
+struct DISABLE_VERSION_CHECK final : OptionBase<DISABLE_VERSION_CHECK, bool> {
+    static std::string_view key() {
+        return ov::intel_npu::disable_version_check.name();
+    }
+
+    static bool defaultValue() {
+        return false;
+    }
+
+// envVar() is declared only when NPU_PLUGIN_DEVELOPER_BUILD is defined
+#ifdef NPU_PLUGIN_DEVELOPER_BUILD
+    static std::string_view envVar() {
+        return "OV_NPU_DISABLE_VERSION_CHECK";
+    }
+#endif
+
+    static OptionMode mode() {
+        return OptionMode::RunTime;
+    }
+};
+
 }  // namespace intel_npu
@@ -329,5 +329,12 @@ static constexpr ov::Property<std::string> backend_compilation_params{"NPU_BACKE
  */
 static constexpr ov::Property<bool> run_inferences_sequentially{"NPU_RUN_INFERENCES_SEQUENTIALLY"};
 
+/**
+ * @brief [Only for NPU Plugin]
+ * Type: boolean, default is false.
+ * This option allows to skip the blob version check
+ */
+static constexpr ov::Property<bool> disable_version_check{"NPU_DISABLE_VERSION_CHECK"};
+
 }  // namespace intel_npu
 }  // namespace ov
@@ -29,6 +29,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) {
     desc.add<WEIGHTS_PATH>();
     desc.add<BYPASS_UMD_CACHING>();
     desc.add<RUN_INFERENCES_SEQUENTIALLY>();
+    desc.add<DISABLE_VERSION_CHECK>();
 }
 
 // Heuristically obtained number. Varies depending on the values of PLATFORM and PERFORMANCE_HINT
Original file line number	Diff line number	Diff line change
`@@ -658,7 +658,7 @@ void program_node::select_preferred_formats(impl_types impl_type) {`
`658`	`658`	`}`
`659`	`659`
`660`	`660`	`void program_node::add_dependant_shape_of_node(const program_node* node) {`
`661`		`- OPENVINO_ASSERT(node->is_type<shape_of>(), "[GPU] Expected node type is shape_of");`
	`661`	`+ OPENVINO_ASSERT(node->is_type<shape_of>() \|\| node->is_type<input_layout>(), "[GPU] Expected node type is shape_of");`
`662`	`662`	`dependant_shape_of_nodes.insert(node);`
`663`	`663`	`}`
`664`	`664`
Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) {`
`29`	`29`	`desc.add<WEIGHTS_PATH>();`
`30`	`30`	`desc.add<BYPASS_UMD_CACHING>();`
`31`	`31`	`desc.add<RUN_INFERENCES_SEQUENTIALLY>();`
	`32`	`+ desc.add<DISABLE_VERSION_CHECK>();`
`32`	`33`	`}`
`33`	`34`
`34`	`35`	`// Heuristically obtained number. Varies depending on the values of PLATFORM and PERFORMANCE_HINT`