Commit d737fa2

[GPU] Extend shape_of subgraphs markup logic to include PagedAttention input (#29445)
### Details:
- Extend shape_of subgraphs markup logic to include PagedAttention's `max_context_len` input.
- This patch fixes a qwen-7b-chat performance issue caused by the model's reliance on the `max_context_len` data input, which acts as a "ShapeOf" subgraph data source. Since this input is used only by PagedAttention itself (as a direct input) and by some simple shape-flow calculations, add it and its users to the ShapeOf subgraph as well. Otherwise, such subgraphs are executed on the GPU, introducing runtime synchronization and significantly dropping performance.

Qwen model:
![image](https://github.com/user-attachments/assets/8d42204f-aa1a-422d-af3c-907d205c0b33)

Qwen-2 model:
![image](https://github.com/user-attachments/assets/6d613e9c-5a58-41f7-8719-74192cf24c4e)

### Tickets:
- [CVS-164134](https://jira.devtools.intel.com/browse/CVS-164134)
Parent: 519420f
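
For context, here is a minimal sketch of the shape-flow pattern this patch targets, condensed from the unit test added below (the `neg_one` data primitive and the omitted topology setup are illustrative): `max_context_len` feeds PagedAttention directly and, in Qwen-like models, also a short chain of shape calculations that drives a dynamic reshape.

```cpp
// Condensed from the new unit test; "neg_one" is an illustrative data
// primitive holding -1, and the rest of the topology setup is omitted.
topology.add(input_layout("max_context_len", max_context_len_layout));
// Shape-flow user of max_context_len: a scalar adjustment (x + (-1))...
topology.add(eltwise("subtract_one_max_context_len", input_info("max_context_len"),
                     input_info("neg_one"), eltwise_mode::sum));
// ...folded into a broadcasted shape that drives a dynamic reshape.
topology.add(eltwise("updated_broadcast", input_info("broadcast"),
                     input_info("subtract_one_max_context_len"), eltwise_mode::sum));
topology.add(reshape("reshape", input_info("input"), input_info("updated_broadcast"),
                     false, ov::PartialShape::dynamic(4)));
// Before this patch, these eltwise nodes executed as GPU primitives, and
// reading their results for shape inference required a runtime sync; marking
// the whole chain as a ShapeOf subgraph keeps it on the host.
```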

3 files changed (+119, -3 lines)

src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp (+25, -2)
@@ -10,14 +10,37 @@
 #include "select_inst.h"
 #include "strided_slice_inst.h"
 #include "gather_inst.h"
+#include "input_layout_inst.h"
+#include "paged_attention_inst.h"
 #include "pass_manager.h"
 
 #include "intel_gpu/graph/program.hpp"
 
 using namespace cldnn;
 
-void mark_shape_of_subgraphs::look_for_shape_of_subgraph(program_node& node) {
+static bool is_shape_of_subgraph_root(program_node& node) {
     if (node.is_type<shape_of>()) {
+        return true;
+    }
+
+    // Allow input_layout to be the root of the shape_of subgraph if it's 'max_context_len'
+    // input of PagedAttention, which can be used as a shape calculation flow source in some
+    // models like Qwen and Qwen2
+    if (node.is_type<input_layout>()) {
+        const auto& users = node.get_users();
+        for (const auto& user : users) {
+            const auto max_context_len_input_id = 12;
+            if (user->is_type<paged_attention>() && user->get_dependency_index(node) == max_context_len_input_id) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+void mark_shape_of_subgraphs::look_for_shape_of_subgraph(program_node& node) {
+    if (is_shape_of_subgraph_root(node)) {
         mark_node(node);
         return;
     }
@@ -102,7 +125,7 @@ void mark_shape_of_subgraphs::mark_node(program_node& node) {
 
     // If current node has shape_of type add it to dependant shape_of nodes for
    // correct dependency propagation for users
-    if (node.is_type<shape_of>())
+    if (is_shape_of_subgraph_root(node))
         node.add_dependant_shape_of_node(&node);
 
     // Add parent shape_of nodes from other dependencies if there are any
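
One detail worth calling out in the hunk above: the hardcoded `max_context_len_input_id = 12` encodes PagedAttention's dependency order. For reference, a sketch of that ordering as an enum (illustrative only, not an existing plugin type; the order is taken from the `pa_inputs` vector in the unit test below):

```cpp
// Illustrative sketch: PagedAttention dependency indices, in the order the
// pa_inputs vector of the unit test below passes them to the primitive.
enum paged_attention_input_idx : size_t {
    QUERY = 0, KEY, VALUE, KEY_CACHE, VALUE_CACHE,
    PAST_LENS, SUBSEQUENCE_BEGINS, BLOCK_INDICES, BLOCK_INDICES_BEGINS,
    SCALE, SLIDING_WINDOW, ALIBI,
    MAX_CONTEXT_LEN  // == 12, the index checked in is_shape_of_subgraph_root()
};
```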

src/plugins/intel_gpu/src/graph/program_node.cpp (+1, -1)
@@ -658,7 +658,7 @@ void program_node::select_preferred_formats(impl_types impl_type) {
 }
 
 void program_node::add_dependant_shape_of_node(const program_node* node) {
-    OPENVINO_ASSERT(node->is_type<shape_of>(), "[GPU] Expected node type is shape_of");
+    OPENVINO_ASSERT(node->is_type<shape_of>() || node->is_type<input_layout>(), "[GPU] Expected node type is shape_of");
     dependant_shape_of_nodes.insert(node);
 }
 

src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp (+93)
@@ -18,6 +18,7 @@
 #include "select_inst.h"
 #include "strided_slice_inst.h"
 #include "broadcast_inst.h"
+#include "paged_attention_inst.h"
 #include "pass_manager.h"
 #include "to_string_utils.h"
 
@@ -31,6 +32,10 @@ static bool check_subgraph(const program_node& node, const program_node& last_no
     if (custom_dependant_nodes_count.find(node.id()) != custom_dependant_nodes_count.end())
         expected_dependant_nodes = custom_dependant_nodes_count[node.id()];
 
+    // Skip some custom nodes if they are not intended to be included into shape_of subgraph
+    if (expected_dependant_nodes == 0)
+        return true;
+
     if (!node.is_in_shape_of_subgraph() || node.get_dependant_shape_of_nodes().size() != expected_dependant_nodes)
         return false;
 
@@ -423,3 +428,91 @@ TEST(mark_shape_of_subgraphs, broadcast_w_direct_shapeof_and_data) {
 
     ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("broadcast")));
 }
+
+TEST(mark_shape_of_subgraphs, paged_attention_max_context_len_input) {
+    auto& engine = get_test_engine();
+    auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
+                                       data_types::f32, format::bfyx};
+    auto target_shape = engine.allocate_memory({ ov::PartialShape{4}, data_types::i32, format::bfyx });
+    set_values(target_shape, {4, 4, 1, 1});
+
+    auto subtract_one = engine.allocate_memory({ ov::PartialShape{1}, data_types::i32, format::bfyx });
+    set_values(subtract_one, {-1});
+
+    auto query_layout = layout{ov::PartialShape{ov::Dimension::dynamic(), 128},
+                               data_types::f32,
+                               format::bfyx};
+    auto key_layout = query_layout;
+    auto value_layout = query_layout;
+    auto key_cache_layout = layout{ov::PartialShape{ov::Dimension::dynamic(), 2, 64, 16},
+                                   data_types::f32,
+                                   format::bfyx};
+    auto dynamic_i32_layout = layout{ov::PartialShape::dynamic(1), data_types::i32, format::bfyx};
+    auto value_cache_layout = key_cache_layout;
+    auto past_lens_layout = dynamic_i32_layout;
+    auto subsequence_begins_layout = dynamic_i32_layout;
+    auto block_indices_layout = dynamic_i32_layout;
+    auto block_indices_begins_layout = dynamic_i32_layout;
+    auto scale_layout = layout{ov::PartialShape{1}, data_types::f32, format::bfyx};
+    auto sliding_window_layout = layout{ov::PartialShape{}, data_types::i32, format::bfyx};
+    auto alibi_layout = layout{ov::PartialShape{}, data_types::f32, format::bfyx};
+    auto max_context_len_layout = layout{ov::PartialShape{1}, data_types::i32, format::bfyx};
+
+    std::vector<input_info> pa_inputs = {input_info("query"),
+                                         input_info("key"),
+                                         input_info("value"),
+                                         input_info("key_cache"),
+                                         input_info("value_cache"),
+                                         input_info("past_lens"),
+                                         input_info("subsequence_begins"),
+                                         input_info("block_indices"),
+                                         input_info("block_indices_begins"),
+                                         input_info("scale"),
+                                         input_info("sliding_window"),
+                                         input_info("alibi"),
+                                         input_info("max_context_len")};
+
+    auto pa_prim = paged_attention("paged_attention", pa_inputs);
+    pa_prim.head_size = 64;
+    pa_prim.kv_heads_num = 2;
+    pa_prim.heads_num = 2;
+    pa_prim.scale_val = 1.f;
+    pa_prim.has_alibi = false;
+    pa_prim.num_outputs = 1;
+    pa_prim.has_rotated_blocks = false;
+
+    topology topology;
+    topology.add(input_layout("query", query_layout));
+    topology.add(input_layout("key", key_layout));
+    topology.add(input_layout("value", value_layout));
+    topology.add(input_layout("key_cache", key_cache_layout));
+    topology.add(input_layout("value_cache", value_cache_layout));
+    topology.add(input_layout("past_lens", past_lens_layout));
+    topology.add(input_layout("subsequence_begins", subsequence_begins_layout));
+    topology.add(input_layout("block_indices", block_indices_layout));
+    topology.add(input_layout("block_indices_begins", block_indices_begins_layout));
+    topology.add(input_layout("scale", scale_layout));
+    topology.add(input_layout("sliding_window", sliding_window_layout));
+    topology.add(input_layout("alibi", alibi_layout));
+    topology.add(input_layout("max_context_len", max_context_len_layout));
+    topology.add(input_layout("input", input_layout_dynamic));
+    topology.add(data("target_shape", target_shape));
+    topology.add(data("subtract_one", subtract_one));
+    topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
+    topology.add(broadcast("broadcast", input_info("shape_of"), input_info("target_shape"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
+    topology.add(eltwise("subtract_one_max_context_len", input_info("max_context_len"), input_info("subtract_one"), eltwise_mode::sum));
+    topology.add(eltwise("updated_broadcast", input_info("broadcast"), input_info("subtract_one_max_context_len"), eltwise_mode::sum));
+    topology.add(reshape("reshape", input_info("input"), input_info("updated_broadcast"), false, ov::PartialShape::dynamic(4)));
+    topology.add(pa_prim);
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+
+    auto prog = network.get_program();
+    ASSERT_NE(prog, nullptr);
+
+    ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("updated_broadcast"), {{"updated_broadcast", 2}}));
+    ASSERT_TRUE(check_subgraph(prog->get_node("max_context_len"), prog->get_node("updated_broadcast"), {{"updated_broadcast", 2}, {"paged_attention", 0}}));
+}
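
As a follow-up check, a minimal sketch of how the new subgraph root could be verified node-by-node (assuming `prog` from the test above and the same `is_in_shape_of_subgraph()` accessor the `check_subgraph` helper relies on):

```cpp
// Every node on the max_context_len -> updated_broadcast shape-flow path
// should report ShapeOf-subgraph membership after the pass, while pure
// data-flow nodes such as "input" should not.
for (const auto* id : {"max_context_len", "subtract_one_max_context_len", "updated_broadcast"}) {
    ASSERT_TRUE(prog->get_node(id).is_in_shape_of_subgraph());
}
ASSERT_FALSE(prog->get_node("input").is_in_shape_of_subgraph());
```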
