Skip to content

Commit 4ea94be

Browse files
authored
Merge branch 'master' into tensor_view_for_partial_value_propagation
2 parents 65debd2 + e2eff09 commit 4ea94be

File tree

24 files changed

+1388
-120
lines changed

24 files changed

+1388
-120
lines changed

.github/workflows/mac.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ jobs:
424424
defaults:
425425
run:
426426
shell: bash
427-
runs-on: aks-linux-small
427+
runs-on: aks-linux-medium
428428
container:
429429
image: 'openvinogithubactions.azurecr.io/library/python:3.12-slim'
430430
volumes:

src/common/transformations/src/transformations/common_optimizations/sdpa_scale_fusion.cpp

+19-9
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ SDPAScaleFusion::SDPAScaleFusion() {
4949

5050
auto sdpa = m.get_match_root();
5151

52-
const bool has_q_scale = pattern_map.count(scaled_q);
53-
const bool has_k_scale = pattern_map.count(scaled_k);
52+
bool has_q_scale = pattern_map.count(scaled_q);
53+
bool has_k_scale = pattern_map.count(scaled_k);
5454

5555
// Nothing to do
5656
if (!has_q_scale && !has_k_scale)
@@ -83,22 +83,32 @@ SDPAScaleFusion::SDPAScaleFusion() {
8383
// Extract scalar scale values for Q and K if those are constant and set new inputs for SDPA
8484
if (has_q_scale) {
8585
scale_q_node = pattern_map.at(scale_q).get_node_shared_ptr();
86-
if (ov::is_type<ov::op::v0::Constant>(scale_q_node)) {
87-
scale_q_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_q_node)->cast_vector<float>()[0];
88-
q_input = pattern_map.at(q);
86+
if (pattern_map.at(q).get_element_type() == q_input.get_element_type()) {
87+
if (ov::is_type<ov::op::v0::Constant>(scale_q_node)) {
88+
scale_q_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_q_node)->cast_vector<float>()[0];
89+
q_input = pattern_map.at(q);
90+
}
91+
} else {
92+
has_q_scale = false;
8993
}
9094
}
9195
if (has_k_scale) {
9296
scale_k_node = pattern_map.at(scale_k).get_node_shared_ptr();
93-
if (ov::is_type<ov::op::v0::Constant>(scale_k_node)) {
94-
scale_k_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_k_node)->cast_vector<float>()[0];
95-
k_input = pattern_map.at(k);
97+
if (pattern_map.at(k).get_element_type() == k_input.get_element_type()) {
98+
if (ov::is_type<ov::op::v0::Constant>(scale_k_node)) {
99+
scale_k_value = ov::as_type_ptr<ov::op::v0::Constant>(scale_k_node)->cast_vector<float>()[0];
100+
k_input = pattern_map.at(k);
101+
}
102+
} else {
103+
has_k_scale = false;
96104
}
97105
}
98106

107+
if (!has_q_scale && !has_k_scale)
108+
return false;
109+
99110
Output<ov::Node> new_scale_node;
100111
auto new_scale_val = prev_scale_value * scale_q_value * scale_k_value;
101-
102112
// If new scale is 1 and we have non-constant scale node for either Q or K, then we can make it a scale of SDPA
103113
if (new_scale_val == 1.0f) {
104114
if (has_q_scale && !ov::is_type<ov::op::v0::Constant>(scale_q_node)) {

src/common/transformations/tests/common_optimizations/sdpa_scale_fusion_test.cpp

+47
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "openvino/op/constant.hpp"
1616
#include "openvino/op/multiply.hpp"
1717
#include "openvino/op/scaled_dot_product_attention.hpp"
18+
#include "ov_ops/type_relaxed.hpp"
1819

1920
using namespace testing;
2021
using namespace ov::pass;
@@ -226,3 +227,49 @@ TEST_F(TransformationTestsF, SDPAScaleFusionTest5) {
226227
comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
227228
comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES);
228229
}
230+
231+
TEST_F(TransformationTestsF, SDPAScaleFusionTest6) {
232+
const PartialShape query_shape{1, 32, -1, 32};
233+
const PartialShape key_shape{1, 32, -1, 32};
234+
const PartialShape value_shape{1, 32, -1, 32};
235+
236+
const auto query = std::make_shared<ov::op::v0::Parameter>(element::f16, query_shape);
237+
const auto key = std::make_shared<ov::op::v0::Parameter>(element::i8, key_shape);
238+
const auto value = std::make_shared<ov::op::v0::Parameter>(element::f16, value_shape);
239+
const auto scale_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{8.0f});
240+
const auto v_scaled = std::make_shared<ov::op::v1::Multiply>(value, scale_const);
241+
const auto casual = false;
242+
{
243+
const auto q_scaled = std::make_shared<ov::op::v1::Multiply>(query, scale_const);
244+
const auto k_scaled = std::make_shared<ov::op::TypeRelaxed<ov::op::v1::Multiply>>(
245+
std::vector<element::Type>{element::f16, element::f16},
246+
std::vector<element::Type>{element::f16},
247+
ov::op::TemporaryReplaceOutputType(key, element::f16).get(),
248+
ov::op::TemporaryReplaceOutputType(scale_const, element::f16).get());
249+
const auto sdpa =
250+
std::make_shared<ov::op::v13::ScaledDotProductAttention>(q_scaled, k_scaled, v_scaled, casual);
251+
252+
model = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value});
253+
manager.register_pass<ov::pass::SDPAScaleFusion>();
254+
}
255+
256+
{
257+
const auto k_scaled_ref = std::make_shared<ov::op::TypeRelaxed<ov::op::v1::Multiply>>(
258+
std::vector<element::Type>{element::f16, element::f16},
259+
std::vector<element::Type>{element::f16},
260+
ov::op::TemporaryReplaceOutputType(key, element::f16).get(),
261+
ov::op::TemporaryReplaceOutputType(scale_const, element::f16).get());
262+
const auto new_mask_const = ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{0.0f});
263+
const auto new_scale_const =
264+
ov::op::v0::Constant::create(element::f16, ov::Shape{}, std::vector<float>{8.0f / std::sqrt(32.0f)});
265+
const auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(query,
266+
k_scaled_ref,
267+
v_scaled,
268+
new_mask_const,
269+
new_scale_const,
270+
casual);
271+
model_ref = std::make_shared<ov::Model>(NodeVector{sdpa}, ParameterVector{query, key, value});
272+
}
273+
274+
comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
275+
}

src/frontends/ir/src/ir_deserializer.cpp

+22-10
Original file line numberDiff line numberDiff line change
@@ -44,18 +44,30 @@ namespace {
4444
* @return A set of unique tensor names.
4545
*/
4646
std::unordered_set<std::string> deserialize_tensor_names(const std::string_view& tensor_names) {
47-
// tensor names are separated by comma, but ignore escaped comma
48-
static const auto splitter = std::regex(R"((?:[^\\,\n]|\\.)+)");
47+
static const auto escaped_delim = std::regex(R"(\\,)");
48+
constexpr auto delim = ",";
49+
constexpr auto esc_char = '\\';
4950

5051
auto output_names = std::unordered_set<std::string>();
51-
std::transform(std::cregex_token_iterator{tensor_names.data(), tensor_names.data() + tensor_names.size(), splitter},
52-
std::cregex_token_iterator{},
53-
std::inserter(output_names, output_names.end()),
54-
[](const auto& token) {
55-
// If tensor name contains escaped comma, replace it with comma
56-
static const auto escaped_delim = std::regex(R"(\\,)");
57-
return std::regex_replace(token.str(), escaped_delim, ",");
58-
});
52+
auto name_inserter = std::inserter(output_names, output_names.end());
53+
for (size_t pos = tensor_names.find(delim), start = 0; start != std::string::npos;
54+
pos = tensor_names.find(delim, pos)) {
55+
if (pos == std::string::npos) {
56+
if (auto name_view = tensor_names.substr(start); name_view.size() > 0) {
57+
*name_inserter = std::regex_replace(std::string(name_view), escaped_delim, delim);
58+
}
59+
start = pos;
60+
} else if (auto delim_pos = pos - 1; delim_pos != std::string::npos && tensor_names[delim_pos] == esc_char) {
61+
++pos;
62+
} else {
63+
if (auto length = pos - start; length > 0) {
64+
*name_inserter =
65+
std::regex_replace(std::string(tensor_names.substr(start, length)), escaped_delim, delim);
66+
}
67+
start = ++pos;
68+
}
69+
}
70+
5971
return output_names;
6072
}
6173
} // namespace

src/frontends/ir/tests/frontend_test_basic.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1303,7 +1303,7 @@ TEST_F(IRFrontendTests, model_with_tensor_names_with_spaces) {
13031303
<layer id="0" name="input2" type="Parameter" version="opset1">
13041304
<data shape="1,4,512" element_type="f32"/>
13051305
<output>
1306-
<port id="0" precision="FP32" names="input2">
1306+
<port id="0" precision="FP32" names="model/bert/encoder/layer_0/attention/self/query/Tensordot/MatMul;model/bert/encoder/layer_0/attention/self/query/BiasAdd;model/bert/encoder/layer_0/attention/output/dense/Tensordot/shape;model/bert/encoder/layer_0/attention/self/query/Tensordot;model/bert/encoder/layer_0/attention/self/query/BiasAdd/ReadVariableOp_Gemm__32:0">
13071307
<dim>1</dim>
13081308
<dim>4</dim>
13091309
<dim>512</dim>

src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_profiling_data_path, "", "Save csv fi
6262
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_graphs_path, "", "Save intermediate graph representations during model compilation pipeline to specified folder")
6363
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_sources_path, "", "Save generated sources for each kernel to specified folder")
6464
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_tensors_path, "", "Save intermediate in/out tensors of each primitive to specified folder")
65-
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_tensors, ov::intel_gpu::DumpTensors::all, "Tensor types to dump. Supported values: all, inputs, outputs")
65+
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_tensors, ov::intel_gpu::DumpTensors::all, "Tensor types to dump. Supported values: all, in, out")
6666
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_tensors_format, ov::intel_gpu::DumpFormat::text, "Format of the tensors dump. Supported values: binary, text, text_raw")
6767
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_layer_names, std::vector<std::string>{}, "Activate dump for specified layers only")
6868
OV_CONFIG_DEBUG_OPTION(ov::intel_gpu, dump_memory_pool_path, "", "Save csv file with memory pool info to specified folder")

src/plugins/intel_gpu/src/graph/graph_optimizer/mark_shape_of_subgraphs.cpp

+25-2
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,37 @@
1010
#include "select_inst.h"
1111
#include "strided_slice_inst.h"
1212
#include "gather_inst.h"
13+
#include "input_layout_inst.h"
14+
#include "paged_attention_inst.h"
1315
#include "pass_manager.h"
1416

1517
#include "intel_gpu/graph/program.hpp"
1618

1719
using namespace cldnn;
1820

19-
void mark_shape_of_subgraphs::look_for_shape_of_subgraph(program_node& node) {
21+
static bool is_shape_of_subgraph_root(program_node& node) {
2022
if (node.is_type<shape_of>()) {
23+
return true;
24+
}
25+
26+
// Allow input_layout to be the root of the shape_of subgraph if it's 'max_context_len'
27+
// input of PagedAttention, which can be used as a shape calculation flow source in some
28+
// models like Qwen and Qwen2
29+
if (node.is_type<input_layout>()) {
30+
const auto& users = node.get_users();
31+
for (const auto& user : users) {
32+
const auto max_context_len_input_id = 12;
33+
if (user->is_type<paged_attention>() && user->get_dependency_index(node) == max_context_len_input_id) {
34+
return true;
35+
}
36+
}
37+
}
38+
39+
return false;
40+
}
41+
42+
void mark_shape_of_subgraphs::look_for_shape_of_subgraph(program_node& node) {
43+
if (is_shape_of_subgraph_root(node)) {
2144
mark_node(node);
2245
return;
2346
}
@@ -102,7 +125,7 @@ void mark_shape_of_subgraphs::mark_node(program_node& node) {
102125

103126
// If current node has shape_of type add it to dependant shape_of nodes for
104127
// correct dependency propagation for users
105-
if (node.is_type<shape_of>())
128+
if (is_shape_of_subgraph_root(node))
106129
node.add_dependant_shape_of_node(&node);
107130

108131
// Add parent shape_of nodes from other dependencies if there are any

src/plugins/intel_gpu/src/graph/program_node.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,7 @@ void program_node::select_preferred_formats(impl_types impl_type) {
658658
}
659659

660660
void program_node::add_dependant_shape_of_node(const program_node* node) {
661-
OPENVINO_ASSERT(node->is_type<shape_of>(), "[GPU] Expected node type is shape_of");
661+
OPENVINO_ASSERT(node->is_type<shape_of>() || node->is_type<input_layout>(), "[GPU] Expected node type is shape_of");
662662
dependant_shape_of_nodes.insert(node);
663663
}
664664

src/plugins/intel_gpu/tests/unit/passes/mark_shape_of_subgraphs_test.cpp

+93
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "select_inst.h"
1919
#include "strided_slice_inst.h"
2020
#include "broadcast_inst.h"
21+
#include "paged_attention_inst.h"
2122
#include "pass_manager.h"
2223
#include "to_string_utils.h"
2324

@@ -31,6 +32,10 @@ static bool check_subgraph(const program_node& node, const program_node& last_no
3132
if (custom_dependant_nodes_count.find(node.id()) != custom_dependant_nodes_count.end())
3233
expected_dependant_nodes = custom_dependant_nodes_count[node.id()];
3334

35+
// Skip custom nodes that are not intended to be part of the shape_of subgraph
36+
if (expected_dependant_nodes == 0)
37+
return true;
38+
3439
if (!node.is_in_shape_of_subgraph() || node.get_dependant_shape_of_nodes().size() != expected_dependant_nodes)
3540
return false;
3641

@@ -423,3 +428,91 @@ TEST(mark_shape_of_subgraphs, broadcast_w_direct_shapeof_and_data) {
423428

424429
ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("broadcast")));
425430
}
431+
432+
TEST(mark_shape_of_subgraphs, paged_attention_max_context_len_input) {
433+
auto& engine = get_test_engine();
434+
auto input_layout_dynamic = layout{ov::PartialShape{ov::Dimension::dynamic(), 4, ov::Dimension::dynamic(), ov::Dimension::dynamic()},
435+
data_types::f32, format::bfyx};
436+
auto target_shape = engine.allocate_memory({ ov::PartialShape{4}, data_types::i32, format::bfyx });
437+
set_values(target_shape, {4, 4, 1, 1});
438+
439+
auto subtract_one = engine.allocate_memory({ ov::PartialShape{1}, data_types::i32, format::bfyx });
440+
set_values(target_shape, {-1});
441+
442+
auto query_layout = layout{ov::PartialShape{ov::Dimension::dynamic(), 128},
443+
data_types::f32,
444+
format::bfyx};
445+
auto key_layout = query_layout;
446+
auto value_layout = query_layout;
447+
auto key_cache_layout = layout{ov::PartialShape{ov::Dimension::dynamic(), 2, 64, 16},
448+
data_types::f32,
449+
format::bfyx};
450+
auto dynamic_i32_layout = layout{ov::PartialShape::dynamic(1), data_types::i32, format::bfyx};
451+
auto value_cache_layout = key_cache_layout;
452+
auto past_lens_layout = dynamic_i32_layout;
453+
auto subsequence_begins_layout = dynamic_i32_layout;
454+
auto block_indices_layout = dynamic_i32_layout;
455+
auto block_indices_begins_layout = dynamic_i32_layout;
456+
auto scale_layout = layout{ov::PartialShape{1}, data_types::f32, format::bfyx};
457+
auto sliding_window_layout = layout{ov::PartialShape{}, data_types::i32, format::bfyx};
458+
auto alibi_layout = layout{ov::PartialShape{}, data_types::f32, format::bfyx};
459+
auto max_context_len_layout = layout{ov::PartialShape{1}, data_types::i32, format::bfyx};;
460+
461+
std::vector<input_info> pa_inputs = {input_info("query"),
462+
input_info("key"),
463+
input_info("value"),
464+
input_info("key_cache"),
465+
input_info("value_cache"),
466+
input_info("past_lens"),
467+
input_info("subsequence_begins"),
468+
input_info("block_indices"),
469+
input_info("block_indices_begins"),
470+
input_info("scale"),
471+
input_info("sliding_window"),
472+
input_info("alibi"),
473+
input_info("max_context_len")};
474+
475+
auto pa_prim = paged_attention("paged_attention", pa_inputs);
476+
pa_prim.head_size = 64;
477+
pa_prim.kv_heads_num = 2;
478+
pa_prim.heads_num = 2;
479+
pa_prim.scale_val = 1.f;
480+
pa_prim.has_alibi = false;
481+
pa_prim.num_outputs = 1;
482+
pa_prim.has_rotated_blocks = false;
483+
484+
topology topology;
485+
topology.add(input_layout("query", query_layout));
486+
topology.add(input_layout("key", key_layout));
487+
topology.add(input_layout("value", value_layout));
488+
topology.add(input_layout("key_cache", key_cache_layout));
489+
topology.add(input_layout("value_cache", value_cache_layout));
490+
topology.add(input_layout("past_lens", past_lens_layout));
491+
topology.add(input_layout("subsequence_begins", subsequence_begins_layout));
492+
topology.add(input_layout("block_indices", block_indices_layout));
493+
topology.add(input_layout("block_indices_begins", block_indices_begins_layout));
494+
topology.add(input_layout("scale", scale_layout));
495+
topology.add(input_layout("sliding_window", sliding_window_layout));
496+
topology.add(input_layout("alibi", alibi_layout));
497+
topology.add(input_layout("max_context_len", max_context_len_layout));
498+
topology.add(input_layout("input", input_layout_dynamic));
499+
topology.add(data("target_shape", target_shape));
500+
topology.add(data("subtract_one", subtract_one));
501+
topology.add(shape_of("shape_of", input_info("input"), data_types::i32));
502+
topology.add(broadcast("broadcast", input_info("shape_of"), input_info("target_shape"), {}, ov::op::BroadcastType::BIDIRECTIONAL));
503+
topology.add(eltwise("subtract_one_max_context_len", input_info("max_context_len"), input_info("subtract_one"), eltwise_mode::sum));
504+
topology.add(eltwise("updated_broadcast", input_info("broadcast"), input_info("subtract_one_max_context_len"), eltwise_mode::sum));
505+
topology.add(reshape("reshape", input_info("input"), input_info("updated_broadcast"), false, ov::PartialShape::dynamic(4)));
506+
topology.add(pa_prim);
507+
508+
ExecutionConfig config = get_test_default_config(engine);
509+
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
510+
config.set_property(ov::intel_gpu::optimize_data(true));
511+
network network(engine, topology, config);
512+
513+
auto prog = network.get_program();
514+
ASSERT_NE(prog, nullptr);
515+
516+
ASSERT_TRUE(check_subgraph(prog->get_node("shape_of"), prog->get_node("updated_broadcast"), {{"updated_broadcast", 2}}));
517+
ASSERT_TRUE(check_subgraph(prog->get_node("max_context_len"), prog->get_node("updated_broadcast"), {{"updated_broadcast", 2}, {"paged_attention", 0}}));
518+
}

src/plugins/intel_npu/src/al/include/intel_npu/config/runtime.hpp

+20
Original file line numberDiff line numberDiff line change
@@ -309,4 +309,24 @@ struct RUN_INFERENCES_SEQUENTIALLY final : OptionBase<RUN_INFERENCES_SEQUENTIALL
309309
}
310310
};
311311

312+
struct DISABLE_VERSION_CHECK final : OptionBase<DISABLE_VERSION_CHECK, bool> {
313+
static std::string_view key() {
314+
return ov::intel_npu::disable_version_check.name();
315+
}
316+
317+
static bool defaultValue() {
318+
return false;
319+
}
320+
321+
#ifdef NPU_PLUGIN_DEVELOPER_BUILD
322+
static std::string_view envVar() {
323+
return "OV_NPU_DISABLE_VERSION_CHECK";
324+
}
325+
#endif
326+
327+
static OptionMode mode() {
328+
return OptionMode::RunTime;
329+
}
330+
};
331+
312332
} // namespace intel_npu

src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -329,5 +329,12 @@ static constexpr ov::Property<std::string> backend_compilation_params{"NPU_BACKE
329329
*/
330330
static constexpr ov::Property<bool> run_inferences_sequentially{"NPU_RUN_INFERENCES_SEQUENTIALLY"};
331331

332+
/**
333+
* @brief [Only for NPU Plugin]
334+
* Type: boolean, default is false.
335+
* This option allows to skip the blob version check
336+
*/
337+
static constexpr ov::Property<bool> disable_version_check{"NPU_DISABLE_VERSION_CHECK"};
338+
332339
} // namespace intel_npu
333340
} // namespace ov

src/plugins/intel_npu/src/al/src/config/runtime.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ void intel_npu::registerRunTimeOptions(OptionsDesc& desc) {
2929
desc.add<WEIGHTS_PATH>();
3030
desc.add<BYPASS_UMD_CACHING>();
3131
desc.add<RUN_INFERENCES_SEQUENTIALLY>();
32+
desc.add<DISABLE_VERSION_CHECK>();
3233
}
3334

3435
// Heuristically obtained number. Varies depending on the values of PLATFORM and PERFORMANCE_HINT

0 commit comments

Comments
 (0)