Commit b87b885

[GPU] Fix dynamic loop's shape-mismatch issue when multiple shapes are inferred

1 parent b7c8107 commit b87b885

3 files changed, +172 −1 lines changed

src/plugins/intel_gpu/src/graph/include/loop_inst.h (+1)

@@ -298,6 +298,7 @@ class typed_primitive_inst<loop> : public typed_primitive_inst_base<loop> {
     std::vector<backedge_memory_mapping> backedge_memory_mappings;
     std::vector<concatenated_memory_mapping::ptr> concatenated_input_mem_mappings;
     std::vector<concatenated_memory_mapping::ptr> concatenated_output_mem_mappings;
+    ov::PartialShape dynamic_sliced_layout;
 
     static std::string to_string(const loop_node& node);
 
src/plugins/intel_gpu/src/graph/loop.cpp (+9 −1)

@@ -375,12 +375,20 @@ loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(
     if (extern_mem_ptr != nullptr) {
         layout sliced_layout = intern_prim->get_output_layout(internal_id.idx);
         auto inter_mem_ptr = intern_prim->output_memory_ptr(internal_id.idx);
-        if (inter_mem_ptr == nullptr) {
+        if (inter_mem_ptr == nullptr || shape_changed()) {
            // if the inner body intern_prim has no output memory because it has a dynamic shape,
            // calculate the inner body intern_prim layout using concat_mem's layout.
            auto updated_sliced_layout = sliced_layout.get_partial_shape();
            OPENVINO_ASSERT(updated_sliced_layout[io_prim_map.axis].is_static() || num_iterations > 0,
                            "Dynamic dimension is not allowed for axis when num_iterations is negative");
+
+           // Save or restore the dynamic sliced layout so it can be updated for various shapes
+           if (updated_sliced_layout.is_dynamic()) {
+               dynamic_sliced_layout = updated_sliced_layout;
+           } else if (dynamic_sliced_layout.is_dynamic()) {
+               updated_sliced_layout = dynamic_sliced_layout;
+           }
+
            auto concat_pshape = extern_prim->get_output_layout().get_partial_shape();
            const auto shape_size = concat_pshape.size();
            for (size_t i = 0; i < shape_size; i++) {

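The core of the fix is caching the slice shape across executions. The first time a dynamic sliced layout is seen, it is saved into the new dynamic_sliced_layout member; on a later execution with a changed input shape, the saved dynamic template is restored so the slice dimensions are recomputed from the new concat layout instead of reusing stale static values. Below is a minimal, self-contained sketch of that save-or-restore pattern, assuming only ov::PartialShape from the OpenVINO core API; sliced_layout_cache and update() are illustrative names, not part of the plugin.

    #include <openvino/core/partial_shape.hpp>
    #include <iostream>

    // Illustrative stand-in for the state this commit adds to loop_inst.
    struct sliced_layout_cache {
        // Default-constructed (static, rank 0) until a dynamic layout is seen.
        ov::PartialShape dynamic_sliced_layout;

        // Mirrors the save-or-restore logic in create_concat_memory_map():
        // remember the first dynamic layout, and re-apply it when a later
        // execution would otherwise reuse a stale static layout.
        ov::PartialShape update(ov::PartialShape sliced) {
            if (sliced.is_dynamic()) {
                dynamic_sliced_layout = sliced;    // save the dynamic template
            } else if (dynamic_sliced_layout.is_dynamic()) {
                sliced = dynamic_sliced_layout;    // restore it for recalculation
            }
            return sliced;
        }
    };

    int main() {
        sliced_layout_cache cache;
        // First execution: the sliced layout is still dynamic along one axis.
        std::cout << cache.update(ov::PartialShape{1, -1, 1, 4}) << '\n';  // [1,?,1,4]
        // Second execution with a new input shape: the static layout left over
        // from the first run is replaced by the saved dynamic template, so the
        // axis dimension is re-deduced instead of being silently reused.
        std::cout << cache.update(ov::PartialShape{1, 1, 1, 4}) << '\n';   // [1,?,1,4]
    }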
src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp (+162)

@@ -601,3 +601,165 @@ TEST(loop_gpu, support_dynamic_tensoriterator_outer_axis) {
 
     test_loop_gpu_wo_trip_count({ 2, 1, 1, 2}, { 2, 5, 1, 2}, input_data_5_4, output_data_5_4, 1, 4);
 }
+
+static void test_loop_gpu_wo_trip_count_w_multiple_shapes(ov::PartialShape body_input_layout,
+                                                          std::vector<ov::PartialShape> whole_layouts,
+                                                          std::vector<std::vector<float>> input_data_list,
+                                                          std::vector<float> expected_output_data,
+                                                          size_t axis,
+                                                          size_t exit_value,
+                                                          bool is_caching_test = false) {
+    auto& engine = get_test_engine();
+
+    auto b_input_layout = cldnn::layout{ body_input_layout, data_types::f32, format::bfyx };
+    auto const_layout = cldnn::layout{ {}, data_types::i64, format::bfyx };
+
+    auto e_initial_condition_mem = engine.allocate_memory(const_layout);
+    auto e_num_iteration_mem = engine.allocate_memory(const_layout);
+    auto b_exit_value_mem = engine.allocate_memory(const_layout);
+    auto b_index_inc_mem = engine.allocate_memory(const_layout);
+
+    // initialize input buffers
+    set_values(e_initial_condition_mem, {1});
+    set_values(b_exit_value_mem, {exit_value});
+    set_values(b_index_inc_mem, {1});
+    set_values(e_num_iteration_mem, {0});
+
+    primitive_id body_current_iteration_id = "b_index";
+    primitive_id body_execution_condition_id = "b_cond_exit_value";
+
+    cldnn::topology body(
+        input_layout(body_current_iteration_id, const_layout),
+        input_layout("b_add_data", b_input_layout),
+        input_layout("b_mul_data", b_input_layout),
+        data("b_exit_value", b_exit_value_mem),
+        data("b_index_inc", b_index_inc_mem),
+        eltwise("b_index_update", input_info(body_current_iteration_id), input_info("b_index_inc"), eltwise_mode::sum),
+        reorder("b_index_cast", input_info("b_index_update"),
+                cldnn::format::any, data_types::f32, {}, cldnn::reorder_mean_mode::subtract, cldnn::padding(), true),
+        eltwise(body_execution_condition_id, input_info("b_index"), input_info("b_exit_value"), eltwise_mode::lt),
+        eltwise("b_add", input_info("b_add_data"), input_info("b_index_cast"), eltwise_mode::sum),
+        eltwise("b_mul", input_info("b_mul_data"), input_info("b_index_cast"), eltwise_mode::prod));
+
+    primitive_id trip_count_id = "";
+    primitive_id actual_iteration_count_id = "actual_iteration_count";
+    primitive_id initial_condition_id = "initial_condition";
+    int64_t num_iterations = -1;
+
+    std::vector<loop::io_primitive_map> input_primitive_maps {
+        loop::io_primitive_map("input", "b_add_data", axis),
+        loop::io_primitive_map("input", "b_mul_data", axis),
+        loop::io_primitive_map(actual_iteration_count_id, body_current_iteration_id) };
+    std::vector<loop::io_primitive_map> output_primitive_maps {
+        loop::io_primitive_map(cldnn::input_info("loop", 0), cldnn::input_info("b_add", 0), axis),
+        loop::io_primitive_map(cldnn::input_info("loop", 1), cldnn::input_info("b_mul", 0), axis) };
+    std::vector<loop::backedge_mapping> back_edges {
+        loop::backedge_mapping("b_index_update", body_current_iteration_id) };
+
+    auto body_program = build_program(engine, body, body_execution_condition_id, output_primitive_maps, back_edges, true);
+
+    cldnn::topology topology(
+        input_layout("input", b_input_layout),
+        input_layout(initial_condition_id, e_initial_condition_mem->get_layout()),
+        mutable_data(actual_iteration_count_id, e_num_iteration_mem),
+        loop("loop", { input_info(actual_iteration_count_id), input_info(initial_condition_id), input_info("input") }, body_program,
+             trip_count_id, initial_condition_id, actual_iteration_count_id,
+             input_primitive_maps, output_primitive_maps, back_edges,
+             num_iterations, body_current_iteration_id, body_execution_condition_id, 2),
+        eltwise("out_sum", input_info("loop", 0), input_info("loop", 1), eltwise_mode::sum));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
+
+    for (size_t i = 0; i < whole_layouts.size(); i++) {
+        auto whole_layout = whole_layouts[i];
+        auto input_data = input_data_list[i];
+
+        // re-initialize input buffers for each shape
+        set_values(e_initial_condition_mem, {1});
+        set_values(b_exit_value_mem, {exit_value});
+        set_values(b_index_inc_mem, {1});
+        set_values(e_num_iteration_mem, {0});
+
+        auto e_input_layout = cldnn::layout{ whole_layout, data_types::f32, format::bfyx };
+        auto e_input_mem = engine.allocate_memory(e_input_layout); // b,f,x,y
+        auto expected_output_layout = whole_layout;
+        set_values(e_input_mem, input_data);
+        network->set_input_data("input", e_input_mem);
+
+        network->set_input_data(initial_condition_id, e_initial_condition_mem);
+
+        auto outputs = network->execute();
+        ASSERT_EQ(outputs.size(), 1);
+
+        auto expected_num_iterations = (exit_value + 1);
+        expected_output_layout[axis] = expected_num_iterations;
+        auto e_output_layout = cldnn::layout{ expected_output_layout, data_types::f32, format::bfyx };
+
+        auto num_iter_mem = network->get_output_memory(actual_iteration_count_id);
+        if (num_iter_mem != nullptr) {
+            mem_lock<int64_t> num_iter_ptr{ num_iter_mem, get_test_stream() };
+            ASSERT_EQ(num_iter_ptr.data()[0], expected_num_iterations);
+        }
+
+        std::vector<float> expected(input_data.size());
+        if (expected_output_data.size() == 0) {
+            size_t unit = 1;
+            for (size_t k = axis; k < whole_layout.size(); k++) {
+                unit *= whole_layout[k].get_length();
+            }
+
+            for (size_t j = 0; j < input_data.size(); j++) {
+                auto val = static_cast<size_t>((j % unit) / 4) + 1;
+                expected[j] = static_cast<float>(input_data[j] + val) + static_cast<float>(input_data[j] * val);
+            }
+        } else {
+            expected = expected_output_data;
+        }
+
+        auto output_mem = outputs.begin()->second.get_memory();
+        auto output_layout = output_mem->get_layout();
+        ASSERT_EQ(output_layout.batch(), e_output_layout.batch());
+        ASSERT_EQ(output_layout.feature(), e_output_layout.feature());
+        ASSERT_EQ(output_layout.spatial(0), e_output_layout.spatial(0));
+        ASSERT_EQ(output_layout.spatial(1), e_output_layout.spatial(1));
+        // value check
+        {
+            mem_lock<float> output_ptr{ output_mem, get_test_stream() };
+            for (size_t i = 0, iend = output_layout.count(); i < iend; ++i) {
+                ASSERT_FLOAT_EQ(output_ptr[i], expected.at(i));
+            }
+        }
+    }
+}
+
+std::vector<float> input_data_4_4{
+    1.0f, 2.0f, -15.f, 3.0f,
+    4.0f, -15.f, 5.0f, 6.0f,
+    -15.f, 7.0f, -15.f, 0.0f,
+    0.0f, -15.f, 0.5f, -0.5f,
+};
+
+std::vector<float> input_data_2_4_4{
+    1.0f, 2.0f, -15.f, 3.0f,
+    4.0f, -15.f, 5.0f, 6.0f,
+    -15.f, 7.0f, -15.f, 0.0f,
+    0.0f, -15.f, 0.5f, -0.5f,
+
+    1.0f, 2.0f, -15.f, 3.0f,
+    4.0f, -15.f, 5.0f, 6.0f,
+    -15.f, 7.0f, -15.f, 0.0f,
+    0.0f, -15.f, 0.5f, -0.5f,
+};
+
+TEST(loop_gpu, support_loop_w_dynamic_input_w_various_shapes) {
+    test_loop_gpu_wo_trip_count_w_multiple_shapes(
+        { 1, -1, 1, 4 },
+        {{ 1, 1, 4, 4 }, { 1, 2, 4, 4 }}, // the output's axis dim becomes iter_num = (exit_value + 1)
+        {input_data_4_4, input_data_2_4_4},
+        std::vector<float>(),
+        2, 3);
+}
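The new test drives the fix end-to-end: the loop body input is declared with a dynamic dimension ({ 1, -1, 1, 4 }), and the same compiled network is executed twice with different concrete shapes ({ 1, 1, 4, 4 }, then { 1, 2, 4, 4 }). With axis = 2 and exit_value = 3 the loop runs iter_num = 4 iterations; iteration i (1-based, via the b_index_update backedge) computes b_add = slice + i and b_mul = slice * i, and out_sum adds the two loop outputs, so each output element is (x + i) + (x * i). The sketch below replays that expected-value math for the first input row; it is a standalone illustration, not part of the test file.

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // For whole_layout = {1, 1, 4, 4} and axis = 2 (the values used by the
    // test above): unit = 4 * 4 = 16, so val = (j % 16) / 4 + 1 is simply
    // the 1-based row index, i.e. the loop iteration that produced that
    // slice of the output.
    int main() {
        const std::vector<float> row = {1.0f, 2.0f, -15.f, 3.0f};  // first row of input_data_4_4
        const float val = 1.0f;  // iteration index for row 0
        for (std::size_t j = 0; j < row.size(); ++j) {
            // b_add yields x + val, b_mul yields x * val, out_sum adds the two.
            std::printf("%.1f\n", (row[j] + val) + (row[j] * val));
        }
    }

Assuming a standard OpenVINO build, the test should be runnable through the GPU plugin's unit-test binary (the binary name depends on the build setup) with --gtest_filter=loop_gpu.support_loop_w_dynamic_input_w_various_shapes.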
