
Commit 25d9b04

[GPU] Fix dynamic loop's layout-mismatch issue when multiple shapes are inferred
1 parent 5872549 commit 25d9b04

2 files changed (+207, -5)

src/plugins/intel_gpu/src/graph/loop.cpp (+21, -5)
@@ -375,16 +375,18 @@ loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(
     if (extern_mem_ptr != nullptr) {
         layout sliced_layout = intern_prim->get_output_layout(internal_id.idx);
         auto inter_mem_ptr = intern_prim->output_memory_ptr(internal_id.idx);
-        if (inter_mem_ptr == nullptr) {
+        if (inter_mem_ptr == nullptr || shape_changed()) {
             // if inner body intern_prim has no output memory because it has dynamic shape,
             // calculate inner body intern_prim layout using concat_mem's layout.
             auto updated_sliced_layout = sliced_layout.get_partial_shape();
             OPENVINO_ASSERT(updated_sliced_layout[io_prim_map.axis].is_static() || num_iterations > 0,
                             "Not allowed dynamic dimension for axis when num_iteraiont is negative");
+
+            auto origin_input_pshape = body_network->get_primitive(internal_id.pid)->get_node_output_layout().get_partial_shape();
             auto concat_pshape = extern_prim->get_output_layout().get_partial_shape();
             const auto shape_size = concat_pshape.size();
             for (size_t i = 0; i < shape_size; i++) {
-                if (updated_sliced_layout[i].is_dynamic()) {
+                if (origin_input_pshape[i].is_dynamic()) {
                     updated_sliced_layout[i] = concat_pshape[i];
                 }
             }
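
In short, this hunk recomputes the sliced layout not only when the inner primitive has no allocated output but also whenever shape_changed() signals a new input shape, and it tests for dynamic dimensions against the body input's original partial shape instead of the possibly already-overwritten sliced layout. A minimal standalone sketch of the dimension-update step (illustrative names, not the plugin's internal API):

    #include <openvino/core/partial_shape.hpp>

    // Replace only those dimensions that were dynamic in the body input's
    // original (node-time) partial shape with the concrete extents taken
    // from the concatenated external memory's shape.
    ov::PartialShape update_sliced_shape(const ov::PartialShape& origin_input_pshape,
                                         const ov::PartialShape& concat_pshape,
                                         ov::PartialShape sliced_pshape) {
        for (size_t i = 0; i < concat_pshape.size(); ++i) {
            if (origin_input_pshape[i].is_dynamic())
                sliced_pshape[i] = concat_pshape[i];
        }
        return sliced_pshape;
    }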
@@ -464,7 +466,13 @@ void loop_inst::preprocess_input_memory(const int64_t num_iterations) {
            continue;
        }

-       auto memory = input_memory_ptr(memory_num);
+       auto memory_origin = input_memory_ptr(memory_num);
+       auto memory = memory_origin;
+       auto input_layout = _impl_params->get_input_layout(memory_num);
+       if (memory_origin->get_layout() != input_layout) {
+           memory = _network.get_engine().reinterpret_buffer(*memory_origin, input_layout);
+       }
+
        for (size_t i = 0; i < input_map_ptrs.size(); ++i) {
            const auto input_map = input_map_ptrs.at(i);
            const auto& external_id = input_map->external_id;
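
The change above avoids reallocating when the stored input memory still carries a layout from a previous shape: the buffer is reinterpreted to the layout produced by the latest shape inference. reinterpret_buffer wraps the existing allocation in a new memory object, so no data is copied. The pattern, as a minimal sketch (hypothetical helper name):

    // Return a view of `mem` with the `expected` layout; the underlying
    // allocation is shared, so this is a zero-copy fix-up.
    cldnn::memory::ptr align_layout(cldnn::engine& engine,
                                    cldnn::memory::ptr mem,
                                    const cldnn::layout& expected) {
        if (mem->get_layout() != expected)
            return engine.reinterpret_buffer(*mem, expected);
        return mem;
    }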
@@ -549,16 +557,24 @@ void loop_inst::preprocess_backedge_memory() {
         if (is_dynamic()) {
             if (output_prim->outputs_allocated()) {
                 auto internal_output_prim_mem = output_prim->output_memory_ptr();
-                if (internal_output_prim_mem->get_layout() == initial_mem->get_layout()) {
+                auto input_prim = body_network->get_primitive(back_edge.to);
+                auto input_layout = input_prim->_impl_params->get_output_layout();
+
+                if (internal_output_prim_mem->get_layout() == initial_mem->get_layout() &&
+                    internal_output_prim_mem->get_layout() == input_layout) {
                     backedge_mem = internal_output_prim_mem;
                     body_network->set_input_data(back_edge.to, backedge_mem);
                     GPU_DEBUG_LOG << idx << ") Get backedge_mem(" << backedge_mem
                                   << ") from back_edge.from(" << back_edge.from << ")" << std::endl;
                 } else {
+                    auto update_initial_memory = initial_mem;
+                    if (update_initial_memory->get_layout() != input_layout) {
+                        update_initial_memory = _network.get_engine().reinterpret_buffer(*initial_mem, input_layout);
+                    }
                     // When input layout is changed or backedge_mem is null
                     // because output layout of body network is not calculated yet,
                     // Set backedge_mem to nullptr and update it after first execution.
-                    body_network->set_input_data(back_edge.to, initial_mem);
+                    body_network->set_input_data(back_edge.to, update_initial_memory);
                     GPU_DEBUG_LOG << idx << ") Just set input data using initial_mem because back_edge.from("
                                   << back_edge.from << ") layout is changed or backedge_mem is nullptr" << std::endl;
                 }
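
This hunk tightens the reuse condition for back-edge memory: the body network's own output allocation is fed back only when its layout matches both the initial memory's layout and the layout expected by the body input primitive; otherwise the initial memory is first reinterpreted to that expected layout. The reuse predicate, as a small sketch (hypothetical helper, not the plugin's API):

    // Reuse is safe only when all three layouts agree; a mismatch on either
    // side forces the initial memory to be reinterpreted to the body input
    // layout before being fed into the body network.
    bool can_reuse_backedge_memory(const cldnn::layout& internal_output,
                                   const cldnn::layout& initial,
                                   const cldnn::layout& body_input) {
        return internal_output == initial && internal_output == body_input;
    }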

src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp (+186, -0)
@@ -11,6 +11,9 @@
 #include "intel_gpu/primitives/eltwise.hpp"
 #include <intel_gpu/primitives/data.hpp>
 #include <intel_gpu/primitives/loop.hpp>
+#include <intel_gpu/primitives/reshape.hpp>
+#include <intel_gpu/primitives/reduce.hpp>
+#include <intel_gpu/primitives/shape_of.hpp>
 #include <intel_gpu/primitives/mutable_data.hpp>
 #include <intel_gpu/primitives/data.hpp>
 #include <intel_gpu/graph/program.hpp>
@@ -601,3 +604,186 @@ TEST(loop_gpu, support_dynamic_tensoriterator_outer_axis) {

     test_loop_gpu_wo_trip_count({ 2, 1, 1, 2}, { 2, 5, 1, 2}, input_data_5_4, output_data_5_4, 1, 4);
 }
+
+static void test_loop_gpu_wo_trip_count_w_multiple_shapes(ov::PartialShape body_input_layout,
+                                                          std::vector<ov::PartialShape> whole_layouts,
+                                                          std::vector<std::vector<float>> input_data_list,
+                                                          std::vector<float> expected_output_data,
+                                                          size_t axis,
+                                                          size_t exit_value,
+                                                          bool is_caching_test = false) {
+    auto& engine = get_test_engine();
+
+    auto b_input_layout = cldnn::layout{ body_input_layout, data_types::f32, format::bfyx };
+
+    ov::PartialShape sliced_input_shape = body_input_layout;
+    sliced_input_shape[axis] = 1;
+    auto sliced_input_layout = cldnn::layout{ sliced_input_shape, data_types::f32, format::bfyx };
+
+    auto const_layout = cldnn::layout{ {}, data_types::i64, format::bfyx };
+
+    auto e_initial_condition_mem = engine.allocate_memory(const_layout);
+    auto e_num_iteration_mem = engine.allocate_memory(const_layout);
+    auto b_exit_value_mem = engine.allocate_memory(const_layout);
+    auto b_index_inc_mem = engine.allocate_memory(const_layout);
+
+    // initialize input buffers
+    set_values(e_initial_condition_mem, {1});
+    set_values(b_exit_value_mem, {exit_value});
+    set_values(b_index_inc_mem, {1});
+    set_values(e_num_iteration_mem, {0});
+
+    primitive_id body_current_iteration_id = "b_index";
+    primitive_id body_execution_condition_id = "b_cond_exit_value";
+
+    cldnn::topology body(
+        input_layout(body_current_iteration_id, const_layout),
+        input_layout("b_add_data", sliced_input_layout),
+        input_layout("b_mul_data", sliced_input_layout),
+        data("b_exit_value", b_exit_value_mem),
+        data("b_index_inc", b_index_inc_mem),
+        eltwise("b_index_update", input_info(body_current_iteration_id), input_info("b_index_inc"), eltwise_mode::sum),
+        reorder("b_index_cast", input_info("b_index_update"),
+                cldnn::format::any, data_types::f32, {}, cldnn::reorder_mean_mode::subtract, cldnn::padding(), true),
+        eltwise(body_execution_condition_id, input_info("b_index"), input_info("b_exit_value"), eltwise_mode::lt),
+        eltwise("b_add", input_info("b_add_data"), input_info("b_index_cast"), eltwise_mode::sum),
+        eltwise("b_mul", input_info("b_mul_data"), input_info("b_index_cast"), eltwise_mode::prod));
+
+    primitive_id trip_count_id = "";
+    primitive_id actual_iteration_count_id = "actual_iteration_count";
+    primitive_id initial_condition_id = "initial_condition";
+    int64_t num_iterations = -1;
+
+    std::vector<loop::io_primitive_map> input_primitive_maps {
+        loop::io_primitive_map("input", "b_add_data", axis),
+        loop::io_primitive_map("input", "b_mul_data", axis),
+        loop::io_primitive_map(actual_iteration_count_id, body_current_iteration_id) };
+    std::vector<loop::io_primitive_map> output_primitive_maps {
+        loop::io_primitive_map(cldnn::input_info("loop", 0), cldnn::input_info("b_add", 0), axis),
+        loop::io_primitive_map(cldnn::input_info("loop", 1), cldnn::input_info("b_mul", 0), axis) };
+    std::vector<loop::backedge_mapping> back_edges {
+        loop::backedge_mapping("b_index_update", body_current_iteration_id) };
+
+    auto body_program = build_program(engine, body, body_execution_condition_id, output_primitive_maps, back_edges, true);
+
+    auto const_shape = engine.allocate_memory({ov::PartialShape{4}, data_types::i32, format::bfyx});
+    std::vector<int32_t> body_input_layouts;
+    for (size_t i = 0; i < body_input_layout.size(); i++) {
+        if (body_input_layout[i].is_dynamic())
+            body_input_layouts.push_back(-1);
+        else
+            body_input_layouts.push_back(body_input_layout[i].get_length());
+    }
+    set_values<int32_t>(const_shape, body_input_layouts);
+
+    cldnn::topology topology(
+        input_layout("input_origin", b_input_layout),
+        input_layout(initial_condition_id, e_initial_condition_mem->get_layout()),
+        mutable_data(actual_iteration_count_id, e_num_iteration_mem),
+
+        shape_of("shape_of_input", input_info("input_origin"), data_types::i32),
+        reduce("reduced_shape", input_info("shape_of_input"), reduce_mode::prod, {0}, true),
+        reshape("reshape1", input_info("input_origin"), input_info("reduced_shape"), false, ov::PartialShape::dynamic(1)),
+        data("const", const_shape),
+        reshape("input", input_info("reshape1"), input_info("const"), false, ov::PartialShape::dynamic(4)),
+
+        loop("loop", { input_info(actual_iteration_count_id), input_info(initial_condition_id), input_info("input") }, body_program,
+             trip_count_id, initial_condition_id, actual_iteration_count_id,
+             input_primitive_maps, output_primitive_maps, back_edges,
+             num_iterations, body_current_iteration_id, body_execution_condition_id, 2),
+        eltwise("out_sum", input_info("loop", 0), input_info("loop", 1), eltwise_mode::sum));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    cldnn::network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
+
+    for (size_t i = 0 ; i < whole_layouts.size(); i++) {
+        auto whole_layout = whole_layouts[i];
+        auto input_data = input_data_list[i];
+
+        // initialize input buffers
+        set_values(e_initial_condition_mem, {1});
+        set_values(b_exit_value_mem, {exit_value});
+        set_values(b_index_inc_mem, {1});
+        set_values(e_num_iteration_mem, {0});
+
+        auto e_input_layout = cldnn::layout{ whole_layout, data_types::f32, format::bfyx };
+        auto e_input_mem = engine.allocate_memory(e_input_layout); // b,f,x,y
+        auto expected_output_layout = whole_layout;
+        set_values(e_input_mem, input_data);
+        network->set_input_data("input_origin", e_input_mem);
+
+        network->set_input_data(initial_condition_id, e_initial_condition_mem);
+
+        auto outputs = network->execute();
+        ASSERT_EQ(outputs.size(), 1);
+
+        auto expected_num_iterations = (exit_value + 1);
+        expected_output_layout[axis] = expected_num_iterations;
+        auto e_output_layout = cldnn::layout{ expected_output_layout, data_types::f32, format::bfyx };
+
+        auto num_iter_mem = network->get_output_memory(actual_iteration_count_id);
+        if (num_iter_mem != nullptr) {
+            mem_lock<int64_t> num_iter_ptr{ num_iter_mem, get_test_stream() };
+            ASSERT_EQ(num_iter_ptr.data()[0], expected_num_iterations);
+        }
+
+        std::vector<float> expected(input_data.size());
+        if (expected_output_data.size() == 0) {
+            size_t unit = 1;
+            for (size_t k = axis; k < whole_layout.size(); k++) {
+                unit *= whole_layout[k].get_length();
+            }
+
+            for (size_t j = 0; j < input_data.size(); j++) {
+                auto val = static_cast<size_t>((j % unit) / 4) + 1;
+                expected[j] = static_cast<float>(input_data[j] + val) + static_cast<float>(input_data[j] * val);
+            }
+        } else {
+            expected = expected_output_data;
+        }
+
+        auto output_mem = outputs.begin()->second.get_memory();
+        auto output_layout = output_mem->get_layout();
+        ASSERT_EQ(output_layout.batch(), e_output_layout.batch());
+        ASSERT_EQ(output_layout.feature(), e_output_layout.feature());
+        ASSERT_EQ(output_layout.spatial(0), e_output_layout.spatial(0));
+        ASSERT_EQ(output_layout.spatial(1), e_output_layout.spatial(1));
+        // value check
+        {
+            mem_lock<float> output_ptr{ output_mem, get_test_stream() };
+            for (size_t i = 0, iend = output_layout.count(); i < iend; ++i) {
+                ASSERT_FLOAT_EQ(output_ptr[i], expected.at(i));
+            }
+        }
+    }
+}
+
+std::vector<float> input_data_4_4{
+    1.0f, 2.0f, -15.f, 3.0f,
+    4.0f, -15.f, 5.0f, 6.0f,
+    -15.f, 7.0f, -15.f, 0.0f,
+    0.0f, -15.f, 0.5f, -0.5f,
+};
+
+std::vector<float> input_data_2_4_4{
+    1.0f, 2.0f, -15.f, 3.0f,
+    4.0f, -15.f, 5.0f, 6.0f,
+    -15.f, 7.0f, -15.f, 0.0f,
+    0.0f, -15.f, 0.5f, -0.5f,
+
+    1.0f, 2.0f, -15.f, 3.0f,
+    4.0f, -15.f, 5.0f, 6.0f,
+    -15.f, 7.0f, -15.f, 0.0f,
+    0.0f, -15.f, 0.5f, -0.5f,
+};
+
+TEST(loop_gpu, support_loop_w_dynamic_input_w_various_shapes) {
+    test_loop_gpu_wo_trip_count_w_multiple_shapes(
+        { 1, -1, 4, 4 },
+        {{ 1, 1, 4, 4 }, { 1, 2, 4, 4 }}, // axis value should be iter_num = (exit_value + 1)
+        {input_data_4_4, input_data_2_4_4},
+        std::vector<float>(),
+        2, 3);
+}
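
A note on the added test, reading from the code above: both runs slice along axis 2, which spans 4 rows of 4 elements, so unit = 4 * 4 = 16 and row r (0-based) within each 16-element block is processed in iteration r, where the body sees index value v = r + 1. The loop's two outputs are slice + v and slice * v, and out_sum adds them, which is exactly what the fallback expectation reproduces: expected[j] = (x + v) + (x * v) with v = (j mod 16) / 4 + 1. Running the second, larger shape { 1, 2, 4, 4 } through the same network object is what exercises the shape_changed() and reinterpret_buffer paths added in loop.cpp.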
