Skip to content

Commit 467d37c

Browse files
[GPU] Reinterpret a 1-dim memory buffer as a 0-dim memory instead of requesting a 0-byte allocation from OpenCL
1 parent a5c0d67 commit 467d37c

File tree

4 files changed

+138
-9
lines changed

4 files changed

+138
-9
lines changed

src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp

+21
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <ostream>
88
#include <tuple>
9+
#include "intel_gpu/runtime/engine.hpp"
910
#include "intel_gpu/runtime/layout.hpp"
1011
#include "intel_gpu/runtime/memory.hpp"
1112
#include "intel_gpu/runtime/optionals.hpp"
@@ -111,6 +112,26 @@ inline ov::Shape predict_shape(const std::string& name, const cldnn::layout layo
111112
return layout.get_shape();
112113
}
113114

115+
/// Allocates memory for @p layout, tolerating zero-byte layouts.
/// OpenCL rejects zero-size buffer allocations, so for an empty layout a
/// minimal one-element buffer is allocated and then reinterpreted back to
/// the requested (zero-byte) layout.
/// @param _engine  engine used for the allocation
/// @param layout   requested layout (may describe zero bytes)
/// @param type     allocation type to use
/// @param reset    zero-fill the buffer (ignored for the zero-byte path,
///                 where the visible view holds no bytes to reset)
inline cldnn::memory::ptr allocate_memory_evenif_zero_bytes(cldnn::engine& _engine,
                                                            const cldnn::layout& layout,
                                                            cldnn::allocation_type type,
                                                            bool reset = true) {
    if (layout.bytes_count() != 0)
        return _engine.allocate_memory(layout, type, reset);

    // Back the empty layout with a single-element buffer of the same
    // data type/format, then expose it through the original empty layout.
    const cldnn::layout one_elem_layout({1}, layout.data_type, layout.format);
    auto backing_mem = _engine.allocate_memory(one_elem_layout, type, false);
    return _engine.reinterpret_buffer(*backing_mem, layout);
}
127+
128+
/// Convenience overload: selects the engine's preferred lockable allocation
/// type for the layout, then delegates to the main overload above.
inline cldnn::memory::ptr allocate_memory_evenif_zero_bytes(cldnn::engine& _engine,
                                                            const cldnn::layout& layout,
                                                            bool reset = true) {
    const bool is_image = layout.format.is_image_2d();
    const auto alloc_type = _engine.get_lockable_preferred_memory_allocation_type(is_image);
    return allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset);
}
134+
114135
/// WA: Force exit. Any opencl api call can be hang after CL_OUT_OF_RESOURCES.
115136
inline void ForceExit() {
116137
std::cerr << "[GPU] force exit.\n"

src/plugins/intel_gpu/src/graph/loop.cpp

+9-7
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "mutable_data_inst.h"
88
#include "json_object.h"
99
#include "primitive_type_base.h"
10+
#include "intel_gpu/plugin/common_utils.hpp"
1011
#include "intel_gpu/primitives/data.hpp"
1112
#include "intel_gpu/primitives/mutable_data.hpp"
1213
#include "intel_gpu/runtime/error_handler.hpp"
@@ -319,7 +320,7 @@ void loop_inst::update_backedge_mapped_memory() {
319320
// generally, shouldn't go this way, but...
320321
auto output_prim = body_network->get_primitive(back_edge.from);
321322
layout output_layout = output_prim->output_memory().get_layout();
322-
backedge_mem = body_network->get_engine().allocate_memory(output_layout, 0);
323+
backedge_mem = ov::intel_gpu::allocate_memory_evenif_zero_bytes(body_network->get_engine(), output_layout, false);
323324
}
324325
} else {
325326
auto external_id = output_mapping.front()->external_id;
@@ -397,7 +398,7 @@ loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(
397398
<< sliced_layout.get_partial_shape().to_string()
398399
<< " to " << updated_sliced_layout.to_string() << std::endl;
399400
sliced_layout.set_partial_shape(updated_sliced_layout);
400-
inter_mem_ptr = engine.allocate_memory(sliced_layout);
401+
inter_mem_ptr = ov::intel_gpu::allocate_memory_evenif_zero_bytes(engine, sliced_layout);
401402
intern_prim->set_output_layout(sliced_layout, internal_id.idx);
402403
}
403404

@@ -408,7 +409,7 @@ loop_inst::concatenated_memory_mapping::ptr loop_inst::create_concat_memory_map(
408409
sliced_mems.reserve(num_iterations);
409410
sliced_mems.push_back(inter_mem_ptr);
410411
for (int j=1; j < num_iterations; ++j) {
411-
memory::ptr sliced_mem = engine.allocate_memory(sliced_layout);
412+
memory::ptr sliced_mem = ov::intel_gpu::allocate_memory_evenif_zero_bytes(engine, sliced_layout);
412413
sliced_mems.push_back(sliced_mem);
413414
}
414415
}
@@ -500,7 +501,7 @@ void loop_inst::preprocess_input_memory(const int64_t num_iterations) {
500501
// if internal input memory is in backedge, allocate new memory.
501502
// Because internal input memory's data will be updated through backedge process.
502503
if (iter != _back_edges.end()) {
503-
internal_input_memory = body_network->get_engine().allocate_memory(memory->get_layout(), false);
504+
internal_input_memory = ov::intel_gpu::allocate_memory_evenif_zero_bytes(body_network->get_engine(), memory->get_layout(), false);
504505
internal_input_memory->copy_from(body_network->get_stream(), *memory);
505506
GPU_DEBUG_LOG << "Input memory of internal node(" << internal_id.to_string() << ") is set to new memory("
506507
<< internal_input_memory << ", " << internal_input_memory->get_layout().to_short_string()
@@ -723,7 +724,7 @@ void loop_inst::postprocess_output_memory(bool is_dynamic, int64_t current_itera
723724
} else {
724725
if (!output_allocated || shape_changed()) {
725726
auto concat_layout = _impl_params->get_output_layout(external_id.idx);
726-
auto concat_mem = _network.get_engine().allocate_memory(concat_layout, false);
727+
auto concat_mem = ov::intel_gpu::allocate_memory_evenif_zero_bytes(_network.get_engine(), concat_layout, false);
727728
external_outputs[external_id.idx] = concat_mem;
728729
auto iter = std::find_if(concatenated_output_mem_mappings.begin(),
729730
concatenated_output_mem_mappings.end(),
@@ -1082,7 +1083,8 @@ std::vector<event::ptr> loop_inst::handle_buffers_for_next_iteration(const loop_
10821083
// Check backedge_to shape needs to be updated by initial_mem
10831084
OPENVINO_ASSERT(mapping.initial_mem != nullptr, "initial_mem should not be null");
10841085
if (!mapping.initial_mem->get_layout().identical(to_mem->get_layout())) {
1085-
to_mem = body_network->get_engine().allocate_memory(mapping.initial_mem->get_layout(), false);
1086+
to_mem = ov::intel_gpu::allocate_memory_evenif_zero_bytes(body_network->get_engine(), mapping.initial_mem->get_layout(), false);
1087+
10861088
body_network->set_input_data(to_id, to_mem);
10871089
ev = to_mem->copy_from(body_network->get_stream(), *(mapping.initial_mem));
10881090
GPU_DEBUG_LOG << iter << ") [SINGLE] Backedge_to node(" << to_id << ") is set to new memory("
@@ -1104,7 +1106,7 @@ std::vector<event::ptr> loop_inst::handle_buffers_for_next_iteration(const loop_
11041106

11051107
// Check backedge_to shape needs to be updated by backedge_from
11061108
if (!from_mem->get_layout().identical(to_mem->get_layout())) {
1107-
to_mem = body_network->get_engine().allocate_memory(from_mem->get_layout(), false);
1109+
to_mem = ov::intel_gpu::allocate_memory_evenif_zero_bytes(body_network->get_engine(), from_mem->get_layout(), false);
11081110
GPU_DEBUG_LOG << iter << ") [SINGLE] Backedge_to node(" << to_id << ") is set to new memory("
11091111
<< to_mem << ", " << to_mem->get_layout().to_short_string()
11101112
<< ") because of shape update from backedge_from()" << from_id

src/plugins/intel_gpu/src/graph/primitive_inst.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -1982,11 +1982,11 @@ memory::ptr primitive_inst::allocate_output(engine& _engine,
19821982
if ((_node.is_output() && is_reorder_weights) || (!_node.is_output() && _node.is_type<input_layout>()))
19831983
reset = false;
19841984
GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
1985-
return _engine.allocate_memory(layout, alloc_type, reset);
1985+
return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset);
19861986
}
19871987
} else if (!_node.can_share_buffer() || _node.can_be_optimized() || _node.is_output()) {
19881988
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
1989-
return _engine.allocate_memory(layout, alloc_type, reset);
1989+
return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset);
19901990
} else {
19911991
return get_memory_from_pool(_engine,
19921992
net_id,

src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp

+106
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <intel_gpu/runtime/memory.hpp>
88
#include <intel_gpu/runtime/engine.hpp>
99
#include <intel_gpu/graph/network.hpp>
10+
#include "intel_gpu/plugin/common_utils.hpp"
1011
#include <intel_gpu/primitives/input_layout.hpp>
1112
#include "intel_gpu/primitives/eltwise.hpp"
1213
#include <intel_gpu/primitives/data.hpp>
@@ -1212,3 +1213,108 @@ TEST(loop_gpu, support_loop_w_dynamic_input_update_primitive_id) {
12121213
std::vector<float>(),
12131214
2, 3);
12141215
}
1216+
1217+
// Regression test: a loop primitive whose "trip_count" input has a zero-byte
// layout (shape {0}) must still build and execute. Exercises
// allocate_memory_evenif_zero_bytes via the loop's internal allocations.
// NOTE(review): trip_count_mem is declared with shape {0} yet set_values()
// writes one element into it — this relies on the 1-element backing buffer
// created for zero-byte layouts; confirm this is the intended contract.
template <typename T>
void test_loop_gpu_zero_bytes_layout(bool is_caching_test)
{
    auto& engine = get_test_engine();

    // Zero-byte layout: PartialShape({0}) has no elements, so bytes_count() == 0.
    auto trip_count_mem = ov::intel_gpu::allocate_memory_evenif_zero_bytes(engine, { cldnn::layout{ ov::PartialShape({0}), data_types::i32, format::bfyx } });

    auto input_mem = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 4, 5 } });
    auto operand_mem = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 4, 5 } });
    auto initial_condition_mem = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } });
    auto num_iteration_mem = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } });

    std::vector<T> input_data{
        1.0f, 2.0f, -15.f, 3.0f, 4.0f, -15.f, 5.0f, 6.0f, -15.f, 7.0f,
        -15.f, 0.0f, 0.0f, -15.f, 0.5f, -0.5f, -15.f, 8.0f, 1.5f, 5.2f
    };
    std::vector<T> eltwise_operand {
        1.f, -2.f, 3.f, -4.f, 3.0f, -2.0f, 1.f, -2.f, 3.0f, -4.0f,
        3.f, -2.f, 1.f, -2.f, 3.5f, -4.5f, 5.f, -4.f, 3.5f, -2.2f
    };
    int trip_count = 8;
    int initial_condition = 1;

    // Initialize input buffers.
    set_values(input_mem, input_data);
    set_values(operand_mem, eltwise_operand);
    set_values(trip_count_mem, { trip_count });
    set_values(initial_condition_mem, {initial_condition});

    // Loop body: input += eltwise_operand, fed back through the backedge each iteration.
    topology body(
        input_layout("input", input_mem->get_layout()),
        data("eltwise_operand", operand_mem),
        eltwise("eltwise", input_info("input"), input_info("eltwise_operand"), eltwise_mode::sum)
    );

    std::vector<loop::io_primitive_map> input_primitive_maps { loop::io_primitive_map("input", "input") };
    std::vector<loop::io_primitive_map> output_primitive_maps { loop::io_primitive_map("loop", "eltwise") };
    std::vector<loop::backedge_mapping> back_edges { loop::backedge_mapping("eltwise", "input") };

    auto body_program = build_program(engine, body, "", output_primitive_maps, back_edges);

    topology topology(
        input_layout("input", input_mem->get_layout()),
        input_layout("trip_count", trip_count_mem->get_layout()),
        input_layout("initial_condition", initial_condition_mem->get_layout()),
        mutable_data("num_iteration", num_iteration_mem),
        loop("loop", { input_info("num_iteration"), input_info("trip_count"), input_info("initial_condition"), input_info("input") }, body_program,
             "trip_count", "initial_condition", "num_iteration",
             input_primitive_maps, output_primitive_maps, back_edges, 8)
    );

    cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);

    network->set_input_data("input", input_mem);
    network->set_input_data("trip_count", trip_count_mem);
    network->set_input_data("initial_condition", initial_condition_mem);

    auto outputs = network->execute();
    ASSERT_EQ(outputs.size(), 1);
    auto output = outputs.begin()->second.get_memory();
    auto output_layout = output->get_layout();

    ASSERT_EQ(output_layout.batch(), 1);
    ASSERT_EQ(output_layout.feature(), 1);
    ASSERT_EQ(output_layout.spatial(0), 4);
    ASSERT_EQ(output_layout.spatial(1), 5);

    // Value check: after trip_count iterations of "+= operand",
    // each element equals input + operand * trip_count.
    {
        mem_lock<T> output_ptr{ output, get_test_stream() };
        ASSERT_EQ(output_ptr.size(), input_data.size());
        for (size_t i = 0, iend = input_data.size(); i < iend; ++i) {
            ASSERT_FLOAT_EQ(output_ptr[i], input_data[i] + eltwise_operand[i] * trip_count);
        }
    }

    // Rebind the loop output to freshly allocated memory to cover the
    // set_output_memory path.
    layout loop_l = network->get_output_memory("loop")->get_layout();
    auto output_mem = engine.allocate_memory(loop_l);
    network->set_output_memory("loop", output_mem);

    // Re-seed the inputs and execute a second time.
    set_values(input_mem, input_data);
    set_values(operand_mem, eltwise_operand);
    set_values(trip_count_mem, { trip_count });
    set_values(initial_condition_mem, { initial_condition });
    outputs = network->execute();

    // Check everything once again against the same expected values.
    ASSERT_EQ(outputs.size(), 1);
    auto output2 = outputs.begin()->second.get_memory();
    {
        mem_lock<T> output_ptr2{ output2, get_test_stream() };
        ASSERT_EQ(output_ptr2.size(), input_data.size());
        for (size_t i = 0, iend = input_data.size(); i < iend; ++i) {
            ASSERT_FLOAT_EQ(output_ptr2[i], input_data[i] + eltwise_operand[i] * trip_count);
        }
    }
}
1317+
1318+
// Instantiates the zero-byte-layout loop test for float data, without model caching.
TEST(loop_gpu, zero_bytes_layout) {
    test_loop_gpu_zero_bytes_layout<float>(false);
}

0 commit comments

Comments
 (0)