Skip to content

Commit 64564e3

Browse files
[GPU] Reinterpret a 1-dim memory as a 0-dim memory instead of asking OpenCL to allocate a zero-byte layout
1 parent b5a66b0 commit 64564e3

File tree

2 files changed

+127
-4
lines changed

2 files changed

+127
-4
lines changed

src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp

+22-4
Original file line numberDiff line numberDiff line change
@@ -174,23 +174,41 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
174174

175175
check_allocatable(layout, type);
176176

177+
auto zero_bytes_layout = false;
178+
auto non_zero_layout = layout;
179+
if (layout.bytes_count() == 0) {
180+
cldnn::layout zero_dim_layout = layout;
181+
auto mem_ps = zero_dim_layout.get_partial_shape();
182+
for (size_t k = 0; k < mem_ps.size(); k++) {
183+
if (mem_ps[k] == 0)
184+
mem_ps[k] = 1;
185+
}
186+
187+
non_zero_layout = cldnn::layout(mem_ps, zero_dim_layout.data_type, zero_dim_layout.format);
188+
zero_bytes_layout = true;
189+
}
190+
177191
try {
178192
memory::ptr res = nullptr;
179193
if (layout.format.is_image_2d()) {
180-
res = std::make_shared<ocl::gpu_image2d>(this, layout);
194+
res = std::make_shared<ocl::gpu_image2d>(this, non_zero_layout);
181195
} else if (type == allocation_type::cl_mem) {
182-
res = std::make_shared<ocl::gpu_buffer>(this, layout);
196+
res = std::make_shared<ocl::gpu_buffer>(this, non_zero_layout);
183197
} else {
184-
res = std::make_shared<ocl::gpu_usm>(this, layout, type);
198+
res = std::make_shared<ocl::gpu_usm>(this, non_zero_layout, type);
185199
}
186200

187-
if (reset || res->is_memory_reset_needed(layout)) {
201+
if (reset || res->is_memory_reset_needed(non_zero_layout)) {
188202
auto ev = res->fill(get_service_stream());
189203
if (ev) {
190204
get_service_stream().wait_for_events({ev});
191205
}
192206
}
193207

208+
if (zero_bytes_layout) {
209+
res = reinterpret_buffer(*res, layout);
210+
}
211+
194212
return res;
195213
} catch (const cl::Error& clErr) {
196214
switch (clErr.err()) {

src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp

+105
Original file line numberDiff line numberDiff line change
@@ -1212,3 +1212,108 @@ TEST(loop_gpu, support_loop_w_dynamic_input_update_primitive_id) {
12121212
std::vector<float>(),
12131213
2, 3);
12141214
}
1215+
1216+
template <typename T>
1217+
void test_loop_gpu_zero_bytes_layout(bool is_caching_test)
1218+
{
1219+
auto& engine = get_test_engine();
1220+
1221+
// shape for zero bytes layout
1222+
auto trip_count_mem = engine.allocate_memory({ cldnn::layout{ ov::PartialShape({0}), data_types::i32, format::bfyx } });
1223+
1224+
auto input_mem = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 4, 5 } });
1225+
auto operand_mem = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 4, 5 } });
1226+
auto initial_condition_mem = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } });
1227+
auto num_iteration_mem = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } });
1228+
1229+
std::vector<T> input_data{
1230+
1.0f, 2.0f, -15.f, 3.0f, 4.0f, -15.f, 5.0f, 6.0f, -15.f, 7.0f,
1231+
-15.f, 0.0f, 0.0f, -15.f, 0.5f, -0.5f, -15.f, 8.0f, 1.5f, 5.2f
1232+
};
1233+
std::vector<T> eltwise_operand {
1234+
1.f, -2.f, 3.f, -4.f, 3.0f, -2.0f, 1.f, -2.f, 3.0f, -4.0f,
1235+
3.f, -2.f, 1.f, -2.f, 3.5f, -4.5f, 5.f, -4.f, 3.5f, -2.2f
1236+
};
1237+
int trip_count = 8;
1238+
int initial_condition = 1;
1239+
1240+
// initialize input buffers
1241+
set_values(input_mem, input_data);
1242+
set_values(operand_mem, eltwise_operand);
1243+
set_values(trip_count_mem, { trip_count });
1244+
set_values(initial_condition_mem, {initial_condition});
1245+
1246+
topology body(
1247+
input_layout("input", input_mem->get_layout()),
1248+
data("eltwise_operand", operand_mem),
1249+
eltwise("eltwise", input_info("input"), input_info("eltwise_operand"), eltwise_mode::sum)
1250+
);
1251+
1252+
std::vector<loop::io_primitive_map> input_primitive_maps { loop::io_primitive_map("input", "input") };
1253+
std::vector<loop::io_primitive_map> output_primitive_maps { loop::io_primitive_map("loop", "eltwise") };
1254+
std::vector<loop::backedge_mapping> back_edges { loop::backedge_mapping("eltwise", "input") };
1255+
1256+
auto body_program = build_program(engine, body, "", output_primitive_maps, back_edges);
1257+
1258+
topology topology(
1259+
input_layout("input", input_mem->get_layout()),
1260+
input_layout("trip_count", trip_count_mem->get_layout()),
1261+
input_layout("initial_condition", initial_condition_mem->get_layout()),
1262+
mutable_data("num_iteration", num_iteration_mem),
1263+
loop("loop", { input_info("num_iteration"), input_info("trip_count"), input_info("initial_condition"), input_info("input") }, body_program,
1264+
"trip_count", "initial_condition", "num_iteration",
1265+
input_primitive_maps, output_primitive_maps, back_edges, 8)
1266+
);
1267+
1268+
cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);
1269+
1270+
network->set_input_data("input", input_mem);
1271+
network->set_input_data("trip_count", trip_count_mem);
1272+
network->set_input_data("initial_condition", initial_condition_mem);
1273+
1274+
auto outputs = network->execute();
1275+
ASSERT_EQ(outputs.size(), 1);
1276+
auto output = outputs.begin()->second.get_memory();
1277+
auto output_layout = output->get_layout();
1278+
1279+
ASSERT_EQ(output_layout.batch(), 1);
1280+
ASSERT_EQ(output_layout.feature(), 1);
1281+
ASSERT_EQ(output_layout.spatial(0), 4);
1282+
ASSERT_EQ(output_layout.spatial(1), 5);
1283+
1284+
// value check
1285+
{
1286+
mem_lock<T> output_ptr{ output, get_test_stream() };
1287+
ASSERT_EQ(output_ptr.size(), input_data.size());
1288+
for (size_t i = 0, iend = input_data.size(); i < iend; ++i) {
1289+
ASSERT_FLOAT_EQ(output_ptr[i], input_data[i] + eltwise_operand[i] * trip_count);
1290+
}
1291+
}
1292+
1293+
// allocate new output memory
1294+
layout loop_l = network->get_output_memory("loop")->get_layout();
1295+
auto output_mem = engine.allocate_memory(loop_l);
1296+
network->set_output_memory("loop", output_mem);
1297+
1298+
//one more execute
1299+
set_values(input_mem, input_data);
1300+
set_values(operand_mem, eltwise_operand);
1301+
set_values(trip_count_mem, { trip_count });
1302+
set_values(initial_condition_mem, { initial_condition });
1303+
outputs = network->execute();
1304+
1305+
// check everything once again
1306+
ASSERT_EQ(outputs.size(), 1);
1307+
auto output2 = outputs.begin()->second.get_memory();
1308+
{
1309+
mem_lock<T> output_ptr2{ output2, get_test_stream() };
1310+
ASSERT_EQ(output_ptr2.size(), input_data.size());
1311+
for (size_t i = 0, iend = input_data.size(); i < iend; ++i) {
1312+
ASSERT_FLOAT_EQ(output_ptr2[i], input_data[i] + eltwise_operand[i] * trip_count);
1313+
}
1314+
}
1315+
}
1316+
1317+
TEST(loop_gpu, zero_bytes_layout) {
1318+
test_loop_gpu_zero_bytes_layout<float>(false);
1319+
}

0 commit comments

Comments
 (0)