Skip to content

Commit 4034f74

Browse files
steve-yababushk
authored and committed
[GPU] Support bfyx and fsv32 input formats for concat (openvinotoolkit#26372)
### Details:
- Implement shape-agnostic concat ref kernel
- The shape-agnostic concat ref kernel supports bfyx and fsv32 input formats

### Tickets:
- 149462
1 parent a49c19a commit 4034f74

File tree

5 files changed

+117
-29
lines changed

5 files changed

+117
-29
lines changed

src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp

+2-23
Original file line numberDiff line numberDiff line change
@@ -923,35 +923,14 @@ void reorder_inputs::run(program& p, reorder_factory& rf) {
923923
}
924924
};
925925

926-
const auto reorder_input_concat = [&p, &rf](typed_program_node<concatenation>& concat_node) {
927-
auto output_layout = concat_node.get_output_layout();
928-
// Iterate over all dependencies of the concat node
929-
for (size_t i = 0; i < concat_node.get_dependencies().size(); ++i) {
930-
auto dep = concat_node.get_dependency_with_port(i);
931-
const auto& input = dep.first;
932-
auto input_layout = input->get_output_layout();
933-
// Change input data type of concat node from input format to output format
934-
if (input_layout.format != output_layout.format) {
935-
auto new_layout = input_layout;
936-
new_layout.format = output_layout.format;
937-
auto new_input = rf.get_reorder(input->id(), dep.second, input_layout, new_layout);
938-
if (new_input.first) {
939-
p.add_intermediate(new_input.first, concat_node, i);
940-
concat_node.get_dependency_with_port(i).first->recalc_output_layout();
941-
}
942-
}
943-
}
944-
};
945-
946926
for (auto& prim : p.get_processing_order()) {
947-
program_helpers::do_for_types<detection_output, deconvolution, convolution, fully_connected, pooling, concatenation>(
927+
program_helpers::do_for_types<detection_output, deconvolution, convolution, fully_connected, pooling>(
948928
*prim,
949929
reorder_input_detection_output,
950930
reorder_input_and_weights_deconvolution,
951931
reorder_convolution,
952932
reorder_input_fully_connected,
953-
reorder_input_pooling,
954-
reorder_input_concat);
933+
reorder_input_pooling);
955934
}
956935

957936
for (auto n : p.get_processing_order()) {

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/concatenation_gpu_ref.cl

+7-5
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@
66

77
#define GET_INDEX(prefix, ORDER) CAT(prefix, _GET_INDEX)(ORDER)
88

9-
KERNEL(concatenation_gpu_ref)(__global INPUT0_TYPE* input,
10-
__global OUTPUT_TYPE* output,
11-
uint output_offset_in_concat_axis
9+
KERNEL(concatenation_gpu_ref)(
10+
OPTIONAL_SHAPE_INFO_ARG
11+
__global INPUT0_TYPE* input,
12+
__global OUTPUT_TYPE* output,
13+
uint output_offset_in_concat_axis
1214
#if HAS_FUSED_OPS_DECLS
13-
, FUSED_OPS_DECLS
15+
, FUSED_OPS_DECLS
1416
#endif
1517
)
1618
{
@@ -22,7 +24,7 @@ KERNEL(concatenation_gpu_ref)(__global INPUT0_TYPE* input,
2224
#endif
2325
const uint d3 = (uint)get_global_id(2); // B
2426

25-
for (size_t d0 = 0; d0 < INPUT0_SIZES[INPUT_DIM_0]; ++d0) // X
27+
for (size_t d0 = 0; d0 < INPUT0_SIZE_X; ++d0) // X
2628
{
2729
uint input_offset = GET_INDEX(INPUT0, INPUT_DIMS_ORDER);
2830
uint output_offset = GET_INDEX(OUTPUT, OUTPUT_DIMS_ORDER);

src/plugins/intel_gpu/src/kernel_selector/jitter.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
364364
if (_tensor.GetLayout() == DataLayout::bf || _tensor.GetLayout() == DataLayout::bfyx ||
365365
_tensor.GetLayout() == DataLayout::bfzyx || _tensor.GetLayout() == DataLayout::bfwzyx ||
366366
_tensor.GetLayout() == DataLayout::bfuwzyx || _tensor.GetLayout() == DataLayout::bfvuwzyx ||
367-
_tensor.GetLayout() == DataLayout::b_fs_yx_fsv16) {
367+
_tensor.GetLayout() == DataLayout::b_fs_yx_fsv16 || _tensor.GetLayout() == DataLayout::b_fs_yx_fsv32) {
368368
definitions.push_back({_name + "_X_PITCH", "1"});
369369
definitions.push_back({_name + "_Y_PITCH", dims_padded.x()});
370370
definitions.push_back({_name + "_Z_PITCH", toVectorMulString({dims_padded.x(), dims_padded.y()})});

src/plugins/intel_gpu/src/kernel_selector/kernels/concatenation/concatenation_kernel_ref.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ ParamsKey ConcatenationKernelRef::GetSupportedKey() const {
5858
k.EnableConcatAxis(ConcatAxis::BATCH);
5959
k.EnableConcatKernelPerInput();
6060
k.EnableDifferentTypes();
61+
k.EnableDynamicShapesSupport();
6162
return k;
6263
}
6364

src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp

+106
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,112 @@ TEST(concat_cpu_impl, dynamic_4d_f) {
209209
start_concat_test_dynamic(impl_types::cpu);
210210
}
211211

212+
TEST(concat_gpu, dynamic_2d_bfyx_and_b_fs_yx_fsv32) {
213+
auto& engine = get_test_engine();
214+
215+
topology topology(
216+
input_layout("input0", { { 2, 4 }, data_types::f32, format::bfyx }),
217+
input_layout("input1", { { -1, 1 }, data_types::f32, format::bfyx }),
218+
reorder("reorder_input1", input_info("input1"), { { -1, 1 }, data_types::f16, format::b_fs_yx_fsv32 }),
219+
concatenation("concat",
220+
{ input_info("input0"), input_info("reorder_input1") },
221+
1,
222+
data_types::f32)
223+
);
224+
225+
ExecutionConfig config = get_test_default_config(engine);
226+
config.set_property(ov::intel_gpu::optimize_data(false));
227+
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
228+
ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::ocl };
229+
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "concat", impl } }));
230+
231+
auto network = cldnn::network::build_network(engine, topology, config);
232+
233+
layout layout0 = { { 2, 4 }, data_types::f32, format::bfyx };
234+
layout layout1 = { { 2, 1 }, data_types::f32, format::bfyx };
235+
236+
auto input0 = engine.allocate_memory(layout0);
237+
auto input1 = engine.allocate_memory(layout1);
238+
239+
set_values<float>(input0, { 0, 1, 2, 3, 4, 5, 6, 7 });
240+
set_values<float>(input1, { 8, 9 });
241+
VF<float> expected_out = { 0, 1, 2, 3, 8, 4, 5, 6, 7, 9 };
242+
243+
network->set_input_data("input0", input0);
244+
network->set_input_data("input1", input1);
245+
246+
auto outputs = network->execute();
247+
ASSERT_EQ(outputs.size(), size_t(1));
248+
ASSERT_EQ(outputs.begin()->first, "concat");
249+
250+
auto output_memory = outputs.at("concat").get_memory();
251+
auto output_layout = outputs.at("concat").get_layout();
252+
cldnn::mem_lock<float> output_ptr(output_memory, get_test_stream());
253+
254+
ov::PartialShape expected_shape = layout0.get_partial_shape();
255+
expected_shape[1] = layout0.get_partial_shape()[1] +
256+
layout1.get_partial_shape()[1];
257+
258+
ASSERT_EQ(output_layout.get_partial_shape(), expected_shape);
259+
260+
for (size_t i = 0; i < output_layout.count(); ++i) {
261+
ASSERT_EQ(expected_out[i], output_ptr[i]) << " i = " << i;
262+
}
263+
}
264+
265+
TEST(concat_gpu, dynamic_4d_bfyx_and_b_fs_yx_fsv32) {
266+
auto& engine = get_test_engine();
267+
268+
topology topology(
269+
input_layout("input0", { { -1, -1, -1, -1 }, data_types::f32, format::bfyx }),
270+
input_layout("input1", { { -1, -1, -1, -1 }, data_types::f32, format::bfyx }),
271+
reorder("reorder_input1", input_info("input1"), { { -1, -1, -1, -1 }, data_types::f16, format::b_fs_yx_fsv32 }),
272+
concatenation("concat",
273+
{ input_info("input0"), input_info("reorder_input1") },
274+
1,
275+
data_types::f32)
276+
);
277+
278+
ExecutionConfig config = get_test_default_config(engine);
279+
config.set_property(ov::intel_gpu::optimize_data(false));
280+
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
281+
ov::intel_gpu::ImplementationDesc impl = { format::bfyx, "", impl_types::ocl };
282+
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "concat", impl } }));
283+
284+
auto network = cldnn::network::build_network(engine, topology, config);
285+
286+
layout layout0 = { { 2, 4, 1, 1 }, data_types::f32, format::bfyx };
287+
layout layout1 = { { 2, 1, 1, 1 }, data_types::f32, format::bfyx };
288+
289+
auto input0 = engine.allocate_memory(layout0);
290+
auto input1 = engine.allocate_memory(layout1);
291+
292+
set_values<float>(input0, { 0, 1, 2, 3, 4, 5, 6, 7 });
293+
set_values<float>(input1, { 8, 9 });
294+
VF<float> expected_out = { 0, 1, 2, 3, 8, 4, 5, 6, 7, 9 };
295+
296+
network->set_input_data("input0", input0);
297+
network->set_input_data("input1", input1);
298+
299+
auto outputs = network->execute();
300+
ASSERT_EQ(outputs.size(), size_t(1));
301+
ASSERT_EQ(outputs.begin()->first, "concat");
302+
303+
auto output_memory = outputs.at("concat").get_memory();
304+
auto output_layout = outputs.at("concat").get_layout();
305+
cldnn::mem_lock<float> output_ptr(output_memory, get_test_stream());
306+
307+
ov::PartialShape expected_shape = layout0.get_partial_shape();
308+
expected_shape[1] = layout0.get_partial_shape()[1] +
309+
layout1.get_partial_shape()[1];
310+
311+
ASSERT_EQ(output_layout.get_partial_shape(), expected_shape);
312+
313+
for (size_t i = 0; i < output_layout.count(); ++i) {
314+
ASSERT_EQ(expected_out[i], output_ptr[i]) << " i = " << i;
315+
}
316+
}
317+
212318
TEST(concat_gpu, dynamic_6d_f) {
213319
auto& engine = get_test_engine();
214320

0 commit comments

Comments
 (0)