Commit 1a26e6f
[GPU] Fuse more 'type conversion only' reorders (openvinotoolkit#25270)
### Details:
- A reorder that only performs type conversion can now be fused into the prior node.

### Tickets:
- 144957
Parent: 626966b

15 files changed: +252 −59 lines
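At the core of the change is the notion of a "type conversion only" reorder, checked via `node.is_type_conversion_only()` in the `layout_optimizer.cpp` diff below. A minimal sketch of what such a predicate amounts to, using hypothetical stand-in types rather than cldnn's real `layout`:

```cpp
#include <array>
#include <cstdint>

// Hypothetical stand-ins for illustration; cldnn's layout/data_type differ.
enum class data_type { f16, f32, i8, i32 };
enum class format_tag { bfyx, bfzyx };

struct layout {
    data_type dt;
    format_tag fmt;
    std::array<int64_t, 4> shape;
};

// A reorder is "type conversion only" when its input and output layouts
// agree on format and shape and differ solely in element type.
bool is_type_conversion_only(const layout& in, const layout& out) {
    return in.fmt == out.fmt && in.shape == out.shape && in.dt != out.dt;
}
```

Such a reorder moves no data across layouts, so the producer's kernel can absorb the conversion into its final store.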

src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp

+13 −4
@@ -11,6 +11,10 @@
 #include "convert_color_inst.h"
 #include "one_hot_inst.h"
 #include "shape_of_inst.h"
+#include "gather_inst.h"
+#include "select_inst.h"
+#include "eltwise_inst.h"
+#include "broadcast_inst.h"
 #include "permute_inst.h"
 #include "depth_to_space_inst.h"
 #include "concatenation_inst.h"
@@ -410,8 +414,11 @@ void remove_redundant_reorders::run(program& p) {
            continue;

        bool same_data_type = input.get_output_layout().data_type == output_layout.data_type;
-       bool allowed_dt_conversion_fuse = (input.is_type<one_hot>() || input.is_type<permute>() || input.is_type<mvn>() || input.is_type<concatenation>() ||
-                                          input.is_type<depth_to_space>() || input.is_type<region_yolo>() || input.is_type<detection_output>());
+       bool allowed_dt_conversion_fuse =
+           (input.is_type<one_hot>() || input.is_type<permute>() || input.is_type<mvn>() ||
+            input.is_type<concatenation>() || input.is_type<depth_to_space>() || input.is_type<region_yolo>() ||
+            input.is_type<detection_output>() || input.is_type<gather>() || input.is_type<broadcast>() ||
+            input.is_type<select>() || input.is_type<eltwise>());
        if (!same_data_type && !allowed_dt_conversion_fuse)
            continue;


@@ -426,8 +433,10 @@ void remove_redundant_reorders::run(program& p) {
        auto old_output_layout_of_input = input.get_output_layout();
        input.set_output_layout(output_layout, false);
        if (input.type()->does_possible_implementation_exist(input)) {
-           // Add fused_primitive_desc of reorder to the previous node which propagates original output layout during shape inference
-           if (input.is_type<mvn>() || input.is_type<concatenation>()) {
+           // Add fused_primitive_desc of reorder to the previous node which propagates original output layout
+           // during shape inference
+           if (input.is_type<mvn>() || input.is_type<concatenation>() || input.is_type<gather>() ||
+               input.is_type<broadcast>() || input.is_type<select>() || input.is_type<eltwise>()) {
                fused_primitive_desc local_desc(node.get_primitive());
                local_desc.f_param = node.get_fuse_params();
                local_desc.total_num_deps = node.get_dependencies().size();
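
Mechanically, when the producer can take over the conversion, the pass records a `fused_primitive_desc` on it (so shape inference can still recover the reorder's original output layout) and retypes the producer's output. A condensed sketch with hypothetical stand-ins, not cldnn's real classes:

```cpp
#include <string>
#include <vector>

enum class data_type { f16, f32, i8 };   // stand-in

struct fused_desc {                      // stand-in for fused_primitive_desc
    std::string reorder_id;
    size_t total_num_deps;
};

struct node {                            // stand-in for program_node
    std::string id;
    data_type out_dt;
    std::vector<fused_desc> fused_ops;
};

// Fuse a type-conversion-only reorder into its producer: record it so the
// original layout stays visible to shape inference, then retype the output.
void fuse_reorder_into(node& producer, const node& reorder, data_type target_dt) {
    producer.fused_ops.push_back({reorder.id, 1});
    producer.out_dt = target_dt;         // producer's kernel converts on store
}
```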

src/plugins/intel_gpu/src/graph/layout_optimizer.cpp

+12 −2
@@ -13,6 +13,7 @@
 #include "reshape_inst.h"
 #include "arg_max_min_inst.h"
 #include "shape_of_inst.h"
+#include "select_inst.h"
 #include "condition_inst.h"
 #include "strided_slice_inst.h"
 #include <sstream>
@@ -33,6 +34,7 @@
 #include "prior_box_inst.h"
 #include "scatter_nd_update_inst.h"
 #include "gather_inst.h"
+#include "broadcast_inst.h"
 #include "loop_inst.h"
 #include "dft_inst.h"
 #include "to_string_utils.h"
@@ -428,10 +430,18 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, reorder_node
    bool allow_new_shape_infer = node.get_program().is_new_shape_infer();
    // Because mvn and concatenation kernel can work cross-layout, if reorder only performs type conversion,
    // fusing reorder to the previous node can be done even if it is a dynamic shape case
-   if ((prev.is_type<mvn>() || prev.is_type<concatenation>()) &&
+   if ((prev.is_type<mvn>() || prev.is_type<concatenation>() || prev.is_type<gather>() || prev.is_type<broadcast>() ||
+        prev.is_type<select>() || prev.is_type<eltwise>()) &&
        !prev.is_in_shape_of_subgraph() && node.is_type_conversion_only() &&
-       (format::is_simple_data_format(fmt_prev) && format::is_simple_data_format(fmt_next)))
+       (format::is_simple_data_format(fmt_prev) && format::is_simple_data_format(fmt_next)) &&
+       // If the prev node is a backedge of the loop, its type would be changed by fusing the reorder.
+       // We could avoid only that case if we could check whether the current node is a backedge of the network,
+       // but no such handle exists yet. (To be done in the future, when we need to optimize out the
+       // type-converting reorders in the body network.)
+       !node.get_program().is_body_program() &&
+       prev.get_preferred_impl_type() != cldnn::impl_types::cpu) {
        return true;
+   }

    if (prev.is_dynamic() || (!node.get_users().empty() && node.get_users().front()->is_dynamic()))
        return false;
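
Condensed, the new eligibility guard is a conjunction of six conditions. A flattened paraphrase, with each call from the diff reduced to a named flag (the packaging into a free function is illustrative):

```cpp
// Each flag mirrors one check in can_fuse_reorder_to_prev above.
bool can_fuse_type_only_reorder(bool prev_is_fusable_type,   // mvn/concat/gather/broadcast/select/eltwise
                                bool prev_in_shape_of_subgraph,
                                bool is_type_conversion_only,
                                bool both_formats_simple,
                                bool program_is_loop_body,   // skip: fusing would retype a backedge
                                bool prev_prefers_cpu_impl) {
    return prev_is_fusable_type && !prev_in_shape_of_subgraph &&
           is_type_conversion_only && both_formats_simple &&
           !program_is_loop_body && !prev_prefers_cpu_impl;
}
```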

src/plugins/intel_gpu/src/graph/select.cpp

+3
@@ -39,6 +39,9 @@ std::vector<layout> select_inst::calc_output_layouts(const select_node& /*node*/

    auto desc = impl_param.typed_desc<select>();
    auto dt = desc->output_data_types[0].value_or(input1_layout.data_type);
+   if (impl_param.has_fused_primitives()) {
+       dt = impl_param.get_output_element_type();
+   }

    ov::op::v1::Select op;
    op.set_auto_broadcast(desc->broadcast_spec);
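
This follows the pattern fusion-aware ops use in `calc_output_layouts`: once primitives are fused into the node, the fused output element type (here, the reorder's target type) overrides the type declared by the primitive itself. A minimal sketch of that resolution order, with a hypothetical stand-in for `impl_param`:

```cpp
#include <optional>

enum class element_type { f16, f32, i8 };      // stand-in

struct impl_params {                           // hypothetical stand-in
    std::optional<element_type> declared_dt;   // from the primitive descriptor
    bool has_fused_primitives;
    element_type fused_output_dt;              // element type after the fused reorder
};

element_type resolve_output_dt(const impl_params& p, element_type input_dt) {
    element_type dt = p.declared_dt.value_or(input_dt);  // primitive's choice, else input type
    if (p.has_fused_primitives)
        dt = p.fused_output_dt;                          // fused reorder's type wins
    return dt;
}
```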

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/broadcast_gpu_ref.cl

+8 −4
@@ -253,6 +253,7 @@ inline uint FUNC(get_idx_pos)(OPTIONAL_SHAPE_INFO_ARG uint out_b, uint out_f, ui
 #define VLOAD CAT(vload, VEC_SIZE)
 #define VSTORE CAT(vstore,VEC_SIZE)
 #define INPUT0_VTYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, VEC_SIZE)
+#define OUTPUT_VTYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, VEC_SIZE)

 KERNEL(broadcast_gpu_ref)(
     OPTIONAL_SHAPE_INFO_ARG
@@ -322,15 +323,18 @@ KERNEL(broadcast_gpu_ref)(
            uint output_idx = out_pos;
            unroll_for(uint j = 0; j < y_nums; j++) {
                unroll_for(uint i = 0; i < x_stride; i++) {
-                   output[output_idx + i] = input[idx_pos + i];
+                   output[output_idx + i] = TO_OUTPUT_TYPE(input[idx_pos + i]);
                }
                output_idx += OUTPUT_SIZE_X;
            }
        } else {
            uint output_idx = out_pos;
            INPUT0_VTYPE input_vec = VLOAD(0, &input[idx_pos]);
            unroll_for(uint i = 0; i < y_nums; i++) {
-               VSTORE(input_vec, 0, &output[output_idx]);
+               OUTPUT_VTYPE out_v;
+               for (int j = 0; j < VEC_SIZE; ++j)
+                   out_v[j] = TO_OUTPUT_TYPE(input_vec[j]);
+               VSTORE(out_v, 0, &output[output_idx]);
                output_idx += OUTPUT_SIZE_X;
            }

@@ -339,7 +343,7 @@ KERNEL(broadcast_gpu_ref)(

            output_idx = out_pos;
            unroll_for(uint i = 0; i < y_nums; i++) {
-               output[output_idx + x_stride] = input_val;
+               output[output_idx + x_stride] = TO_OUTPUT_TYPE(input_val);
                output_idx += OUTPUT_SIZE_X;
            }
        }
@@ -375,7 +379,7 @@ KERNEL(broadcast_gpu_ref)(
        const uint out_pos = OUTPUT_GET_INDEX(out_b, out_f, out_y, out_x);
        const uint idx_pos = FUNC_CALL(get_idx_pos)(OPTIONAL_SHAPE_INFO_TENSOR out_b, out_f, out_y, out_x);
 #endif
-       output[out_pos] = input[idx_pos];
+       output[out_pos] = TO_OUTPUT_TYPE(input[idx_pos]);
    }
 }
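
These kernel edits matter because with `EnableDifferentTypes()` (added below) the broadcast kernel may now see `INPUT0_TYPE != OUTPUT_TYPE`, so every store, scalar or vector, has to go through `TO_OUTPUT_TYPE`. A host-side C++ analogue of the new per-lane loop before `VSTORE` (the `float`/`int8_t` pairing is just an example):

```cpp
#include <array>
#include <cstdint>

constexpr int VEC_SIZE = 4;  // mirrors the kernel's vector width

// Convert each lane before the vector store, as the new out_v loop does;
// static_cast stands in for the kernel's TO_OUTPUT_TYPE macro.
std::array<int8_t, VEC_SIZE> convert_lanes(const std::array<float, VEC_SIZE>& in) {
    std::array<int8_t, VEC_SIZE> out{};
    for (int j = 0; j < VEC_SIZE; ++j)
        out[j] = static_cast<int8_t>(in[j]);
    return out;
}
```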

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/generic_eltwise_ref.cl

+1 −1
@@ -192,6 +192,6 @@ KERNEL(eltwise)(
 #if QUANTIZATION_TERM && !OUTPUT_IS_FP
    output[output_offset] = TO_OUTPUT_TYPE_SAT(ACTIVATION(out, ACTIVATION_PARAMS));
 #else
-   output[output_offset] = ACTIVATION_TYPED(out, ACTIVATION_PARAMS_TYPED);
+   output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION_TYPED(out, ACTIVATION_PARAMS_TYPED));
 #endif
 }

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/select_gpu_ref.cl

+1 −1
@@ -45,7 +45,7 @@ KERNEL(select)(
    uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
 #endif

-   const OUTPUT_TYPE res = select(INPUT_2, INPUT_1, MASK);
+   const OUTPUT_TYPE res = TO_OUTPUT_TYPE(select(INPUT_2, INPUT_1, MASK));

    output[output_offset] = res;
 }

src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.cpp

+1
@@ -25,6 +25,7 @@ ParamsKey BroadcastKernelRef::GetSupportedKey() const {
    k.EnableAllInputLayout();
    k.EnableAllOutputLayout();

+   k.EnableDifferentTypes();
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();

src/plugins/intel_gpu/src/kernel_selector/kernels/broadcast/broadcast_kernel_ref.h

+3
@@ -14,5 +14,8 @@ class BroadcastKernelRef : public BroadcastKernelBase {
    KernelsData GetKernelsData(const Params& params) const override;
    KernelsPriority GetKernelsPriority(const Params& params) const override;
    ParamsKey GetSupportedKey() const override;
+   std::vector<FusedOpType> GetSupportedFusedOps() const override {
+       return { FusedOpType::REORDER };
+   }
 };
 }  // namespace kernel_selector

src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_ref.h

+2 −1
@@ -19,7 +19,8 @@ class EltwiseKernelRef : public EltwiseKernelBase {
        return {
            FusedOpType::QUANTIZE,
            FusedOpType::ACTIVATION,
-           FusedOpType::ELTWISE
+           FusedOpType::ELTWISE,
+           FusedOpType::REORDER
        };
    }

src/plugins/intel_gpu/src/kernel_selector/kernels/gather/gather_kernel_ref.h

+2 −1
@@ -37,7 +37,8 @@ class GatherKernelRef : public KernelBaseOpenCL {
    std::vector<FusedOpType> GetSupportedFusedOps() const override {
        return { FusedOpType::QUANTIZE,
                 FusedOpType::ELTWISE,
-                FusedOpType::ACTIVATION };
+                FusedOpType::ACTIVATION,
+                FusedOpType::REORDER };
    }

 protected:

src/plugins/intel_gpu/src/kernel_selector/kernels/select/select_kernel_ref.h

+4
@@ -15,6 +15,10 @@ class SelectKernelRef : public SelectKernelBase {
    KernelsData GetKernelsData(const Params& params) const override;
    KernelsPriority GetKernelsPriority(const Params& params) const override;
    ParamsKey GetSupportedKey() const override;
+   std::vector<FusedOpType> GetSupportedFusedOps() const override {
+       return { FusedOpType::REORDER };
+   }
+

 protected:
    bool Validate(const Params& p) const override;

src/plugins/intel_gpu/tests/unit/fusions/eltwise_fusion_test.cpp

+35 −13
@@ -30,7 +30,7 @@ struct eltwise_test_params {

 class EltwiseFusingTest : public ::BaseFusingTest<eltwise_test_params> {
 public:
-    void execute(eltwise_test_params& p) {
+    void execute(eltwise_test_params& p, bool count_reorder = false) {
        auto input_prim = get_mem(get_input_layout(p));
        auto input_prim2 = get_mem(get_input_layout2(p));

@@ -45,7 +45,7 @@ class EltwiseFusingTest : public ::BaseFusingTest<eltwise_test_params> {
            network_not_fused.set_input_data("input2", input_prim2);
        }

-       compare(network_not_fused, network_fused, p);
+       compare(network_not_fused, network_fused, p, count_reorder);
    }

    layout get_input_layout(eltwise_test_params& p) {
@@ -545,21 +545,27 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_no_pitches_same_dims_quantize, ::t
    eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 4 },
 }));

-class eltwise_activation : public EltwiseFusingTest {};
-TEST_P(eltwise_activation, basic) {
+class eltwise_activation_reorder : public EltwiseFusingTest {};
+TEST_P(eltwise_activation_reorder, basic) {
    auto p = GetParam();
-   create_topologies(
-       input_layout("input", get_input_layout(p)),
-       input_layout("input2", get_input_layout2(p)),
-       eltwise("eltwise", { input_info("input"), input_info("input2") }, p.mode, p.default_type),
-       activation("activation", input_info("eltwise"), activation_func::relu, { 6.0f, 0.0f }),
-       reorder("out", input_info("activation"), p.default_format, data_types::f32)
-   );
+   create_topologies(input_layout("input", get_input_layout(p)),
+                     input_layout("input2", get_input_layout2(p)),
+                     eltwise("eltwise", {input_info("input"), input_info("input2")}, p.mode, p.default_type),
+                     activation("activation", input_info("eltwise"), activation_func::relu, {6.0f, 0.0f}),
+                     reorder("out",
+                             input_info("activation"),
+                             p.default_format,
+                             data_types::f32,
+                             std::vector<float>(),
+                             cldnn::reorder_mean_mode::subtract,
+                             cldnn::padding(),
+                             true));

    tolerance = default_tolerance(p.input_type);
-   execute(p);
+   execute(p, true);
 }

+class eltwise_activation : public EltwiseFusingTest {};
 TEST_P(eltwise_activation, fp16_out) {
    auto p = GetParam();
    create_topologies(
@@ -574,6 +580,21 @@ TEST_P(eltwise_activation, fp16_out) {
    execute(p);
 }

+INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation_reorder, ::testing::ValuesIn(std::vector<eltwise_test_params>{
+    eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP16_3, 4, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP32_FP16_1, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP32_FP16_2, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP32_FP16_3, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP16_FP32_1, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP16_FP32_2, 3, 5 },
+    eltwise_test_params{ CASE_ELTWISE_FP16_FP32_3, 4, 5 }
+}));
+
 INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation, ::testing::ValuesIn(std::vector<eltwise_test_params>{
    eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 4 },
    eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 4 },

@@ -590,6 +611,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation, ::testing::ValuesIn(st
 }));


+
 class eltwise_quantize_fs_b_yx_fsv32 : public EltwiseFusingTest {};
 TEST_P(eltwise_quantize_fs_b_yx_fsv32, fusing_eltwise_quantize_layout) {
    auto p = GetParam();
@@ -649,4 +671,4 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_quantize_fs_b_yx_fsv32_exception,
    eltwise_test_params{ CASE_ELTWISE_FP16_B_FS_YX, 6, 6 },
    eltwise_test_params{ CASE_ELTWISE_FP16_BATCH_FS_B, 6, 6 },
    eltwise_test_params{ CASE_ELTWISE_FP16_BATCH_B_FS, 6, 6 },
-}));
+}));

src/plugins/intel_gpu/tests/unit/fusions/gather_fusion_test.cpp

+9 −6
@@ -32,7 +32,7 @@ struct gather_test_params {

 class GatherPrimitiveFusingTest : public ::BaseFusingTest<gather_test_params> {
 public:
-    void execute(gather_test_params& p, bool is_dynamic = false) {
+    void execute(gather_test_params& p, bool is_dynamic = false, bool count_reorder = false) {
        cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
        cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
        cfg_fused.set_property(ov::intel_gpu::optimize_data(true));

@@ -50,7 +50,7 @@ class GatherPrimitiveFusingTest : public ::BaseFusingTest<gather_test_params> {
            network_not_fused.set_input_data("eltwise_data", elt_input_prim);
        }

-       compare(network_not_fused, network_fused, p);
+       compare(network_not_fused, network_fused, p, count_reorder);
    }

    layout get_input_layout(gather_test_params& p, bool is_dynamic = false) {
@@ -119,6 +119,8 @@ class GatherPrimitiveFusingTest : public ::BaseFusingTest<gather_test_params> {
 #define CASE_GATHER_5D_FP16_4 { 3, 2, 2, 2, 2 }, { 2, 3, 1, 1 }, { 3, 2, 2, 3, 2 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx
 #define CASE_GATHER_5D_FP16_5 { 1, 1, 2, 1, 1 }, { 3, 1, 1, 1 }, { 1, 1, 1, 1, 3 }, 4, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx

+#define CASE_GATHER_INT8_1 { 2, 3, 4, 1 }, { 4 }, { 4, 3, 4, 1 }, 0, data_types::i8, format::bfyx, data_types::f32, format::bfyx
+
 class gather_quantize : public GatherPrimitiveFusingTest {};
 TEST_P(gather_quantize, basic) {
    auto p = GetParam();
@@ -223,14 +225,15 @@ TEST_P(gather_eltwise_activation_dynamic, basic) {
        gather("gather_prim", input_info("input"), input_info("gather_indices"), p.axis, p.dictionary_shape.size(), p.out_shape),
        activation("activation", input_info("gather_prim"), activation_func::abs),
        eltwise("eltwise", { input_info("activation"), input_info("eltwise_data") }, eltwise_mode::prod),
-       reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32)
+       reorder("reorder_bfyx", input_info("eltwise"), p.default_format, data_types::f32, std::vector<float>(), cldnn::reorder_mean_mode::subtract, cldnn::padding(), true)
    );

    tolerance = 1e-5f;
-   execute(p, true);
+   execute(p, true, true);
 }
 INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_eltwise_activation_dynamic, ::testing::ValuesIn(std::vector<gather_test_params>{
    gather_test_params{ CASE_GATHER_FP32_6, 4, 6 },
-   gather_test_params{ CASE_GATHER_FP16_6, 4, 6 },
-   gather_test_params{ CASE_GATHER_FP16_7, 4, 6 },
+   gather_test_params{ CASE_GATHER_FP16_6, 4, 7 },
+   gather_test_params{ CASE_GATHER_FP16_7, 5, 8 },
+   gather_test_params{ CASE_GATHER_INT8_1, 4, 7 },
 }));
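
Both test fixtures thread a new `count_reorder` flag through `execute()` into `compare()`. The `BaseFusingTest` internals are not shown in this diff, but the expected primitive counts in the parameter lists growing by one (e.g. `4, 6` becoming `4, 7`) suggest the flag makes reorder nodes count toward the executed-primitive tally. A sketch of that presumed counting rule:

```cpp
#include <string>
#include <vector>

struct executed_prim { std::string type_name; };  // hypothetical

// Presumed role of count_reorder: include reorder nodes when tallying how
// many primitives actually ran in the fused vs. not-fused networks.
size_t count_executed(const std::vector<executed_prim>& prims, bool count_reorder) {
    size_t n = 0;
    for (const auto& p : prims)
        if (count_reorder || p.type_name != "reorder")
            ++n;
    return n;
}
```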
