Commit 27b6380
[GPU] Add fusing false condition for fs_b_yx_fsv32_network at eltwise and quantize pattern
1 parent 8b59d92 · commit 27b6380

5 files changed, +147 -14 lines changed

src/plugins/intel_gpu/src/graph/graph_optimizer/fuse_primitives_with_layout.cpp

+11 -11

@@ -12,17 +12,18 @@
 
 using namespace cldnn;
 
-static bool eltwise_supports_fusings(eltwise_node& node) {
-    auto out_layout = node.get_output_layout();
-    // This condition refers to optimizied kernel EltwiseKernel_fs_b_yx_fsv32
-    if (out_layout.data_type == data_types::f16 && out_layout.batch() > 1 && out_layout.format == format::fs_b_yx_fsv32) {
-        return false;
-    }
+void fuse_primitives_with_layout::run(program& p) {
+    auto eltwise_supports_fusings = [&](eltwise_node& node) -> bool {
+        auto out_layout = node.get_output_layout();
+        // This condition refers to optimizied kernel EltwiseKernel_fs_b_yx_fsv32
+        if (out_layout.data_type == data_types::f16 && out_layout.batch() > 1 &&
+            (_lo.get_optimization_attributes().fs_b_yx_fsv32_network || out_layout.format == format::fs_b_yx_fsv32)) {
+            return false;
+        }
 
-    return true;
-}
+        return true;
+    };
 
-void fuse_primitives_with_layout::run(program& p) {
     bool need_recalc_processing_order = false;
     std::map<primitive_id, std::vector<std::pair<primitive_id, size_t>>> fusing_history;
 
@@ -35,7 +36,7 @@ void fuse_primitives_with_layout::run(program& p) {
            continue;
 
        // No optimized Eltwise kernel supports fused-operation for fs_b_yx_fsv32
-       // Check fusing quantize to eltwsise for this case
+       // Check fusing quantize to eltwise for this case
        auto func_fuse_quantize = [&](quantize_node& node) {
            bool should_fuse = false;
            auto out_layout = node.get_output_layout();
@@ -49,7 +50,6 @@ void fuse_primitives_with_layout::run(program& p) {
                return;
 
            should_fuse |= input_node.is_type<eltwise>() && eltwise_supports_fusings(input_node.as<eltwise>());
-
            if (!should_fuse)
                return;
 
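Read on its own, the behavioral change is a single extra disjunct in the fusing guard: the fp16, batch > 1 case is now rejected not only when the eltwise output is already in fs_b_yx_fsv32, but also when the layout optimizer has marked the whole network as fs_b_yx_fsv32. A minimal sketch of the resulting predicate, restated as a hypothetical free function (in the pass itself it is a lambda capturing `_lo`), assuming the cldnn headers for layout, format, and data_types:

// Sketch only: fusing into eltwise is rejected whenever the optimized
// EltwiseKernel_fs_b_yx_fsv32 path can be taken, since that kernel has no
// fused-op support.
static bool eltwise_supports_fusings_sketch(const cldnn::layout& out_layout,
                                            bool fs_b_yx_fsv32_network) {
    using namespace cldnn;
    const bool fsv32_path = fs_b_yx_fsv32_network ||
                            out_layout.format == format::fs_b_yx_fsv32;
    if (out_layout.data_type == data_types::f16 && out_layout.batch() > 1 && fsv32_path)
        return false;  // optimized fsv32 eltwise kernel: do not fuse
    return true;
}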

src/plugins/intel_gpu/src/graph/include/pass_manager.h

+3 -1

@@ -397,10 +397,12 @@ class mark_runtime_skippable_nodes : public base_pass {
 
 class fuse_primitives_with_layout : public base_pass {
 public:
-    fuse_primitives_with_layout() : base_pass("fuse_primitives_with_layout") {}
+    explicit fuse_primitives_with_layout(layout_optimizer& lo_ref) :
+        base_pass("fuse_primitives_with_layout"), _lo(lo_ref) {}
 
 private:
     void run(program& p) override;
+    layout_optimizer& _lo;
 };
 
 } // namespace cldnn
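The header change is plain dependency injection: the pass now stores a non-owning reference to the layout_optimizer it is constructed with, so run() can query network-wide optimization attributes. A self-contained sketch of the pattern, using simplified stand-in types rather than the real cldnn classes:

#include <string>
#include <utility>

// Stand-in for layout_optimizer: holds a network-wide attribute.
struct optimizer_like {
    bool fs_b_yx_fsv32_network = false;
};

// Stand-in for a pass that consults that attribute at run time.
class pass_like {
public:
    explicit pass_like(std::string name, optimizer_like& lo)
        : _name(std::move(name)), _lo(lo) {}

    bool should_skip_fusing() const {
        // Every call observes the optimizer's current state through the reference.
        return _lo.fs_b_yx_fsv32_network;
    }

private:
    std::string _name;
    optimizer_like& _lo;  // non-owning: the optimizer must outlive the pass
};

The reference member is also why the constructor gained explicit, and why the unit test below now has to create a layout_optimizer before applying the pass.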

src/plugins/intel_gpu/src/graph/program.cpp

+1 -1

@@ -594,7 +594,7 @@ void program::pre_optimize_graph(bool is_internal) {
 
     // Check fusing primitives based on preferred format or layout optimization
     if (optimize_data) {
-        apply_opt_pass<fuse_primitives_with_layout>();
+        apply_opt_pass<fuse_primitives_with_layout>(lo);
     }
 
     // add optimization attributes for onednn primitives
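Here `lo` is the `layout_optimizer` instance already in scope in `pre_optimize_graph`, so the pass sees the same optimization attributes as the surrounding layout passes; `apply_opt_pass` presumably forwards its arguments to the pass constructor, which is why passing `lo` is the only call-site change needed for the new constructor signature.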

src/plugins/intel_gpu/tests/unit/passes/fuse_primitives_with_layout.cpp

+3 -1

@@ -51,7 +51,9 @@ TEST(fuse_primitives_with_layout, fuse_when_layout_format_of_input_and_output_ar
            node->set_output_layout(qt_layout, false);
        }
    }
-    program_wrapper::apply_opt_pass<fuse_primitives_with_layout>(*program);
+
+    layout_optimizer lo(true);
+    program_wrapper::apply_opt_pass<fuse_primitives_with_layout>(*program, lo);
 
    ASSERT_TRUE(has_node(*program, "quantize"));
 }

src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp

+129

@@ -518,6 +518,135 @@ TEST(quantize_gpu, quantize_levels_256_3d_unsigned) {
     }
 }
 
+TEST(quantize_gpu, eltwise_quantize_fs_b_yx_fsv32) {
+    tests::random_generator rg(GET_SUITE_NAME);
+    auto& engine = get_test_engine();
+
+    // conv to enable 'fs_b_yx_fsv32_network'
+    const int batch_num = 2;
+    const int input_xy = 5;
+    const int input_f = 32;
+    const int output_f = 32;
+    const int filter_xy = 1;
+    const int pad = filter_xy / 2;
+
+    auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
+    auto input_data = rg.generate_random_4d<ov::float16>(batch_num, input_f, input_xy, input_xy, -1, 1);
+    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+    auto input_mem = engine.allocate_memory({ data_types::f16, format::bfyx, input_size });
+    set_values(input_mem, input_data_bfyx);
+
+    auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
+    auto weights_data = rg.generate_random_4d<ov::float16>(output_f, input_f, filter_xy, filter_xy, -1, 1);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+    auto weights_mem = engine.allocate_memory({ data_types::f16, format::bfyx, weights_size });
+    set_values(weights_mem, weights_data_bfyx);
+
+    topology topology(
+        input_layout("input_conv", input_mem->get_layout()),
+        data("weights_fsv", weights_mem));
+
+    // Reorder input to fs_byx_fsv32
+    topology.add(reorder("input_fsv", input_info("input_conv"), { data_types::f16, format::fs_b_yx_fsv32, input_size }));
+
+    topology.add(convolution("conv0", input_info("input_fsv"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv1", input_info("conv0"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv2", input_info("conv1"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv3", input_info("conv2"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv4", input_info("conv3"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv5", input_info("conv4"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv6", input_info("conv5"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv7", input_info("conv6"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv8", input_info("conv7"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv9", input_info("conv8"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv10", input_info("conv9"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv11", input_info("conv10"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+
+    topology.add(reorder("reorder_conv", input_info("conv11"), format::b_fs_yx_fsv16, data_types::f32));
+
+    // eltwise + quantize pattern
+    auto in_layout = layout{ ov::PartialShape{2, 16, 1, 2}, data_types::f16, format::b_fs_yx_fsv16 };
+    auto input = engine.allocate_memory(in_layout);
+    auto input_low = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 16, 1, 1 } });
+    auto input_high = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 16, 1, 1 } });
+    auto output_low = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 1, 1, 1 } });
+    auto output_high = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 1, 1, 1 } });
+
+    set_values(input, { -1.0f, 2.0f, 3.0f, 4.0f,
+                         5.0f, 2.0f, 2.0f, 3.0f,
+                         4.0f, 6.0f, 3.0f, 3.0f,
+                         3.0f, 5.0f, 1.0f, 1.0f,
+
+                         1.0f, 1.0f, 1.0f, 1.0f,
+                         4.0f, 6.0f, 3.0f, 3.0f,
+                         3.0f, 5.0f, 1.0f, 1.0f,
+                         1.0f, 1.0f, 1.0f, 1.0f,
+
+                        -1.0f, 2.0f, 3.0f, 4.0f,
+                         5.0f, 2.0f, 2.0f, 3.0f,
+                         4.0f, 6.0f, 3.0f, 3.0f,
+                         3.0f, 5.0f, 1.0f, 1.0f,
+
+                         1.0f, 1.0f, 1.0f, 1.0f,
+                         4.0f, 6.0f, 3.0f, 3.0f,
+                         3.0f, 5.0f, 1.0f, 1.0f,
+                         1.0f, 1.0f, 1.0f, 1.0f });
+
+    set_values(input_low,  { 0.0f, 1.0f, 2.0f, 3.0f,
+                             4.0f, 5.0f, 6.0f, 7.0f,
+                             7.0f, 6.0f, 5.0f, 4.0f,
+                             3.0f, 2.0f, 1.0f, 0.0f });
+    set_values(input_high, { 0.0f, 1.0f, 2.0f, 3.0f,
+                             4.0f, 5.0f, 6.0f, 7.0f,
+                             7.0f, 6.0f, 5.0f, 4.0f,
+                             3.0f, 2.0f, 1.0f, 0.0f });
+    set_values(output_low,  { -1.0f });
+    set_values(output_high, {  1.0f });
+
+    std::vector<float> ref_data = { -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
+                                    -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1,
+                                    -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1,
+                                    -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, -1, 1,
+                                    -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
+                                    -1, 1, -1, 1 };
+
+    topology.add(
+        input_layout("input1", in_layout),
+        input_layout("input2", in_layout),
+        eltwise("multiply", input_info("input1"), input_info("input2"), eltwise_mode::prod),
+        data("input_low", input_low),
+        data("input_high", input_high),
+        data("output_low", output_low),
+        data("output_high", output_high),
+        quantize("quantize", input_info("multiply"), input_info("input_low"), input_info("input_high"), input_info("output_low"), input_info("output_high"), 2, data_types::f32),
+        reorder("reorder", input_info("quantize"), format::b_fs_yx_fsv16, data_types::f32)
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    ov::intel_gpu::ImplementationDesc quantize_impl = { format::b_fs_yx_fsv16, "quantize_gpu_ref" };
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "quantize", quantize_impl } }));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+
+    network network(engine, topology, config);
+    network.set_input_data("input_conv", input_mem);
+    network.set_input_data("input1", input);
+    network.set_input_data("input2", input);
+    auto outputs = network.execute();
+
+    auto output = outputs.at("reorder").get_memory();
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+    // Check that layout and memory contains logical size of tensor
+    ASSERT_EQ(output->count(), (size_t)64);
+    ASSERT_EQ(output->get_layout().count(), (size_t)64);
+
+    ASSERT_EQ(output->size(), ref_data.size() * sizeof(uint32_t));
+
+    for (size_t i = 0; i < ref_data.size(); ++i) {
+        ASSERT_EQ(output_ptr[i], ref_data[i]) << " index = " << i;
+    }
+}
+
 TEST(quantize_gpu, dynamic) {
     auto& engine = get_test_engine();
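A note on the test's shape: the chain of twelve 1x1 fp16 convolutions on fs_b_yx_fsv32 input exists, per its own inline comment, to get the layout optimizer to set the fs_b_yx_fsv32_network attribute. With that attribute set, the new guard must keep the downstream eltwise + quantize pair unfused; the test then forces the quantize_gpu_ref implementation and compares the output against ref_data, so a regression that reintroduces the fusing would be expected to surface as a mismatch or a kernel-selection failure.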
