[GPU] Add fusing false condition for fs_b_yx_fsv32_network at eltwise and quantize pattern

kelvinchoi-intel · kelvinchoi-intel · commit 6d40f8b91d2f · 2024-06-27T23:19:04.000+09:00
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/fuse_primitives_with_layout.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/fuse_primitives_with_layout.cpp
@@ -12,17 +12,18 @@
 
 using namespace cldnn;
 
-static bool eltwise_supports_fusings(eltwise_node& node) {
-    auto out_layout = node.get_output_layout();
-    // This condition refers to optimizied kernel EltwiseKernel_fs_b_yx_fsv32
-    if (out_layout.data_type == data_types::f16 && out_layout.batch() > 1 && out_layout.format == format::fs_b_yx_fsv32) {
-        return false;
-    }
+void fuse_primitives_with_layout::run(program& p) {
+    auto eltwise_supports_fusings = [&](eltwise_node& node) -> bool {
+        auto out_layout = node.get_output_layout();
+        // This condition refers to the optimized kernel EltwiseKernel_fs_b_yx_fsv32
+        if (out_layout.data_type == data_types::f16 && out_layout.batch() > 1 &&
+            (_lo.get_optimization_attributes().fs_b_yx_fsv32_network || out_layout.format == format::fs_b_yx_fsv32)) {
+            return false;
+        }
 
-    return true;
-}
+        return true;
+    };
 
-void fuse_primitives_with_layout::run(program& p) {
     bool need_recalc_processing_order = false;
     std::map<primitive_id, std::vector<std::pair<primitive_id, size_t>>> fusing_history;
 
@@ -35,7 +36,7 @@ void fuse_primitives_with_layout::run(program& p) {
             continue;
 
         // No optimized Eltwise kernel supports fused-operation for fs_b_yx_fsv32
-        // Check fusing quantize to eltwsise for this case
+        // Check fusing quantize to eltwise for this case
         auto func_fuse_quantize = [&](quantize_node& node) {
             bool should_fuse = false;
             auto out_layout = node.get_output_layout();
@@ -49,7 +50,6 @@ void fuse_primitives_with_layout::run(program& p) {
                 return;
 
             should_fuse |= input_node.is_type<eltwise>() && eltwise_supports_fusings(input_node.as<eltwise>());
-
             if (!should_fuse)
                 return;
 
diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h
@@ -397,10 +397,12 @@ class mark_runtime_skippable_nodes : public base_pass {
 
 class fuse_primitives_with_layout : public base_pass {
 public:
-    fuse_primitives_with_layout() : base_pass("fuse_primitives_with_layout") {}
+    explicit fuse_primitives_with_layout(layout_optimizer& lo_ref) :
+        base_pass("fuse_primitives_with_layout"), _lo(lo_ref) {}
 
 private:
     void run(program& p) override;
+    layout_optimizer& _lo;
 };
 
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -594,7 +594,7 @@ void program::pre_optimize_graph(bool is_internal) {
 
     // Check fusing primitives based on preferred format or layout optimization
     if (optimize_data) {
-        apply_opt_pass<fuse_primitives_with_layout>();
+        apply_opt_pass<fuse_primitives_with_layout>(lo);
     }
 
     // add optimization attributes for onednn primitives
diff --git a/src/plugins/intel_gpu/tests/unit/passes/fuse_primitives_with_layout.cpp b/src/plugins/intel_gpu/tests/unit/passes/fuse_primitives_with_layout.cpp
@@ -51,7 +51,9 @@ TEST(fuse_primitives_with_layout, fuse_when_layout_format_of_input_and_output_ar
             node->set_output_layout(qt_layout, false);
         }
     }
-    program_wrapper::apply_opt_pass<fuse_primitives_with_layout>(*program);
+
+    layout_optimizer lo(true);
+    program_wrapper::apply_opt_pass<fuse_primitives_with_layout>(*program, lo);
 
     ASSERT_TRUE(has_node(*program, "quantize"));
 }
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp
@@ -518,6 +518,135 @@ TEST(quantize_gpu, quantize_levels_256_3d_unsigned) {
     }
 }
 
+TEST(quantize_gpu, eltwise_quantize_fs_b_yx_fsv32) {
+    tests::random_generator rg(GET_SUITE_NAME);
+    auto& engine = get_test_engine();
+
+    // conv to enable 'fs_b_yx_fsv32_network'
+    const int batch_num = 2;
+    const int input_xy = 5;
+    const int input_f = 32;
+    const int output_f = 32;
+    const int filter_xy = 1;
+    const int pad = filter_xy / 2;
+
+    auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
+    auto input_data = rg.generate_random_4d<ov::float16>(batch_num, input_f, input_xy, input_xy, -1, 1);
+    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+    auto input_mem = engine.allocate_memory({ data_types::f16, format::bfyx, input_size });
+    set_values(input_mem, input_data_bfyx);
+
+    auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
+    auto weights_data = rg.generate_random_4d<ov::float16>(output_f, input_f, filter_xy, filter_xy, -1, 1);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+    auto weights_mem = engine.allocate_memory({ data_types::f16, format::bfyx, weights_size });
+    set_values(weights_mem, weights_data_bfyx);
+
+    topology topology(
+        input_layout("input_conv", input_mem->get_layout()),
+        data("weights_fsv", weights_mem));
+
+    // Reorder input to fs_b_yx_fsv32
+    topology.add(reorder("input_fsv", input_info("input_conv"), { data_types::f16, format::fs_b_yx_fsv32, input_size }));
+
+    topology.add(convolution("conv0", input_info("input_fsv"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv1", input_info("conv0"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv2", input_info("conv1"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv3", input_info("conv2"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv4", input_info("conv3"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv5", input_info("conv4"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv6", input_info("conv5"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv7", input_info("conv6"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv8", input_info("conv7"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv9", input_info("conv8"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv10", input_info("conv9"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+    topology.add(convolution("conv11", input_info("conv10"), "weights_fsv", "", 1, {1, 1}, {1, 1}, { pad, pad }, { pad, pad }, false));
+
+    topology.add(reorder("reorder_conv", input_info("conv11"), format::b_fs_yx_fsv16, data_types::f32));
+
+    // eltwise + quantize pattern
+    auto in_layout = layout{ ov::PartialShape{2, 16, 1, 2}, data_types::f16, format::b_fs_yx_fsv16 };
+    auto input = engine.allocate_memory(in_layout);
+    auto input_low = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 16, 1, 1 } });
+    auto input_high = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 16, 1, 1 } });
+    auto output_low = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 1, 1, 1 } });
+    auto output_high = engine.allocate_memory({ data_types::f32,format::bfyx,{ 1, 1, 1, 1 } });
+
+    set_values(input, { -1.0f, 2.0f, 3.0f, 4.0f,
+                         5.0f, 2.0f, 2.0f, 3.0f,
+                         4.0f, 6.0f, 3.0f, 3.0f,
+                         3.0f, 5.0f, 1.0f, 1.0f,
+
+                         1.0f, 1.0f, 1.0f, 1.0f,
+                         4.0f, 6.0f, 3.0f, 3.0f,
+                         3.0f, 5.0f, 1.0f, 1.0f,
+                         1.0f, 1.0f, 1.0f, 1.0f,
+
+                        -1.0f, 2.0f, 3.0f, 4.0f,
+                         5.0f, 2.0f, 2.0f, 3.0f,
+                         4.0f, 6.0f, 3.0f, 3.0f,
+                         3.0f, 5.0f, 1.0f, 1.0f,
+
+                         1.0f, 1.0f, 1.0f, 1.0f,
+                         4.0f, 6.0f, 3.0f, 3.0f,
+                         3.0f, 5.0f, 1.0f, 1.0f,
+                         1.0f, 1.0f, 1.0f, 1.0f });
+
+    set_values(input_low,  { 0.0f, 1.0f, 2.0f, 3.0f,
+                             4.0f, 5.0f, 6.0f, 7.0f,
+                             7.0f, 6.0f, 5.0f, 4.0f,
+                             3.0f, 2.0f, 1.0f, 0.0f });
+    set_values(input_high, { 0.0f, 1.0f, 2.0f, 3.0f,
+                             4.0f, 5.0f, 6.0f, 7.0f,
+                             7.0f, 6.0f, 5.0f, 4.0f,
+                             3.0f, 2.0f, 1.0f, 0.0f });
+    set_values(output_low,  { -1.0f });
+    set_values(output_high, {  1.0f });
+
+    std::vector<float> ref_data = { -1, 1, -1, 1,   -1, -1, -1, -1,     -1, -1, -1, -1,
+                                    -1, 1, -1, 1,   -1, 1, -1, 1,       -1, -1, -1, -1,
+                                    -1, -1, -1, 1,  -1, 1, -1, 1,       -1, 1, -1, 1,
+                                    -1, -1, -1, -1, -1, -1, -1, 1,      -1, 1, -1, 1,
+                                    -1, 1, -1, 1,   -1, -1, -1, -1,     -1, -1, -1, -1,
+                                    -1, 1, -1, 1 };
+
+    topology.add(
+        input_layout("input1", in_layout),
+        input_layout("input2", in_layout),
+        eltwise("multiply", input_info("input1"), input_info("input2"), eltwise_mode::prod),
+        data("input_low", input_low),
+        data("input_high", input_high),
+        data("output_low", output_low),
+        data("output_high", output_high),
+        quantize("quantize", input_info("multiply"), input_info("input_low"), input_info("input_high"), input_info("output_low"), input_info("output_high"), 2, data_types::f32),
+        reorder("reorder", input_info("quantize"), format::b_fs_yx_fsv16, data_types::f32)
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    ov::intel_gpu::ImplementationDesc quantize_impl = { format::b_fs_yx_fsv16, "quantize_gpu_ref" };
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "quantize", quantize_impl } }));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+
+    network network(engine, topology, config);
+    network.set_input_data("input_conv", input_mem);
+    network.set_input_data("input1", input);
+    network.set_input_data("input2", input);
+    auto outputs = network.execute();
+
+    auto output = outputs.at("reorder").get_memory();
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+
+    // Check that layout and memory contain the logical size of the tensor
+    ASSERT_EQ(output->count(), (size_t)64);
+    ASSERT_EQ(output->get_layout().count(), (size_t)64);
+
+    ASSERT_EQ(output->size(), ref_data.size() * sizeof(uint32_t));
+
+    for (size_t i = 0; i < ref_data.size(); ++i) {
+        ASSERT_EQ(output_ptr[i], ref_data[i]) << " index = " << i;
+    }
+}
+
 TEST(quantize_gpu, dynamic) {
     auto& engine = get_test_engine();
 

Original file line number	Diff line number	Diff line change
`@@ -594,7 +594,7 @@ void program::pre_optimize_graph(bool is_internal) {`
`594`	`594`
`595`	`595`	`// Check fusing primitives based on preferred format or layout optimization`
`596`	`596`	`if (optimize_data) {`
`597`		`- apply_opt_pass<fuse_primitives_with_layout>();`
	`597`	`+ apply_opt_pass<fuse_primitives_with_layout>(lo);`
`598`	`598`	`}`
`599`	`599`
`600`	`600`	`// add optimization attributes for onednn primitives`
Original file line number	Diff line number	Diff line change
`@@ -51,7 +51,9 @@ TEST(fuse_primitives_with_layout, fuse_when_layout_format_of_input_and_output_ar`
`51`	`51`	`node->set_output_layout(qt_layout, false);`
`52`	`52`	`}`
`53`	`53`	`}`
`54`		`- program_wrapper::apply_opt_pass<fuse_primitives_with_layout>(*program);`
	`54`	`+`
	`55`	`+ layout_optimizer lo(true);`
	`56`	`+ program_wrapper::apply_opt_pass<fuse_primitives_with_layout>(*program, lo);`
`55`	`57`
`56`	`58`	`ASSERT_TRUE(has_node(*program, "quantize"));`
`57`	`59`	`}`