graph: backend: dnnl: support lnorm + q with zps!=0

rzhang · TaoLv · commit 493921cd77a4 · 2024-06-18T08:50:34.000+08:00
diff --git a/src/graph/backend/dnnl/kernels/layernorm.hpp b/src/graph/backend/dnnl/kernels/layernorm.hpp
@@ -89,8 +89,9 @@ struct layernorm_fwd_t : public kernel_base_t {
 
         BACKEND_DNNL_ADD_PASS(pipeline, lower_down);
         BACKEND_DNNL_ADD_PASS(pipeline, fuse_post_typecast_to_predecessor);
-        BACKEND_DNNL_ADD_PASS(pipeline, fuse_post_ops);
         BACKEND_DNNL_ADD_PASS(pipeline, remove_quant_data_with_no_effect);
+        BACKEND_DNNL_ADD_PASS(pipeline, replace_quant_data_with_binary_post_op);
+        BACKEND_DNNL_ADD_PASS(pipeline, fuse_post_ops);
         BACKEND_DNNL_ADD_PASS(pipeline, convert_to_runtime_dst_scales);
         BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_scales);
         BACKEND_DNNL_ADD_PASS(pipeline, infer_shape);
diff --git a/src/graph/backend/dnnl/patterns/layernorm_fusion.cpp b/src/graph/backend/dnnl/patterns/layernorm_fusion.cpp
@@ -79,7 +79,6 @@ DNNL_BACKEND_REGISTER_PATTERN_MATCHER_PASS(dnnl, layernorm_post_ops_fusion_cpu)
                     auto q_graph = std::make_shared<pb_graph_t>();
                     pm::pb_op_t *pquantize
                             = q_graph->append_op(graph::op_kind::Quantize);
-                    pquantize->append_decision_function(check_zps_values<0>);
                     q_graph->create_input_port(0, pquantize, 0);
                     q_graph->create_output_port(0, pquantize, 0);
                     pgraph->append_optional(
diff --git a/tests/benchdnn/inputs/graph/pattern/harness_int8_all b/tests/benchdnn/inputs/graph/pattern/harness_int8_all
@@ -114,6 +114,8 @@
 --reset --in-shapes=0:1x128x150x150*acdb+1:1x128x150x150*acdb+2:1x128x150x150*acdb --op-attrs=3:axis:0 --case=pattern/int8/int8_concat_fusion_3.json
 #layernorm
 --reset --case=pattern/int8/int8_lnorm_gelu_quantize.json
+# layernorm with zp != 0
+--reset --op-attrs=2:zps:1 --case=pattern/int8/int8_lnorm_gelu_quantize.json
 --reset --case=pattern/int8/int8_lnorm_multiply_quantize.json
 --reset --case=pattern/int8/int8_lnorm_tc_multiply_quantize.json
 #softmax
diff --git a/tests/gtests/graph/unit/backend/dnnl/test_pass.cpp b/tests/gtests/graph/unit/backend/dnnl/test_pass.cpp
@@ -11836,7 +11836,7 @@ TEST(test_pass_pass_system, FuseLayernormTypecastQuantize_CPU) {
     ASSERT_EQ(agraph.get_partitions()[0]->get_outputs()[0].id, 5U);
 }
 
-TEST(test_pass_pass_system, NotFuseLayernormTypecast) {
+TEST(test_pass_pass_system, NotFuseLayernormTypecast_GPU) {
     /*
              | (bf16)
            layernorm

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -11836,7 +11836,7 @@ TEST(test_pass_pass_system, FuseLayernormTypecastQuantize_CPU) {` |
| `11836` | `11836` | `ASSERT_EQ(agraph.get_partitions()[0]->get_outputs()[0].id, 5U);` |
| `11837` | `11837` | `}` |
| `11838` | `11838` |  |
| `11839` |  | `-TEST(test_pass_pass_system, NotFuseLayernormTypecast) {` |
|  | `11839` | `+TEST(test_pass_pass_system, NotFuseLayernormTypecast_GPU) {` |
| `11840` | `11840` | `/*` |
| `11841` | `11841` | `\| (bf16)` |
| `11842` | `11842` | `layernorm` |