Commit e2cd4db

graph: dnnl: code polish and address left todo and skip legacy GQA
1 parent 35098b4 commit e2cd4db

13 files changed: +129 -36 lines

src/graph/backend/dnnl/dnnl_shape_infer.cpp

+37-3
@@ -545,13 +545,18 @@ status_t infer_dnnl_binary_output_shape(op_t *n,
         }
     }
 
-//TODO(GX): revisit this function to correct logic, check if shape is given
 status_t infer_dnnl_sdpa_output_shape(op_t *n,
         std::vector<logical_tensor_t *> &inputs,
         std::vector<logical_tensor_t *> &outputs) {
+    // [batch_size, num_heads_q, seq_len_q, head_size_qk]
     auto query = logical_tensor_wrapper_t(inputs[0]);
+    // [batch_size, num_heads_q, head_size_qk, seq_len_kv,]
     auto key = logical_tensor_wrapper_t(inputs[1]);
-    auto value = logical_tensor_wrapper_t(inputs[1]);
+    // [batch_size, num_heads_v, seq_len_kv, head_size_v]
+    auto value = logical_tensor_wrapper_t(inputs[2]);
+    // [batch_size, num_heads_q, seq_len_q, head_size_v]
+    auto out0 = logical_tensor_wrapper_t(outputs[0]);
+
     dims query_dims = query.vdims();
     dims key_dims = key.vdims();
     dims value_dims = value.vdims();
@@ -563,7 +568,36 @@ status_t infer_dnnl_sdpa_output_shape(op_t *n,
             op_t::kind2str(n->get_kind()).c_str(), dims2str(query_dims).c_str(),
             dims2str(key_dims).c_str(), dims2str(value_dims).c_str());
 
-    dims inferred_output_shape = query_dims;
+    VCHECK_INVALID_SHAPE((query_dims.size() == 4),
+            "%s, only support 4D input for all Q/K/V. input0 dimension: %s, "
+            "input1 dimension: %s, input2 dimension: %s ",
+            op_t::kind2str(n->get_kind()).c_str(),
+            std::to_string(query_dims.size()).c_str(),
+            std::to_string(key_dims.size()).c_str(),
+            std::to_string(value_dims.size()).c_str());
+
+    VCHECK_INVALID_SHAPE((query_dims[3] == key_dims[2]),
+            "%s, query head size should be match with key head size. query "
+            "dims: %s, Key dims: %s",
+            op_t::kind2str(n->get_kind()).c_str(), dims2str(query_dims).c_str(),
+            dims2str(key_dims).c_str());
+
+    VCHECK_INVALID_SHAPE((key_dims[3] == value_dims[2]),
+            "%s, key sequence length should be match with value sequence "
+            "length. key dims: %s, value dims: %s ",
+            op_t::kind2str(n->get_kind()).c_str(), dims2str(key_dims).c_str(),
+            dims2str(value_dims).c_str());
+
+    dims inferred_output_shape;
+    inferred_output_shape
+            = {query_dims[0], query_dims[1], query_dims[2], value_dims[3]};
+
+    if (out0.ndims() != -1) {
+        VCHECK_INVALID_SHAPE(validate(inferred_output_shape, out0.vdims()),
+                "%s, inferred out shape and output shape are not compatible",
+                op_t::kind2str(n->get_kind()).c_str());
+    }
+
     set_shape_and_strides(*outputs[0], inferred_output_shape);
     return status::success;
 }
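
The change above resolves the old TODO: it fixes the value input index (inputs[2] instead of inputs[1]), validates the given shapes, and makes the output rule explicit. With Q as [batch, heads_q, seq_q, head_qk], a pre-transposed K as [batch, heads_q, head_qk, seq_kv], and V as [batch, heads_v, seq_kv, head_v], the output is [batch, heads_q, seq_q, head_v]. A minimal standalone sketch of the same rule, with illustrative helper names rather than the oneDNN graph API:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    using dims = std::vector<std::int64_t>;

    // Infers the SDPA output shape from Q [B, Hq, Sq, Dqk], K [B, Hq, Dqk, Skv]
    // (K is already transposed) and V [B, Hv, Skv, Dv]; returns an empty vector
    // when the inputs are inconsistent.
    dims infer_sdpa_out_shape(const dims &q, const dims &k, const dims &v) {
        if (q.size() != 4 || k.size() != 4 || v.size() != 4) return {};
        if (q[3] != k[2]) return {}; // Q head size must match K head size
        if (k[3] != v[2]) return {}; // K seq_len_kv must match V seq_len_kv
        return {q[0], q[1], q[2], v[3]};
    }

    int main() {
        dims q {2, 16, 384, 64}, k {2, 16, 64, 512}, v {2, 16, 512, 64};
        dims out = infer_sdpa_out_shape(q, k, v);
        assert((out == dims {2, 16, 384, 64})); // [B, Hq, Sq, Dv]
        return out.empty() ? 1 : 0;
    }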

src/graph/backend/dnnl/kernels/large_partition.cpp

+1-1
@@ -142,7 +142,7 @@ void larger_partition_kernel_t::setup_pipeline_stage2(pass_pipeline_t &pipeline,
     }
     BACKEND_DNNL_ADD_PASS(pipeline, infer_shape);
     BACKEND_DNNL_ADD_PASS(pipeline, fuse_src_transpose_to_matmul);
-    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
+    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
     BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
     BACKEND_DNNL_ADD_PASS(pipeline, common_reorder_elimination);
     BACKEND_DNNL_ADD_PASS(pipeline, fuse_adjacent_reorders);

src/graph/backend/dnnl/kernels/matmul.cpp

+1-1
@@ -110,7 +110,7 @@ status_t matmul_t<quantized>::compile_impl(const dnnl_partition_impl_t *part,
     }
 
     BACKEND_DNNL_ADD_PASS(pipeline, infer_shape);
-    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
+    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
    BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
 
     BACKEND_DNNL_ADD_PASS(pipeline, fuse_adjacent_reorders);

src/graph/backend/dnnl/kernels/mqa_decomp.cpp

+1-1
@@ -87,7 +87,7 @@ status_t mqa_decomp_kernel_t<quantized, dt>::compile_impl(
         BACKEND_DNNL_ADD_PASS(pipeline, remove_quant_data_with_no_effect);
     }
     pipeline.reset_visualize_arg(true, false);
-    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
+    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
     BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
 
     // Run the added passes

src/graph/backend/dnnl/kernels/sdp_decomp.cpp

+1-1
@@ -86,7 +86,7 @@ status_t sdp_decomp_kernel_t<quantized, dt>::compile_impl(
         BACKEND_DNNL_ADD_PASS(pipeline, remove_quant_data_with_no_effect);
     }
     pipeline.reset_visualize_arg(true, false);
-    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
+    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
     BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
 
     // Run the added passes

src/graph/backend/dnnl/kernels/sdp_primitive.cpp

+1-1
@@ -92,7 +92,7 @@ status_t sdp_primitive_kernel_t<quantized>::compile_impl(
 
     pipeline.reset_visualize_arg(true, false);
     BACKEND_DNNL_ADD_PASS(pipeline, infer_shape);
-    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
+    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
     BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
 
     // bind the memory for each op

src/graph/backend/dnnl/kernels/sdp_primitive_config.cpp

+22-5
@@ -166,11 +166,28 @@ status_t sdp_primitive_config_t::locate_io(std::shared_ptr<subgraph_t> &sg,
 
 status_t sdp_primitive_config_t::initial_check(
         const std::shared_ptr<subgraph_t> &sg,
-        const std::vector<logical_tensor_t> &inputs) {
+        const std::vector<logical_tensor_t> &inputs, bool v1_kenrel) {
     // At least 3 inputs: Q, K, V
     VCHECK_SDP_PRIMITIVE(inputs.size() >= 3, status::invalid_arguments,
             "At least 3 inputs are required");
 
+    VCHECK_SDP_PRIMITIVE(inputs[0].data_type != dnnl_data_type_t::dnnl_f32,
+            status::invalid_arguments,
+            "SDPA ukernel doesn't support f32 datatype now");
+
+    // Note: sdpa_primitive_v1 kenrel currently don't support legacy GQA pattern.
+    if (v1_kenrel) {
+        for (auto &cur_op : sg->get_ops()) {
+            if (cur_op->get_kind() == graph::op_kind::StaticReshape) {
+                auto in = cur_op->get_input_value(0)->get_logical_tensor();
+                auto out = cur_op->get_output_value(0)->get_logical_tensor();
+                if (ltw(in).ndims() == 5 || ltw(out).ndims() == 5) {
+                    return status::unimplemented;
+                }
+            }
+        }
+    }
+
     // step1(pattern check): Not support sdpa variants with select as mask
     // We already have a pattern matcher to ensure that the sdpa patterns
     // dispatch to here are knows ones, and we have quant check in sdpa base
@@ -268,10 +285,10 @@ status_t sdp_primitive_config_t::initial_check(
 
     VCHECK_SDP_PRIMITIVE(q_id != -1 && k_id != -1 && v_id != -1,
             status::unimplemented, "Q, K, V are not found");
-    VCHECK_SDP_PRIMITIVE(ltw(inputs[q_id]).vdims().size() == 4
-            && ltw(inputs[k_id]).vdims().size() == 4
-            && ltw(inputs[v_id]).vdims().size() == 4,
-            status::unimplemented, "Q, K, V should be 4-dims");
+    // VCHECK_SDP_PRIMITIVE(ltw(inputs[q_id]).vdims().size() == 4
+    //         && ltw(inputs[k_id]).vdims().size() == 4
+    //         && ltw(inputs[v_id]).vdims().size() == 4,
+    //         status::unimplemented, "Q, K, V should be 4-dims");
 
     // sdp_primitive only supports single scale value.
     if (scale) {
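
The new v1_kenrel flag lets the v1 kernel skip the legacy GQA pattern, which is expressed through 4-D to 5-D StaticReshape ops around the attention block: if any StaticReshape in the subgraph touches a 5-D tensor, initial_check returns status::unimplemented and this kernel bails out of compilation. A rough sketch of that scan with hypothetical stand-in types (not the graph IR):

    #include <vector>

    // Stand-ins for the graph IR; only the rank information the check needs.
    enum class op_kind_t { static_reshape, other };

    struct sketch_op_t {
        op_kind_t kind;
        int in_ndims;
        int out_ndims;
    };

    // A StaticReshape touching a 5-D tensor marks the legacy (reshape-based)
    // GQA pattern, which the sdp_primitive_v1 kernel does not handle yet.
    bool uses_legacy_gqa(const std::vector<sketch_op_t> &ops) {
        for (const auto &op : ops) {
            if (op.kind != op_kind_t::static_reshape) continue;
            if (op.in_ndims == 5 || op.out_ndims == 5) return true;
        }
        return false;
    }

    int main() {
        std::vector<sketch_op_t> sg {{op_kind_t::static_reshape, 4, 5},
                {op_kind_t::other, 4, 4}};
        return uses_legacy_gqa(sg) ? 0 : 1; // detected -> skip the v1 kernel
    }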

src/graph/backend/dnnl/kernels/sdp_primitive_config.hpp

+2-1
@@ -82,7 +82,8 @@ struct sdp_primitive_config_t {
     // 2. only support fp16 data type
     // 3. only support 4-dims tensor
     status_t initial_check(const std::shared_ptr<subgraph_t> &sg,
-            const std::vector<logical_tensor_t> &inputs);
+            const std::vector<logical_tensor_t> &inputs,
+            bool v1_kenrel = false);
 
     // Initialize parameters and primitive.
     status_t init(std::shared_ptr<subgraph_t> &sg, const dnnl::engine &p_engine,
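
Because the new parameter defaults to false, pre-existing call sites of initial_check keep compiling unchanged, while sdp_primitive_v1.cpp (below) opts in by passing true. A simplified illustration of that call pattern with placeholder types, not the real kernel classes:

    #include <iostream>

    // Simplified config type; the real initial_check also receives the
    // subgraph and the input logical tensors.
    struct config_sketch_t {
        const char *initial_check(bool v1_kernel = false) const {
            // The extra legacy-GQA scan only runs when the v1 kernel opts in.
            return v1_kernel ? "base checks + legacy-GQA skip" : "base checks";
        }
    };

    int main() {
        config_sketch_t cfg;
        std::cout << cfg.initial_check() << "\n";     // existing kernels
        std::cout << cfg.initial_check(true) << "\n"; // sdp_primitive_v1
        return 0;
    }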

src/graph/backend/dnnl/kernels/sdp_primitive_v1.cpp

+11-7
@@ -61,7 +61,7 @@ status_t sdp_primitive_v1_kernel_t<quantized>::compile_impl(
             p_engine_, part->get_fpmath_mode(), false, true);
     CHECK(set_given_inputs_outputs(subgraph_, inputs, outputs));
 
-    CHECK(cfg_.initial_check(subgraph_, inputs));
+    CHECK(cfg_.initial_check(subgraph_, inputs, true));
 
     subgraph_visualizer_t vis(part->id(), [this](const value_t *val) {
         return this->memory_planner_.get_memory_info(val);
@@ -76,10 +76,8 @@ status_t sdp_primitive_v1_kernel_t<quantized>::compile_impl(
     BACKEND_DNNL_ADD_PASS(pipeline, infer_shape);
     BACKEND_DNNL_ADD_PASS(pipeline, fuse_src_transpose_to_matmul);
     BACKEND_DNNL_ADD_PASS(pipeline, fuse_sdpa);
+    BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
     BACKEND_DNNL_ADD_PASS(pipeline, insert_reshape_for_sdpa);
-
-    // TODO(GX):add fuse dst transpose to sdpa
-    // BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
     BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
 
     // bind the memory for each op
@@ -145,7 +143,9 @@ status_t sdp_primitive_v1_kernel_t<quantized>::execute_impl(
 
     // Micro kernel doesn't use scratchpad memory, here we force-set size as
     // zero to avoid redundant memory allocation and deallocation.
-    temporary_scratchpad_t scratchpad(memory_planner_.total_internal_temporary_size(), p_engine_, *g_alloc_);
+    temporary_scratchpad_t scratchpad(
+            memory_planner_.total_internal_temporary_size(), p_engine_,
+            *g_alloc_);
     prepare_args_set(res, inputs, outputs, scratchpad);
 
     for (size_t i = 0; i < subgraph_->execs_.size(); i++) {
@@ -177,7 +177,9 @@ status_t sdp_primitive_v1_kernel_t<quantized>::sycl_execute_impl(
 
     // Micro kernel doesn't use scratchpad memory, here we force-set size as
     // zero to avoid redundant memory allocation and deallocation.
-    temporary_scratchpad_t scratchpad(memory_planner_.total_internal_temporary_size(), p_engine_, *g_alloc_);
+    temporary_scratchpad_t scratchpad(
+            memory_planner_.total_internal_temporary_size(), p_engine_,
+            *g_alloc_);
     prepare_args_set(res, inputs, outputs, scratchpad);
 
     for (size_t i = 0; i < subgraph_->execs_.size(); i++) {
@@ -215,7 +217,9 @@ status_t sdp_primitive_v1_kernel_t<quantized>::ocl_execute_impl(
 
     // Micro kernel doesn't use scratchpad memory, here we force-set size as
    // zero to avoid redundant memory allocation and deallocation.
-    temporary_scratchpad_t scratchpad(memory_planner_.total_internal_temporary_size(), p_engine_, *g_alloc_);
+    temporary_scratchpad_t scratchpad(
+            memory_planner_.total_internal_temporary_size(), p_engine_,
+            *g_alloc_);
     prepare_args_set(res, inputs, outputs, scratchpad);
 
     for (size_t i = 0; i < subgraph_->execs_.size(); i++) {

src/graph/backend/dnnl/layout_propagator.cpp

+32-4
@@ -1576,11 +1576,39 @@ status_t layout_propagator_for_sdpa(std::shared_ptr<op_t> &op,
     UNUSED(mgr);
     UNUSED(pd_cache);
     UNUSED(rewriter);
-    auto dst_md = make_dnnl_memory_desc(
-            op->get_output_value(0)->get_logical_tensor());
+
     value_ptr dst_val = op->get_output_value(0);
-    dst_val->set_strides(get_dense_strides(dst_md.get_dims()));
-    status_t status = fill_layout_info(dst_val, dst_md);
+    const logical_tensor_t &out_lt = dst_val->get_logical_tensor();
+    dnnl::memory::desc expected_md;
+
+    if (ltw(out_lt).is_any()) {
+        // For GQA, we need to check the layout of the dnnl_reshape output
+        // following dnnl_sdpa, which is given by the user.
+        if (!dst_val->get_consumers().empty()) {
+            const auto &consumer_op = dst_val->get_consumers()[0].get_op();
+            const auto &consumer_out = ltw(
+                    consumer_op.get_output_value(0)->get_logical_tensor());
+            if (consumer_op.get_kind() == op_kind::dnnl_reshape
+                    && consumer_out.ndims() == 5 && consumer_out.is_strided()) {
+                const auto &ori_strides = consumer_out.vstrides();
+                std::vector<dim_t> strides = {ori_strides[0], ori_strides[2],
+                        ori_strides[3], ori_strides[4]};
+                dnnl::memory::desc tmp_md {ltw(out_lt).vdims(),
+                        static_cast<dnnl::memory::data_type>(
+                                ltw(out_lt).data_type()),
+                        strides};
+                expected_md = tmp_md;
+            }
+        } else {
+            dnnl::memory::desc expected_md {ltw(out_lt).vdims(),
+                    static_cast<dnnl::memory::data_type>(
+                            ltw(out_lt).data_type()),
+                    dnnl::memory::format_tag::acbd};
+        }
+    } else {
+        expected_md = make_dnnl_memory_desc(out_lt);
+    }
+    status_t status = fill_layout_info(dst_val, expected_md);
 
     // fill scratchpads dimensions and data type to scratchpad value_t
     value_ptr scratchpad_val = op->get_output_value(1);
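
When the SDPA destination layout is "any" and a user-strided 5-D dnnl_reshape follows it (the GQA case), the pass above derives 4-D strides from the 5-D layout by keeping {s0, s2, s3, s4}, which amounts to merging the two head axes of the 5-D view so the reshape becomes a metadata-only change. A small self-contained illustration of that stride folding, assuming the 5-D view splits the head axis into two leading head axes (plain vectors, not the dnnl memory descriptor API):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    using dim_t = std::int64_t;

    // Folds strides of a 5-D view [B, G, H/G, S, D] into strides of the 4-D
    // SDPA output [B, H, S, D]: the merged head axis reuses the inner head
    // stride, so only {s0, s2, s3, s4} survive.
    std::vector<dim_t> fold_gqa_strides(const std::vector<dim_t> &s5) {
        assert(s5.size() == 5);
        return {s5[0], s5[2], s5[3], s5[4]};
    }

    int main() {
        // Dense 5-D tensor [2, 4, 8, 128, 64].
        std::vector<dim_t> s5 {4 * 8 * 128 * 64, 8 * 128 * 64, 128 * 64, 64, 1};
        // The merged 4-D view [2, 32, 128, 64] keeps the same element layout.
        std::vector<dim_t> s4 = fold_gqa_strides(s5);
        assert((s4 == std::vector<dim_t> {32 * 128 * 64, 128 * 64, 64, 1}));
        return 0;
    }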

src/graph/backend/dnnl/passes/insert_ops.cpp

+1
@@ -662,6 +662,7 @@ status_t insert_reshape_for_sdpa(std::shared_ptr<subgraph_t> &sg) {
         reshape_output->set_attr<bool>(op_attr::special_zero, false);
         reshape_output->set_attr<std::vector<int64_t>>(
                 op_attr::shape, expected_output_dims);
+
         rewriter.insert_op_after(reshape_output, cur_op, 0);
     }
     rewriter.run();

src/graph/backend/dnnl/passes/transform.cpp

+17-10
@@ -3835,13 +3835,16 @@ impl::status_t fuse_src_transpose_to_matmul(std::shared_ptr<subgraph_t> &sg) {
     return impl::status::success;
 }
 
-impl::status_t fuse_dst_transpose_to_matmul(std::shared_ptr<subgraph_t> &sg) {
+impl::status_t fuse_dst_transpose_to_predecessor(
+        std::shared_ptr<subgraph_t> &sg) {
     std::vector<op_ptr> transpose_ops;
     for (auto &cur_op : sg->get_ops()) {
         if (cur_op->get_kind() == op_kind::dnnl_transpose
                 && cur_op->get_input_value(0)->has_producer()
-                && cur_op->get_input_value(0)->get_producer().get_kind()
-                        == op_kind::dnnl_matmul
+                && (cur_op->get_input_value(0)->get_producer().get_kind()
+                                == op_kind::dnnl_matmul
+                        || cur_op->get_input_value(0)->get_producer().get_kind()
+                                == op_kind::dnnl_sdpa)
                 && !cur_op->get_output_value(0)->get_consumers().empty()
                 && (cur_op->get_output_value(0)
                             ->get_consumers()[0]
@@ -3894,13 +3897,17 @@ impl::status_t fuse_dst_transpose_to_matmul(std::shared_ptr<subgraph_t> &sg) {
         dnnl::memory::desc expected_out_md = out_md.permute_axes(axes);
         // Special check to avoid low matmul performance with adbc layout.
         // TODO: remove this once the performance is improved.
-        if (get_format_tag(expected_out_md) == dnnl::memory::format_tag::adbc) {
+        if (in_val->get_producer().get_kind() == op_kind::dnnl_matmul
+                && get_format_tag(expected_out_md)
+                        == dnnl::memory::format_tag::adbc) {
             break;
         }
         const auto &strides = expected_out_md.get_strides();
         in_val->set_strides(strides);
-        auto &matmul = transpose_op->get_input_value(0)->get_producer();
-        matmul.set_attr(op_attr::keep_dst_layout, true);
+        if (in_val->get_producer().get_kind() == op_kind::dnnl_matmul) {
+            auto &matmul = in_val->get_producer();
+            matmul.set_attr(op_attr::keep_dst_layout, true);
+        }
     }
     rewriter.run();
     return impl::status::success;
@@ -4182,12 +4189,12 @@ status_t fuse_sdpa(std::shared_ptr<subgraph_t> &sg) {
         switch (walker->get_kind()) {
            case op_kind::dnnl_matmul: {
                if (pattern_ops.size() == 1) {
-                }
+                }
                // Finish pattern match process after second matmul
                else {
                    valid_pattern = (pattern_ops.size() >= 3);
                    finished = true;
-                }
+                }
                break;
            }
            case op_kind::dnnl_binary: {
@@ -4256,8 +4263,8 @@
                auto alg = static_cast<dnnl::algorithm>(
                        op->get_attr<int64_t>(op_attr::alg_kind));
                // handle scale
-                if (alg == dnnl::algorithm::binary_mul ||
-                        alg == dnnl::algorithm::binary_div) {
+                if (alg == dnnl::algorithm::binary_mul
+                        || alg == dnnl::algorithm::binary_div) {
                    auto scale_val = op->get_input_value(1);
                    scale_val->remove_consumer(*op, 1);
                    sdpa_op->connect_input(input_idx++, scale_val);
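
The pass renamed from fuse_dst_transpose_to_matmul to fuse_dst_transpose_to_predecessor now also lets a fused dnnl_sdpa op absorb a following dst transpose: the producer writes its destination with the strides the transposed tensor would have, so the transpose is resolved at layout-propagation time instead of executing as a real reorder, while the adbc performance guard and the keep_dst_layout attribute remain matmul-only. A conceptual sketch of that stride permutation, with plain structs standing in for the memory descriptors:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    using dim_t = std::int64_t;

    // Plain stand-in for a strided memory descriptor.
    struct plain_md_t {
        std::vector<dim_t> dims;
        std::vector<dim_t> strides;
    };

    // Applies a transpose permutation (output axis i reads input axis perm[i])
    // to dims and strides only; no data is moved.
    plain_md_t permute(const plain_md_t &md, const std::vector<int> &perm) {
        plain_md_t out;
        for (int axis : perm) {
            out.dims.push_back(md.dims[axis]);
            out.strides.push_back(md.strides[axis]);
        }
        return out;
    }

    int main() {
        // Producer dst [B, H, S, D] stored densely.
        plain_md_t dst {{2, 16, 384, 64}, {16 * 384 * 64, 384 * 64, 64, 1}};
        // A (0, 2, 1, 3) transpose to [B, S, H, D] only swaps two strides, so
        // the producer can emit this layout directly and the transpose op
        // disappears from the executed subgraph.
        plain_md_t transposed = permute(dst, {0, 2, 1, 3});
        assert((transposed.dims == std::vector<dim_t> {2, 384, 16, 64}));
        assert((transposed.strides
                == std::vector<dim_t> {16 * 384 * 64, 64, 384 * 64, 1}));
        return 0;
    }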

src/graph/backend/dnnl/passes/transform.hpp

+2-1
@@ -206,7 +206,8 @@ impl::status_t fuse_src_transpose_to_matmul(std::shared_ptr<subgraph_t> &sg);
 
 // This pass will compute matmul with the dst layout of following transpose if
 // the operator after transpose need a dense layout
-impl::status_t fuse_dst_transpose_to_matmul(std::shared_ptr<subgraph_t> &sg);
+impl::status_t fuse_dst_transpose_to_predecessor(
+        std::shared_ptr<subgraph_t> &sg);
 
 // This pass will fuse all the reshape to its lead op for GQA.
 impl::status_t fuse_reshape_for_gqa(std::shared_ptr<subgraph_t> &sg);
