Skip to content

Commit b89f51b

Browse files
committed Mar 21, 2025
graph: dnnl: add sdpa primitive ukernel v1
1 parent b5d090f commit b89f51b

12 files changed

+524
-17
lines changed
 

‎src/graph/backend/dnnl/kernels/large_partition.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ void larger_partition_kernel_t::setup_pipeline_stage2(pass_pipeline_t &pipeline,
142142
}
143143
BACKEND_DNNL_ADD_PASS(pipeline, infer_shape);
144144
BACKEND_DNNL_ADD_PASS(pipeline, fuse_src_transpose_to_matmul);
145-
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
145+
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
146146
BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
147147
BACKEND_DNNL_ADD_PASS(pipeline, common_reorder_elimination);
148148
BACKEND_DNNL_ADD_PASS(pipeline, fuse_adjacent_reorders);

‎src/graph/backend/dnnl/kernels/matmul.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ status_t matmul_t<quantized>::compile_impl(const dnnl_partition_impl_t *part,
110110
}
111111

112112
BACKEND_DNNL_ADD_PASS(pipeline, infer_shape);
113-
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
113+
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
114114
BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
115115

116116
BACKEND_DNNL_ADD_PASS(pipeline, fuse_adjacent_reorders);

‎src/graph/backend/dnnl/kernels/mqa_decomp.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ status_t mqa_decomp_kernel_t<quantized, dt>::compile_impl(
8787
BACKEND_DNNL_ADD_PASS(pipeline, remove_quant_data_with_no_effect);
8888
}
8989
pipeline.reset_visualize_arg(true, false);
90-
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
90+
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
9191
BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
9292

9393
// Run the added passes

‎src/graph/backend/dnnl/kernels/sdp.hpp

+10-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright 2024 Intel Corporation
2+
* Copyright 2024-2025 Intel Corporation
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
2727
#include "graph/backend/dnnl/kernels/large_partition.hpp"
2828
#include "graph/backend/dnnl/kernels/sdp_decomp.hpp"
2929
#include "graph/backend/dnnl/kernels/sdp_primitive.hpp"
30+
#include "graph/backend/dnnl/kernels/sdp_primitive_v1.hpp"
3031

3132
#include "graph/backend/dnnl/dnnl_partition_impl.hpp"
3233

@@ -65,7 +66,15 @@ struct sdp_base_t : public kernel_base_t {
6566

6667
status_t ret = status::unimplemented;
6768

69+
// SDPA Ukernel v1 with fused internal sdpa solution. Support float sdpa
70+
// only.
71+
// TODO(GX): Support quantized sdpa and merge with sdp_primitive_kernel_t.
6872
if (enable_ukernel) {
73+
kernel = std::make_shared<sdp_primitive_v1_kernel_t<quantized>>();
74+
ret = kernel->compile_impl(part, g_engine, inputs, outputs);
75+
}
76+
77+
if (ret != status::success && enable_ukernel) {
6978
kernel = std::make_shared<sdp_primitive_kernel_t<quantized>>();
7079
ret = kernel->compile_impl(part, g_engine, inputs, outputs);
7180
}

‎src/graph/backend/dnnl/kernels/sdp_decomp.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ status_t sdp_decomp_kernel_t<quantized, dt>::compile_impl(
8686
BACKEND_DNNL_ADD_PASS(pipeline, remove_quant_data_with_no_effect);
8787
}
8888
pipeline.reset_visualize_arg(true, false);
89-
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
89+
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
9090
BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
9191

9292
// Run the added passes

‎src/graph/backend/dnnl/kernels/sdp_primitive.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ status_t sdp_primitive_kernel_t<quantized>::compile_impl(
9292

9393
pipeline.reset_visualize_arg(true, false);
9494
BACKEND_DNNL_ADD_PASS(pipeline, infer_shape);
95-
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_matmul);
95+
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
9696
BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
9797

9898
// bind the memory for each op

‎src/graph/backend/dnnl/kernels/sdp_primitive_config.cpp

+19-1
Original file line numberDiff line numberDiff line change
@@ -166,11 +166,29 @@ status_t sdp_primitive_config_t::locate_io(std::shared_ptr<subgraph_t> &sg,
166166

167167
status_t sdp_primitive_config_t::initial_check(
168168
const std::shared_ptr<subgraph_t> &sg,
169-
const std::vector<logical_tensor_t> &inputs) {
169+
const std::vector<logical_tensor_t> &inputs, bool v1_kenrel) {
170170
// At least 3 inputs: Q, K, V
171171
VCHECK_SDP_PRIMITIVE(inputs.size() >= 3, status::invalid_arguments,
172172
"At least 3 inputs are required");
173173

174+
// Ukernel doesn't support f32 datatype now
175+
VCHECK_SDP_PRIMITIVE(inputs[0].data_type != dnnl_data_type_t::dnnl_f32,
176+
status::invalid_arguments,
177+
"SDPA ukernel doesn't support f32 datatype now");
178+
179+
// Note: the sdpa_primitive_v1 kernel currently doesn't support the legacy GQA pattern.
180+
if (v1_kenrel) {
181+
for (auto &cur_op : sg->get_ops()) {
182+
if (cur_op->get_kind() == graph::op_kind::StaticReshape) {
183+
auto in = cur_op->get_input_value(0)->get_logical_tensor();
184+
auto out = cur_op->get_output_value(0)->get_logical_tensor();
185+
if (ltw(in).ndims() == 5 || ltw(out).ndims() == 5) {
186+
return status::unimplemented;
187+
}
188+
}
189+
}
190+
}
191+
174192
// step1(pattern check): Not support sdpa variants with select as mask
175193
// We already have a pattern matcher to ensure that the sdpa patterns
176194
// dispatch to here are knows ones, and we have quant check in sdpa base

‎src/graph/backend/dnnl/kernels/sdp_primitive_config.hpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ struct sdp_primitive_config_t {
8282
// 2. only support fp16 data type
8383
// 3. only support 4-dims tensor
8484
status_t initial_check(const std::shared_ptr<subgraph_t> &sg,
85-
const std::vector<logical_tensor_t> &inputs);
85+
const std::vector<logical_tensor_t> &inputs,
86+
bool v1_kenrel = false);
8687

8788
// Initialize parameters and primitive.
8889
status_t init(std::shared_ptr<subgraph_t> &sg, const dnnl::engine &p_engine,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
/*******************************************************************************
2+
* Copyright 2024-2025 Intel Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*******************************************************************************/
16+
17+
#include "graph/backend/dnnl/kernels/sdp_primitive_v1.hpp"
18+
19+
#include "common/sdpa_pd.hpp"
20+
21+
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
22+
#include "gpu/intel/ocl/stream.hpp"
23+
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
24+
#include "gpu/intel/sycl/stream.hpp"
25+
#endif
26+
27+
#include "graph/backend/dnnl/passes/compile_ops.hpp"
28+
#include "graph/backend/dnnl/passes/constant_propagation.hpp"
29+
#include "graph/backend/dnnl/passes/insert_ops.hpp"
30+
#include "graph/backend/dnnl/passes/layout_propagation.hpp"
31+
#include "graph/backend/dnnl/passes/lower.hpp"
32+
#include "graph/backend/dnnl/passes/memory_planning.hpp"
33+
#include "graph/backend/dnnl/passes/transform.hpp"
34+
#include "graph/backend/dnnl/passes/utils.hpp"
35+
36+
#include "graph/backend/dnnl/op_executable.hpp"
37+
38+
namespace dnnl {
39+
namespace impl {
40+
namespace graph {
41+
namespace dnnl_impl {
42+
43+
template <bool quantized>
44+
status_t sdp_primitive_v1_kernel_t<quantized>::compile_impl(
45+
const dnnl_partition_impl_t *part, const engine_t *g_engine,
46+
const std::vector<logical_tensor_t> &inputs,
47+
const std::vector<logical_tensor_t> &outputs) {
48+
// sdp_primitive_v1_kernel_t only supports Intel GPU.
49+
#if defined(DNNL_WITH_SYCL) && DNNL_GPU_VENDOR != DNNL_VENDOR_INTEL
50+
return status::unimplemented;
51+
#endif
52+
if (quantized) { return status::unimplemented; }
53+
54+
p_engine_ = make_dnnl_engine(*g_engine);
55+
g_alloc_
56+
= reinterpret_cast<graph::allocator_t *>(g_engine->get_allocator());
57+
58+
// First, dry run on a deep copy
59+
subgraph_
60+
= std::make_shared<subgraph_t>(graph_t::deep_copy(part->get_ops()),
61+
p_engine_, part->get_fpmath_mode(), false, true);
62+
CHECK(set_given_inputs_outputs(subgraph_, inputs, outputs));
63+
64+
CHECK(cfg_.initial_check(subgraph_, inputs, true));
65+
66+
subgraph_visualizer_t vis(part->id(), [this](const value_t *val) {
67+
return this->memory_planner_.get_memory_info(val);
68+
});
69+
pass_pipeline_t pipeline = pass_pipeline_t(vis);
70+
71+
BACKEND_DNNL_ADD_PASS(pipeline, lower_down);
72+
BACKEND_DNNL_ADD_PASS(pipeline, fuse_implicit_causal_mask);
73+
BACKEND_DNNL_ADD_PASS(pipeline, binary_canonicalization);
74+
BACKEND_DNNL_ADD_PASS(pipeline, insert_permute_for_matmul);
75+
76+
pipeline.reset_visualize_arg(true, false);
77+
BACKEND_DNNL_ADD_PASS(pipeline, infer_shape);
78+
BACKEND_DNNL_ADD_PASS(pipeline, fuse_src_transpose_to_matmul);
79+
BACKEND_DNNL_ADD_PASS(pipeline, fuse_sdpa);
80+
BACKEND_DNNL_ADD_PASS(pipeline, fuse_dst_transpose_to_predecessor);
81+
BACKEND_DNNL_ADD_PASS(pipeline, layout_propagation);
82+
83+
// bind the memory for each op
84+
auto memory_plan = [&](std::shared_ptr<subgraph_t> &sg) {
85+
return memory_planner_.run(sg);
86+
};
87+
pipeline.reset_visualize_arg(true, true);
88+
BACKEND_DNNL_ADD_PASS(pipeline, memory_plan);
89+
BACKEND_DNNL_ADD_PASS(pipeline, compile_ops);
90+
91+
// Run the added passes
92+
BACKEND_DNNL_CHECK(pipeline.run(subgraph_));
93+
94+
// fill information for inputs logical tensors
95+
for (size_t i = 0; i < inputs.size(); i++) {
96+
auto &in = const_cast<logical_tensor_t &>(inputs[i]);
97+
in = subgraph_->ins_[i];
98+
}
99+
100+
// fill information for outputs logical tensors
101+
for (size_t i = 0; i < outputs.size(); i++) {
102+
auto &out = const_cast<logical_tensor_t &>(outputs[i]);
103+
out = subgraph_->outs_[i];
104+
}
105+
106+
resource_ctor_ = [this]() {
107+
return this->memory_planner_.get_exec_args_set().clone();
108+
};
109+
110+
return status::success;
111+
}
112+
113+
template <bool quantized>
114+
void sdp_primitive_v1_kernel_t<quantized>::prepare_args_set(
115+
const execution_args_set_t *res, const std::vector<tensor_t> &inputs,
116+
const std::vector<tensor_t> &outputs, const scratchpad_t &scratchpad) {
117+
// update the data of partition in/outputs args
118+
for (const auto &mem_idx : res->get_mems_use_external_inputs()) {
119+
mem_idx.first.set_data_handle(inputs[mem_idx.second].get_data_handle());
120+
}
121+
for (const auto &mem_idx : res->get_mems_use_external_outputs()) {
122+
mem_idx.first.set_data_handle(
123+
outputs[mem_idx.second].get_data_handle());
124+
}
125+
126+
grantor_t var_grantor = memory_planner_.internal_temporary_grantor(
127+
scratchpad.get_buffer());
128+
129+
for (auto &mem_offkey : res->get_mems_use_internal_temporary()) {
130+
mem_offkey.first.set_data_handle(var_grantor.get(mem_offkey.second));
131+
}
132+
}
133+
134+
template <bool quantized>
135+
status_t sdp_primitive_v1_kernel_t<quantized>::execute_impl(
136+
const stream_t *g_stream, const std::vector<tensor_t> &inputs,
137+
const std::vector<tensor_t> &outputs) {
138+
dnnl::stream p_stream = make_dnnl_stream(p_engine_, *g_stream);
139+
140+
thread_local_cache_t<execution_args_set_t> res_cache;
141+
execution_args_set_t *res = res_cache.get_or_add(
142+
reinterpret_cast<size_t>(this), resource_ctor_);
143+
144+
temporary_scratchpad_t scratchpad(
145+
memory_planner_.total_internal_temporary_size(), p_engine_,
146+
*g_alloc_);
147+
prepare_args_set(res, inputs, outputs, scratchpad);
148+
149+
for (size_t i = 0; i < subgraph_->execs_.size(); i++) {
150+
subgraph_->execs_[i]->execute(p_stream, res->get_exec_args()[i]);
151+
}
152+
153+
return status::success;
154+
}
155+
156+
#ifdef DNNL_WITH_SYCL
157+
template <bool quantized>
158+
status_t sdp_primitive_v1_kernel_t<quantized>::sycl_execute_impl(
159+
const stream_t *g_stream, const std::vector<tensor_t> &inputs,
160+
const std::vector<tensor_t> &outputs,
161+
const std::vector<::sycl::event> &sycl_deps,
162+
::sycl::event *sycl_event) {
163+
// sdp_primitive_v1_kernel_t only supports Intel GPU.
164+
#if DNNL_GPU_VENDOR != DNNL_VENDOR_INTEL
165+
return status::unimplemented;
166+
#endif
167+
auto deps = sycl_deps;
168+
::sycl::event returned_event;
169+
170+
dnnl::stream p_stream = make_dnnl_stream(p_engine_, *g_stream);
171+
172+
thread_local_cache_t<execution_args_set_t> res_cache;
173+
execution_args_set_t *res = res_cache.get_or_add(
174+
reinterpret_cast<size_t>(this), resource_ctor_);
175+
176+
temporary_scratchpad_t scratchpad(
177+
memory_planner_.total_internal_temporary_size(), p_engine_,
178+
*g_alloc_);
179+
prepare_args_set(res, inputs, outputs, scratchpad);
180+
181+
for (size_t i = 0; i < subgraph_->execs_.size(); i++) {
182+
if (subgraph_->is_constant_[i]) continue;
183+
returned_event = subgraph_->execs_[i]->execute_sycl(
184+
p_stream, res->get_exec_args()[i], deps);
185+
deps = {returned_event};
186+
}
187+
188+
scratchpad.set_deps(returned_event);
189+
if (sycl_event) *sycl_event = returned_event;
190+
191+
return status::success;
192+
}
193+
#endif
194+
195+
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
196+
template <bool quantized>
197+
status_t sdp_primitive_v1_kernel_t<quantized>::ocl_execute_impl(
198+
const stream_t *g_stream, const std::vector<tensor_t> &inputs,
199+
const std::vector<tensor_t> &outputs,
200+
const std::vector<cl_event> &cl_deps, cl_event *ret_event) {
201+
// sdp_primitive_v1_kernel_t only supports Intel GPU.
202+
#if DNNL_GPU_VENDOR != DNNL_VENDOR_INTEL
203+
return status::unimplemented;
204+
#endif
205+
auto deps = cl_deps;
206+
cl_event returned_event {};
207+
208+
dnnl::stream p_stream = make_dnnl_stream(p_engine_, *g_stream);
209+
210+
thread_local_cache_t<execution_args_set_t> res_cache;
211+
execution_args_set_t *res = res_cache.get_or_add(
212+
reinterpret_cast<size_t>(this), resource_ctor_);
213+
214+
temporary_scratchpad_t scratchpad(
215+
memory_planner_.total_internal_temporary_size(), p_engine_,
216+
*g_alloc_);
217+
prepare_args_set(res, inputs, outputs, scratchpad);
218+
219+
for (size_t i = 0; i < subgraph_->execs_.size(); i++) {
220+
if (subgraph_->is_constant_[i]) continue;
221+
returned_event = subgraph_->execs_[i]->execute_ocl(
222+
p_stream, res->get_exec_args()[i], deps);
223+
deps = {returned_event};
224+
}
225+
226+
scratchpad.set_deps(returned_event);
227+
if (ret_event) *ret_event = returned_event;
228+
229+
return status::success;
230+
}
231+
#endif
232+
233+
template struct sdp_primitive_v1_kernel_t<false>;
234+
template struct sdp_primitive_v1_kernel_t<true>;
235+
236+
} // namespace dnnl_impl
237+
} // namespace graph
238+
} // namespace impl
239+
} // namespace dnnl

0 commit comments

Comments
 (0)