graph: dnnl: comments fix

xiang1guo · xiang1guo · commit 9f084f6f9fa7 · 2025-03-24T09:01:43.000Z
diff --git a/src/graph/backend/dnnl/dnnl_op_def.hpp b/src/graph/backend/dnnl/dnnl_op_def.hpp
@@ -1134,6 +1134,9 @@ DNNL_GRAPH_OP_SCHEMA(dnnl_mask, 1,
                 .SET_EXECUTABLE_CREATOR(executable_creator<memory_reparser_t>)
                 .SET_ARG_INDICES_GETTER(memory_reparser_t))
 
+// The data types of query/key/value/mask/output must be consistent, and only
+// f16/bf16 are supported. The data type of scale must either match the other
+// input and output data types or be f32.
 DNNL_GRAPH_OP_SCHEMA(dnnl_sdpa, 1,
         op_schema_t()
                 .set_inputs_option(op_schema_t::param_num_option::variadic)
@@ -1152,8 +1155,6 @@ DNNL_GRAPH_OP_SCHEMA(dnnl_sdpa, 1,
                 .set_attr(op_attr::with_mask, true, attribute_kind::b)
                 // with_causal attribute support top-left mask type only
                 .set_attr(op_attr::with_causal, true, attribute_kind::b)
-                .set_attr(op_attr::fusion_info_key, false, attribute_kind::i,
-                        (int64_t)-1)
                 .set_shape_inference_function(infer_dnnl_sdpa_output_shape)
                 .SET_LAYOUT_PROPAGATOR(layout_propagator_for_sdpa)
                 .SET_EXECUTABLE_CREATOR(executable_creator<sdpa_executable_t>)
diff --git a/src/graph/backend/dnnl/kernels/sdp.hpp b/src/graph/backend/dnnl/kernels/sdp.hpp
@@ -69,8 +69,8 @@ struct sdp_base_t : public kernel_base_t {
         // SDPA Ukernel v1 with fused internal sdpa solution. Support fload sdpa
         // only.
         // TODO(GX): Support quantized sdpa and merge with sdp_primitive_kernel_t.
-        if (enable_ukernel) {
-            kernel = std::make_shared<sdp_primitive_v1_kernel_t<quantized>>();
+        if (enable_ukernel && !quantized) {
+            kernel = std::make_shared<sdp_primitive_v1_kernel_t>();
             ret = kernel->compile_impl(part, g_engine, inputs, outputs);
         }
 
diff --git a/src/graph/backend/dnnl/kernels/sdp_primitive_config.cpp b/src/graph/backend/dnnl/kernels/sdp_primitive_config.cpp
@@ -166,7 +166,7 @@ status_t sdp_primitive_config_t::locate_io(std::shared_ptr<subgraph_t> &sg,
 
 status_t sdp_primitive_config_t::initial_check(
         const std::shared_ptr<subgraph_t> &sg,
-        const std::vector<logical_tensor_t> &inputs, bool v1_kenrel) {
+        const std::vector<logical_tensor_t> &inputs, bool v1_kernel) {
     // At least 3 inputs: Q, K, V
     VCHECK_SDP_PRIMITIVE(inputs.size() >= 3, status::invalid_arguments,
             "At least 3 inputs are required");
@@ -177,7 +177,7 @@ status_t sdp_primitive_config_t::initial_check(
             "SDPA ukernel doesn't support f32 datatype now");
 
     // Note: sdpa_primitive_v1 kernel currently don't support legacy GQA pattern.
-    if (v1_kenrel) {
+    if (v1_kernel) {
         for (auto &cur_op : sg->get_ops()) {
             if (cur_op->get_kind() == graph::op_kind::StaticReshape) {
                 auto in = cur_op->get_input_value(0)->get_logical_tensor();
diff --git a/src/graph/backend/dnnl/kernels/sdp_primitive_config.hpp b/src/graph/backend/dnnl/kernels/sdp_primitive_config.hpp
@@ -83,7 +83,7 @@ struct sdp_primitive_config_t {
     // 3. only support 4-dims tensor
     status_t initial_check(const std::shared_ptr<subgraph_t> &sg,
             const std::vector<logical_tensor_t> &inputs,
-            bool v1_kenrel = false);
+            bool v1_kernel = false);
 
     // Initialize parameters and primitive.
     status_t init(std::shared_ptr<subgraph_t> &sg, const dnnl::engine &p_engine,
diff --git a/src/graph/backend/dnnl/kernels/sdp_primitive_v1.cpp b/src/graph/backend/dnnl/kernels/sdp_primitive_v1.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2024-2025 Intel Corporation
+* Copyright 2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -40,16 +40,14 @@ namespace impl {
 namespace graph {
 namespace dnnl_impl {
 
-template <bool quantized>
-status_t sdp_primitive_v1_kernel_t<quantized>::compile_impl(
+status_t sdp_primitive_v1_kernel_t::compile_impl(
         const dnnl_partition_impl_t *part, const engine_t *g_engine,
         const std::vector<logical_tensor_t> &inputs,
         const std::vector<logical_tensor_t> &outputs) {
 // sdp_primitive_v1_kernel_t only supports Intel GPU.
 #if defined(DNNL_WITH_SYCL) && DNNL_GPU_VENDOR != DNNL_VENDOR_INTEL
     return status::unimplemented;
 #endif
-    if (quantized) { return status::unimplemented; }
 
     p_engine_ = make_dnnl_engine(*g_engine);
     g_alloc_
@@ -110,8 +108,7 @@ status_t sdp_primitive_v1_kernel_t<quantized>::compile_impl(
     return status::success;
 }
 
-template <bool quantized>
-void sdp_primitive_v1_kernel_t<quantized>::prepare_args_set(
+void sdp_primitive_v1_kernel_t::prepare_args_set(
         const execution_args_set_t *res, const std::vector<tensor_t> &inputs,
         const std::vector<tensor_t> &outputs, const scratchpad_t &scratchpad) {
     // update the data of partition in/outputs args
@@ -131,9 +128,8 @@ void sdp_primitive_v1_kernel_t<quantized>::prepare_args_set(
     }
 }
 
-template <bool quantized>
-status_t sdp_primitive_v1_kernel_t<quantized>::execute_impl(
-        const stream_t *g_stream, const std::vector<tensor_t> &inputs,
+status_t sdp_primitive_v1_kernel_t::execute_impl(const stream_t *g_stream,
+        const std::vector<tensor_t> &inputs,
         const std::vector<tensor_t> &outputs) {
     dnnl::stream p_stream = make_dnnl_stream(p_engine_, *g_stream);
 
@@ -154,9 +150,8 @@ status_t sdp_primitive_v1_kernel_t<quantized>::execute_impl(
 }
 
 #ifdef DNNL_WITH_SYCL
-template <bool quantized>
-status_t sdp_primitive_v1_kernel_t<quantized>::sycl_execute_impl(
-        const stream_t *g_stream, const std::vector<tensor_t> &inputs,
+status_t sdp_primitive_v1_kernel_t::sycl_execute_impl(const stream_t *g_stream,
+        const std::vector<tensor_t> &inputs,
         const std::vector<tensor_t> &outputs,
         const std::vector<::sycl::event> &sycl_deps,
         ::sycl::event *sycl_event) {
@@ -193,9 +188,8 @@ status_t sdp_primitive_v1_kernel_t<quantized>::sycl_execute_impl(
 #endif
 
 #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
-template <bool quantized>
-status_t sdp_primitive_v1_kernel_t<quantized>::ocl_execute_impl(
-        const stream_t *g_stream, const std::vector<tensor_t> &inputs,
+status_t sdp_primitive_v1_kernel_t::ocl_execute_impl(const stream_t *g_stream,
+        const std::vector<tensor_t> &inputs,
         const std::vector<tensor_t> &outputs,
         const std::vector<cl_event> &cl_deps, cl_event *ret_event) {
 // sdp_primitive_v1_kernel_t only supports Intel GPU.
@@ -230,8 +224,7 @@ status_t sdp_primitive_v1_kernel_t<quantized>::ocl_execute_impl(
 }
 #endif
 
-template struct sdp_primitive_v1_kernel_t<false>;
-template struct sdp_primitive_v1_kernel_t<true>;
+struct sdp_primitive_v1_kernel_t;
 
 } // namespace dnnl_impl
 } // namespace graph
diff --git a/src/graph/backend/dnnl/kernels/sdp_primitive_v1.hpp b/src/graph/backend/dnnl/kernels/sdp_primitive_v1.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2024-2025 Intel Corporation
+* Copyright 2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef GRAPH_BACKEND_DNNL_KERNELS_sdp_primitive_v1_HPP
-#define GRAPH_BACKEND_DNNL_KERNELS_sdp_primitive_v1_HPP
+#ifndef GRAPH_BACKEND_DNNL_KERNELS_SDP_PRIMITIVE_V1_HPP
+#define GRAPH_BACKEND_DNNL_KERNELS_SDP_PRIMITIVE_V1_HPP
 
 #include <algorithm>
 #include <memory>
@@ -40,7 +40,6 @@ namespace impl {
 namespace graph {
 namespace dnnl_impl {
 
-template <bool quantized>
 struct sdp_primitive_v1_kernel_t : public kernel_base_t {
 private:
     allocator_t *g_alloc_ = nullptr;
diff --git a/src/graph/backend/dnnl/layout_propagator.cpp b/src/graph/backend/dnnl/layout_propagator.cpp
@@ -1580,7 +1580,8 @@ status_t layout_propagator_for_sdpa(std::shared_ptr<op_t> &op,
     const logical_tensor_t &out_lt = dst_val->get_logical_tensor();
 
     dnnl::memory::desc expected_md;
-    // Set default output layout format for sdpa as acbd
+    // Set the default output layout format for sdpa to acbd if the user
+    // doesn't specify a layout, since no reorder is then required after sdpa.
     if (ltw(out_lt).is_any()) {
         expected_md = {ltw(out_lt).vdims(),
                 static_cast<dnnl::memory::data_type>(ltw(out_lt).data_type()),
diff --git a/src/graph/backend/dnnl/op_executable.hpp b/src/graph/backend/dnnl/op_executable.hpp
@@ -2689,12 +2689,19 @@ struct sdpa_executable_t : public op_executable_t {
                 md_k.get(), md_v.get(), md_dst.get(), md_mask.get(), scale_dt,
                 is_invert_scale_, kv_head_number, is_causal_mask_, attr.get());
         if (s != dnnl::impl::status::success) {
-            throw std::runtime_error("create_sdpa_pd failed");
+            is_initialized_ = false;
+        } else {
+            status_t s = sdpa_pd_->create_primitive(sdpa_prim_, p_engine.get());
+            if (s != dnnl::impl::status::success) {
+                is_initialized_ = false;
+            } else {
+                is_initialized_ = true;
+            }
         }
-
-        sdpa_pd_->create_primitive(sdpa_prim_, p_engine.get());
     }
 
+    bool is_initialized() { return is_initialized_; }
+
     void execute(const stream &stream,
             const std::unordered_map<int, memory> &args) const override {
         exec_args_t exec_args;
@@ -2819,6 +2826,7 @@ struct sdpa_executable_t : public op_executable_t {
     bool with_mask_;
     bool is_invert_scale_;
     bool is_causal_mask_;
+    bool is_initialized_;
 };
 
 } // namespace dnnl_impl
diff --git a/src/graph/backend/dnnl/passes/compile_ops.cpp b/src/graph/backend/dnnl/passes/compile_ops.cpp
@@ -60,19 +60,20 @@ status_t compile_ops(std::shared_ptr<subgraph_t> &sg) {
         auto creator = opm->get_additional_item<executable_creator_func>(
                 "executable_creator");
 
-        try {
-            std::shared_ptr<op_executable_t> exec
-                    = creator(cur_op, p_engine, mgr, pd_cache);
-            VCHECK_COMPILE_OPS(exec != nullptr, status::invalid_graph_op,
-                    "unimplemented op, can't compile op %s",
+        std::shared_ptr<op_executable_t> exec
+                = creator(cur_op, p_engine, mgr, pd_cache);
+        VCHECK_COMPILE_OPS(exec != nullptr, status::invalid_graph_op,
+                "unimplemented op, can't compile op %s",
+                op->get_name().c_str());
+        if (cur_op->get_kind() == op_kind::dnnl_sdpa) {
+            auto sdpa_exec = std::dynamic_pointer_cast<sdpa_executable_t>(exec);
+            VCHECK_COMPILE_OPS(sdpa_exec->is_initialized(),
+                    status::unimplemented,
+                    "failed to create executable for op %s",
                     op->get_name().c_str());
-
-            sg->execs_.emplace_back(exec);
-        } catch (const std::runtime_error &e) {
-            VCHECK_COMPILE_OPS(false, status::unimplemented,
-                    "failed to create executable for op %s: %s",
-                    op->get_name().c_str(), e.what());
         }
+        sg->execs_.emplace_back(exec);
+
         sg->is_constant_.push_back(op->has_attr(op_attr::is_constant)
                 && op->get_attr<bool>(op_attr::is_constant));
         return status::success;