
Commit b5d090f

graph: dnnl: add internal sdpa op

1 parent dac23cd · commit b5d090f

10 files changed: +331 −1 lines changed

src/graph/backend/dnnl/dnnl_op_def.hpp (+25)

@@ -1134,6 +1134,31 @@ DNNL_GRAPH_OP_SCHEMA(dnnl_mask, 1,
         .SET_EXECUTABLE_CREATOR(executable_creator<memory_reparser_t>)
         .SET_ARG_INDICES_GETTER(memory_reparser_t))
 
+DNNL_GRAPH_OP_SCHEMA(dnnl_sdpa, 1,
+        op_schema_t()
+                .set_inputs_option(op_schema_t::param_num_option::variadic)
+                .set_num_inputs(std::set<size_t>({3, 32}))
+                .set_num_outputs(2)
+                .set_input(0, "query")
+                .set_input(1, "key")
+                .set_input(2, "value")
+                .set_input(3, "scale") // optional
+                .set_input(4, "mask") // optional
+                .set_output(0, "output")
+                .set_output(1, "scratchpad")
+                .set_attr(op_attr::with_scale, true, attribute_kind::b)
+                .set_attr(op_attr::is_invert_scale, false, attribute_kind::b,
+                        false)
+                .set_attr(op_attr::with_mask, true, attribute_kind::b)
+                // the with_causal attribute supports the top-left mask type only
+                .set_attr(op_attr::with_causal, true, attribute_kind::b)
+                .set_attr(op_attr::fusion_info_key, false, attribute_kind::i,
+                        (int64_t)-1)
+                .set_shape_inference_function(infer_dnnl_sdpa_output_shape)
+                .SET_LAYOUT_PROPAGATOR(layout_propagator_for_sdpa)
+                .SET_EXECUTABLE_CREATOR(executable_creator<sdpa_executable_t>)
+                .SET_ARG_INDICES_GETTER(sdpa_executable_t))
+
 } // namespace dnnl_impl
 } // namespace graph
 } // namespace impl
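For orientation: the schema encodes plain scaled dot-product attention, out = softmax(scale * Q x K + mask) * V, where the key is supplied pre-transposed (see the shape comments in dnnl_shape_infer.cpp below). The sketch that follows is a minimal plain-C++ reference of those semantics, written for this page only; the function, its dense row-major layout assumption, and the requirement that out be pre-sized to mb*heads*sq*dv are illustrative, not backend code (the explicit mask input is omitted for brevity):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <limits>
    #include <vector>

    // Q: [mb, heads, sq, dk], K: [mb, heads, dk, skv] (pre-transposed, as in
    // the schema comments), V: [mb, heads, skv, dv] -> out: [mb, heads, sq, dv]
    void sdpa_reference(const std::vector<float> &Q, const std::vector<float> &K,
            const std::vector<float> &V, std::vector<float> &out, int64_t mb,
            int64_t heads, int64_t sq, int64_t skv, int64_t dk, int64_t dv,
            float scale, bool is_invert_scale, bool with_causal) {
        const float s = is_invert_scale ? 1.f / scale : scale;
        for (int64_t b = 0; b < mb * heads; ++b) {
            const float *q = &Q[b * sq * dk];
            const float *k = &K[b * dk * skv];
            const float *v = &V[b * skv * dv];
            float *o = &out[b * sq * dv];
            for (int64_t i = 0; i < sq; ++i) {
                std::vector<float> row(skv);
                for (int64_t j = 0; j < skv; ++j) {
                    float acc = 0.f; // scores = Q x K; K is stored transposed
                    for (int64_t c = 0; c < dk; ++c)
                        acc += q[i * dk + c] * k[c * skv + j];
                    // top-left causal mask: query i attends to keys j <= i only
                    row[j] = (with_causal && j > i)
                            ? -std::numeric_limits<float>::infinity()
                            : acc * s;
                }
                float mx = *std::max_element(row.begin(), row.end());
                float sum = 0.f; // numerically stable softmax over the row
                for (float &x : row) { x = std::exp(x - mx); sum += x; }
                for (int64_t c = 0; c < dv; ++c) {
                    float acc = 0.f;
                    for (int64_t j = 0; j < skv; ++j)
                        acc += (row[j] / sum) * v[j * dv + c];
                    o[i * dv + c] = acc;
                }
            }
        }
    }

Note how is_invert_scale selects between multiplying by scale and by 1/scale, matching the boolean attribute pair declared in the schema above.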

src/graph/backend/dnnl/dnnl_opset.hpp (+1)

@@ -97,6 +97,7 @@ class dnnl_opset_t {
         fn(get_op_schema<DNNL_GRAPH_OP_SCHEMA_CLASS_NAME(dnnl_layernorm, 1)>());
         fn(get_op_schema<DNNL_GRAPH_OP_SCHEMA_CLASS_NAME(dnnl_reorder, 1)>());
         fn(get_op_schema<DNNL_GRAPH_OP_SCHEMA_CLASS_NAME(dnnl_groupnorm, 1)>());
+        fn(get_op_schema<DNNL_GRAPH_OP_SCHEMA_CLASS_NAME(dnnl_sdpa, 1)>());
     }
 };

src/graph/backend/dnnl/dnnl_shape_infer.cpp (+57)

@@ -545,6 +545,63 @@ status_t infer_dnnl_binary_output_shape(op_t *n,
     }
 }
 
+status_t infer_dnnl_sdpa_output_shape(op_t *n,
+        std::vector<logical_tensor_t *> &inputs,
+        std::vector<logical_tensor_t *> &outputs) {
+    // [batch_size, num_heads_q, seq_len_q, head_size_qk]
+    auto query = logical_tensor_wrapper_t(inputs[0]);
+    // [batch_size, num_heads_q, head_size_qk, seq_len_kv]
+    auto key = logical_tensor_wrapper_t(inputs[1]);
+    // [batch_size, num_heads_v, seq_len_kv, head_size_v]
+    auto value = logical_tensor_wrapper_t(inputs[2]);
+    // [batch_size, num_heads_q, seq_len_q, head_size_v]
+    auto out0 = logical_tensor_wrapper_t(outputs[0]);
+
+    dims query_dims = query.vdims();
+    dims key_dims = key.vdims();
+    dims value_dims = value.vdims();
+
+    VCHECK_INVALID_SHAPE((query_dims.size() == key_dims.size()
+                                 && key_dims.size() == value_dims.size()),
+            "%s, the ranks of all inputs should match. input0 dims: %s, "
+            "input1 dims: %s, input2 dims: %s",
+            op_t::kind2str(n->get_kind()).c_str(), dims2str(query_dims).c_str(),
+            dims2str(key_dims).c_str(), dims2str(value_dims).c_str());
+
+    VCHECK_INVALID_SHAPE((query_dims.size() == 4),
+            "%s, only 4D q/k/v inputs are supported. input0 ndims: %s, "
+            "input1 ndims: %s, input2 ndims: %s",
+            op_t::kind2str(n->get_kind()).c_str(),
+            std::to_string(query_dims.size()).c_str(),
+            std::to_string(key_dims.size()).c_str(),
+            std::to_string(value_dims.size()).c_str());
+
+    VCHECK_INVALID_SHAPE((query_dims[3] == key_dims[2]),
+            "%s, query head size should match key head size. query dims: %s, "
+            "key dims: %s",
+            op_t::kind2str(n->get_kind()).c_str(), dims2str(query_dims).c_str(),
+            dims2str(key_dims).c_str());
+
+    VCHECK_INVALID_SHAPE((key_dims[3] == value_dims[2]),
+            "%s, key sequence length should match value sequence length. "
+            "key dims: %s, value dims: %s",
+            op_t::kind2str(n->get_kind()).c_str(), dims2str(key_dims).c_str(),
+            dims2str(value_dims).c_str());
+
+    dims inferred_output_shape;
+    inferred_output_shape
+            = {query_dims[0], query_dims[1], query_dims[2], value_dims[3]};
+
+    if (out0.ndims() != -1) {
+        VCHECK_INVALID_SHAPE(validate(inferred_output_shape, out0.vdims()),
+                "%s, inferred output shape and given output shape are not "
+                "compatible",
+                op_t::kind2str(n->get_kind()).c_str());
+    }
+
+    set_shape_and_strides(*outputs[0], inferred_output_shape);
+    return status::success;
+}
+
 } // namespace dnnl_impl
 } // namespace graph
 } // namespace impl
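A concrete instance of the rule above, with shapes invented for illustration:

    query [2, 16, 384, 64]  // [batch, num_heads_q, seq_len_q, head_size_qk]
    key   [2, 16, 64, 512]  // [batch, num_heads_q, head_size_qk, seq_len_kv]
    value [2, 16, 512, 64]  // [batch, num_heads_v, seq_len_kv, head_size_v]

Here query_dims[3] == key_dims[2] == 64 and key_dims[3] == value_dims[2] == 512, so all checks pass, and the inferred output shape is {query_dims[0], query_dims[1], query_dims[2], value_dims[3]} = [2, 16, 384, 64].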

src/graph/backend/dnnl/dnnl_shape_infer.hpp (+4)

@@ -107,6 +107,10 @@ status_t infer_binary_select_output_shape(op_t *n,
         std::vector<logical_tensor_t *> &inputs,
         std::vector<logical_tensor_t *> &outputs);
 
+status_t infer_dnnl_sdpa_output_shape(op_t *n,
+        std::vector<logical_tensor_t *> &inputs,
+        std::vector<logical_tensor_t *> &outputs);
+
 } // namespace dnnl_impl
 } // namespace graph
 } // namespace impl

src/graph/backend/dnnl/internal_attrs.hpp (+8)

@@ -45,6 +45,10 @@ const op_attr_t with_runtime_dst_zps = 0x1000c;
 const op_attr_t is_bias_add = 0x1000d;
 const op_attr_t with_sum = 0x1000e;
 const op_attr_t keep_dst_layout = 0x1000f;
+const op_attr_t with_scale = 0x10010;
+const op_attr_t is_invert_scale = 0x10011;
+const op_attr_t with_causal = 0x10012;
+const op_attr_t with_mask = 0x10013;
 
 // int64_t
 const op_attr_t alg_kind = 0x10100;

@@ -86,6 +90,10 @@ static inline std::string internal_attr2str(op_attr_t attr) {
         CASE(is_bias_add);
         CASE(with_sum);
         CASE(keep_dst_layout);
+        CASE(with_scale);
+        CASE(is_invert_scale);
+        CASE(with_causal);
+        CASE(with_mask);
         CASE(alg_kind);
         CASE(fusion_info_key);
         CASE(axis_row);

src/graph/backend/dnnl/internal_ops.hpp (+2 −1)

@@ -79,7 +79,8 @@ namespace op_kind {
     X(dnnl_convtranspose_bwd_weights, Dnnl_convtranspose_bwd_weights) \
     X(dnnl_groupnorm, Dnnl_groupnorm) \
     X(dnnl_gen_index, Dnnl_gen_index) \
-    X(dnnl_mask, Dnnl_mask)
+    X(dnnl_mask, Dnnl_mask) \
+    X(dnnl_sdpa, Dnnl_sdpa)
 
 enum kind_t {
     kDNNL_INTERNAL_OP_STARTER = 0x1234,
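The list above is an X-macro: every internal op appears exactly once, and the kind_t enum below it (plus the kind-to-string mapping elsewhere in the backend) is generated from the same list. A standalone sketch of the idiom, with simplified names (MY_INTERNAL_OPS, my_kind_t, and my_kind2str are illustrative, not the backend's actual macros):

    #include <string>

    // one list, expanded twice with different definitions of X
    #define MY_INTERNAL_OPS \
        X(dnnl_mask, Dnnl_mask) \
        X(dnnl_sdpa, Dnnl_sdpa)

    enum my_kind_t {
        kSTARTER = 0x1234,
    #define X(op_name, enum_name) k##enum_name,
        MY_INTERNAL_OPS
    #undef X
    };

    inline std::string my_kind2str(my_kind_t kind) {
        switch (kind) {
    #define X(op_name, enum_name) \
        case k##enum_name: return #op_name;
            MY_INTERNAL_OPS
    #undef X
            default: return "unknown";
        }
    }

Adding dnnl_sdpa to the one list is therefore enough to extend both the enum and the string table consistently.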

src/graph/backend/dnnl/layout_propagator.cpp (+29)

@@ -1568,6 +1568,35 @@ status_t layout_propagator_for_mask(std::shared_ptr<op_t> &op,
     return status;
 }
 
+status_t layout_propagator_for_sdpa(std::shared_ptr<op_t> &op,
+        const dnnl::engine &p_engine, fusion_info_mgr_t &mgr,
+        pd_cache_t &pd_cache, subgraph_rewriter_t &rewriter) {
+    UNUSED(p_engine);
+    UNUSED(mgr);
+    UNUSED(pd_cache);
+    UNUSED(rewriter);
+
+    value_ptr dst_val = op->get_output_value(0);
+    const logical_tensor_t &out_lt = dst_val->get_logical_tensor();
+
+    dnnl::memory::desc expected_md;
+    // set the default output layout format for sdpa to acbd
+    if (ltw(out_lt).is_any()) {
+        expected_md = {ltw(out_lt).vdims(),
+                static_cast<dnnl::memory::data_type>(ltw(out_lt).data_type()),
+                dnnl::memory::format_tag::acbd};
+    } else {
+        expected_md = make_dnnl_memory_desc(out_lt);
+    }
+    status_t status = fill_layout_info(dst_val, expected_md);
+
+    // fill the scratchpad dimensions and data type into the scratchpad value_t
+    value_ptr scratchpad_val = op->get_output_value(1);
+    const memory::desc scratchpad_desc;
+    status = fill_layout_info(scratchpad_val, scratchpad_desc);
+    return status;
+}
+
 } // namespace dnnl_impl
 } // namespace graph
 } // namespace impl
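About the acbd default: for the 4D sdpa output, dimension a is batch, b is heads, c is sequence length, and d is head size, so acbd keeps all heads of one token adjacent in memory (batch, seq, heads, head_size physical order). A small probe of the resulting strides, assuming the oneDNN v3.x C++ API (memory::desc::get_strides()) and a made-up output shape:

    #include <iostream>
    #include "dnnl.hpp"

    int main() {
        using namespace dnnl;
        // hypothetical sdpa output: batch=2, heads=16, seq_len=384, head_size=64
        memory::desc md({2, 16, 384, 64}, memory::data_type::f32,
                memory::format_tag::acbd);
        for (auto s : md.get_strides())
            std::cout << s << ' '; // prints "393216 64 1024 1"
        // stride(b) = 64 and stride(c) = 1024: heads are interleaved per token
        return 0;
    }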

src/graph/backend/dnnl/layout_propagator.hpp (+1)

@@ -93,6 +93,7 @@ DECLARE_LAYOUT_PROPAGATOR(add_zps);
 DECLARE_LAYOUT_PROPAGATOR(groupnorm);
 DECLARE_LAYOUT_PROPAGATOR(gen_index);
 DECLARE_LAYOUT_PROPAGATOR(mask);
+DECLARE_LAYOUT_PROPAGATOR(sdpa);
 
 #undef DECLARE_LAYOUT_PROPAGATOR
src/graph/backend/dnnl/op_executable.cpp (+23)

@@ -2405,6 +2405,29 @@ arg_indices_t genindex_executable_t::get_arg_indices(
     return arg_indices;
 }
 
+arg_indices_t sdpa_executable_t::get_arg_indices(
+        const op_t *op, fusion_info_mgr_t &mgr) {
+    UNUSED(mgr);
+
+    arg_indices_t arg_indices;
+    // add input args
+    size_t index = 0;
+    arg_indices.insert({DNNL_ARG_QUERIES, indices_t {input, index++}});
+    arg_indices.insert({DNNL_ARG_KEYS, indices_t {input, index++}});
+    arg_indices.insert({DNNL_ARG_VALUES, indices_t {input, index++}});
+    if (op->get_attr<bool>(dnnl::impl::graph::dnnl_impl::op_attr::with_scale)) {
+        arg_indices.insert({DNNL_ARG_SCALE, indices_t {input, index++}});
+    }
+    if (op->get_attr<bool>(dnnl::impl::graph::dnnl_impl::op_attr::with_mask)) {
+        arg_indices.insert({DNNL_ARG_ATTN_MASK, indices_t {input, index++}});
+    }
+
+    // add output args
+    arg_indices.insert({DNNL_ARG_DST, indices_t {output, 0}});
+    arg_indices.insert({DNNL_ARG_SCRATCHPAD, indices_t {output, 1}});
+    return arg_indices;
+}
+
 } // namespace dnnl_impl
 } // namespace graph
 } // namespace impl
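Because the optional inputs are packed densely after q/k/v, the runtime argument mapping depends on the attribute values. For a dnnl_sdpa op with with_scale = true and with_mask = true:

    input 0 -> DNNL_ARG_QUERIES      input 3 -> DNNL_ARG_SCALE
    input 1 -> DNNL_ARG_KEYS         input 4 -> DNNL_ARG_ATTN_MASK
    input 2 -> DNNL_ARG_VALUES
    output 0 -> DNNL_ARG_DST         output 1 -> DNNL_ARG_SCRATCHPAD

With with_scale = false, the mask (if present) maps to input 3 instead, since index++ only advances for inputs that actually exist.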
