
Commit 9442aa4

gyhintel authored and TaoLv committed
graph: backend: dnnl: enable genindex ref implementation for gpu
1 parent 6817bce commit 9442aa4

File tree

- src/gpu/intel/ocl/graph/gen_index.cl
- src/graph/backend/dnnl/kernels/gen_index.cpp
- src/graph/backend/dnnl/kernels/gen_index.hpp
- src/graph/backend/dnnl/op_executable.hpp
- src/graph/backend/dnnl/patterns/single_op_pattern.cpp

5 files changed: +229 −37 lines changed

src/gpu/intel/ocl/graph/gen_index.cl

+53
@@ -0,0 +1,53 @@
+/*******************************************************************************
+* Copyright 2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+__kernel void gen_index(__global int *dst, int axis) {
+    long id = get_global_id(0);
+    long result, offset = 0;
+    long idx;
+
+    idx = id % D0;
+    id = id / D0;
+    offset += idx * S0;
+    if (axis == 0) result = idx;
+
+    idx = id % D1;
+    id = id / D1;
+    offset += idx * S1;
+    if (axis == 1) result = idx;
+
+    idx = id % D2;
+    id = id / D2;
+    offset += idx * S2;
+    if (axis == 2) result = idx;
+
+    idx = id % D3;
+    id = id / D3;
+    offset += idx * S3;
+    if (axis == 3) result = idx;
+
+    idx = id % D4;
+    id = id / D4;
+    offset += idx * S4;
+    if (axis == 4) result = idx;
+
+    idx = id % D5;
+    id = id / D5;
+    offset += idx * S5;
+    if (axis == 5) result = idx;
+
+    dst[offset] = result;
+}
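How the kernel works: each work-item takes its flattened global id, peels off one coordinate per axis with successive modulo/divide steps (D0 … D5 are the destination dims, S0 … S5 the strides, both baked in as build-time macros), accumulates the strided destination offset, and stores the coordinate of the requested axis. A minimal host-side C++ sketch of the same arithmetic, with made-up dims/strides for illustration:

#include <cstdint>
#include <vector>

// Reference model of gen_index: for every element of a tensor with the
// given dims/strides, write that element's coordinate along `axis`.
void gen_index_ref(std::vector<int32_t> &dst, const std::vector<int64_t> &dims,
        const std::vector<int64_t> &strides, int axis) {
    int64_t nelems = 1;
    for (int64_t d : dims)
        nelems *= d;
    for (int64_t id = 0; id < nelems; ++id) {
        int64_t rem = id, offset = 0, result = 0;
        for (size_t d = 0; d < dims.size(); ++d) {
            int64_t idx = rem % dims[d]; // coordinate along axis d
            rem /= dims[d];
            offset += idx * strides[d]; // strided destination offset
            if (static_cast<int>(d) == axis) result = idx;
        }
        dst[offset] = static_cast<int32_t>(result);
    }
}

// Example: gen_index_ref(dst, {2, 3, 4}, {12, 4, 1}, /*axis=*/1) fills a
// dense 2x3x4 buffer with each element's index along dimension 1.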

src/graph/backend/dnnl/kernels/gen_index.cpp

+58-19
@@ -25,6 +25,11 @@
 #include "graph/backend/dnnl/passes/utils.hpp"

 #include "graph/backend/dnnl/op_executable.hpp"
+
+#define VCHECK_GENINDEX(cond, status, msg, ...) \
+    VCONDCHECK(graph, create, check, genindex_t, (cond), status, msg, \
+            ##__VA_ARGS__);
+
 namespace dnnl {
 namespace impl {
 namespace graph {
@@ -41,6 +46,16 @@ status_t genindex_t::compile_impl(const dnnl_partition_impl_t *part,
             part->get_fpmath_mode(), part->get_use_blocked_layout(), true);
     BACKEND_DNNL_CHECK(set_given_inputs_outputs(subgraph_, inputs, outputs));

+#if DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE
+    if (p_engine_.get_kind() == engine::kind::gpu) {
+        int ndims = inputs[0].ndims;
+        VCHECK_GENINDEX(ndims <= MAX_NDIMS, status::invalid_arguments,
+                "only tensors of 6 or fewer dimensions are supported for "
+                "genindex GPU, but got %dD",
+                ndims);
+    }
+#endif
+
     subgraph_visualizer_t vis(part->id(), [this](const value_t *val) {
         return this->memory_planner_.get_memory_info(val);
     });
@@ -84,7 +99,7 @@ status_t genindex_t::compile_impl(const dnnl_partition_impl_t *part,

 void genindex_t::prepare_args_set(const execution_args_set_t *res,
         const std::vector<tensor_t> &inputs,
-        const std::vector<tensor_t> &outputs, const scratchpad_t &scratchpad) {
+        const std::vector<tensor_t> &outputs) {
     // update the data of partition in/outputs args
     for (const auto &mem_idx : res->get_mems_use_external_inputs()) {
         mem_idx.first.set_data_handle(inputs[mem_idx.second].get_data_handle());
@@ -93,13 +108,6 @@ void genindex_t::prepare_args_set(const execution_args_set_t *res,
         mem_idx.first.set_data_handle(
                 outputs[mem_idx.second].get_data_handle());
     }
-
-    grantor_t var_grantor = memory_planner_.internal_temporary_grantor(
-            scratchpad.get_buffer());
-
-    for (auto &mem_offkey : res->get_mems_use_internal_temporary()) {
-        mem_offkey.first.set_data_handle(var_grantor.get(mem_offkey.second));
-    }
 }

 status_t genindex_t::execute_impl(const stream_t *g_stream,
@@ -111,14 +119,7 @@ status_t genindex_t::execute_impl(const stream_t *g_stream,
     thread_local_cache_t<execution_args_set_t> res_cache;
     execution_args_set_t *res = res_cache.get_or_add(
             reinterpret_cast<size_t>(this), resource_ctor_);
-
-    temporary_scratchpad_t scratchpad(
-            memory_planner_.total_internal_temporary_size(), p_engine_,
-            *g_alloc_);
-    assertm(scratchpad.size()
-                    >= memory_planner_.total_internal_temporary_size(),
-            "no enough scratchpad memory");
-    prepare_args_set(res, inputs, outputs, scratchpad);
+    prepare_args_set(res, inputs, outputs);

     constant_cache_t::cached_t c_buffer;

@@ -135,7 +136,26 @@ status_t genindex_t::sycl_execute_impl(const stream_t *g_stream,
         const std::vector<tensor_t> &outputs,
         const std::vector<::sycl::event> &sycl_deps,
         ::sycl::event *sycl_event) {
-    if (p_engine_.get_kind() == engine::kind::gpu) return status::unimplemented;
+    if (p_engine_.get_kind() == engine::kind::gpu) {
+        auto deps = sycl_deps;
+        ::sycl::event returned_event;
+        dnnl::stream p_stream = make_dnnl_stream(p_engine_, *g_stream);
+
+        thread_local_cache_t<execution_args_set_t> res_cache;
+        execution_args_set_t *res = res_cache.get_or_add(
+                reinterpret_cast<size_t>(this), resource_ctor_);
+        prepare_args_set(res, inputs, outputs);
+        for (size_t i = 0; i < subgraph_->execs_.size(); i++) {
+            if (subgraph_->is_constant_[i]) continue;
+            returned_event = subgraph_->execs_[i]->execute_sycl(
+                    p_stream, res->get_exec_args()[i], deps);
+            deps = {returned_event};
+        }
+
+        if (sycl_event) *sycl_event = returned_event;
+
+        return status::success;
+    }
     return execute_impl(g_stream, inputs, outputs);
 }
 #endif
@@ -144,8 +164,27 @@ status_t genindex_t::ocl_execute_impl(const stream_t *g_stream,
         const std::vector<tensor_t> &inputs,
         const std::vector<tensor_t> &outputs,
         const std::vector<cl_event> &ocl_deps, cl_event *ocl_event) {
-    // TODO: add support
-    return status::unimplemented;
+    auto deps = ocl_deps;
+    cl_event returned_event {};
+    dnnl::stream p_stream = make_dnnl_stream(p_engine_, *g_stream);
+
+    // each thread's own local resource
+    thread_local_cache_t<execution_args_set_t> res_cache;
+    execution_args_set_t *res = res_cache.get_or_add(
+            reinterpret_cast<size_t>(this), resource_ctor_);
+
+    prepare_args_set(res, inputs, outputs);
+
+    for (size_t i = 0; i < subgraph_->execs_.size(); i++) {
+        if (subgraph_->is_constant_[i]) continue;
+        returned_event = subgraph_->execs_[i]->execute_ocl(
+                p_stream, res->get_exec_args()[i], deps);
+        deps = {returned_event};
+    }
+
+    if (ocl_event) *ocl_event = returned_event;
+
+    return status::success;
 }
 #endif
 } // namespace dnnl_impl
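Both new GPU paths follow the same pattern: resolve this thread's cached argument set, then execute the subgraph ops in order, threading each op's output event into the next op's dependency list so the chain serializes correctly even on an out-of-order queue. A generic sketch of that chaining idiom (Exec, Event, and submit() are stand-ins for illustration, not dnnl types):

#include <vector>

// Each submit() enqueues one op and returns its completion event.
template <typename Exec, typename Event>
Event run_chain(const std::vector<Exec> &execs, std::vector<Event> deps) {
    Event last {};
    for (const auto &e : execs) {
        last = e.submit(deps); // this op waits on `deps`
        deps = {last};         // the next op waits only on this op
    }
    return last; // exposed to the caller as the partition's output event
}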

src/graph/backend/dnnl/kernels/gen_index.hpp

+1-2
@@ -57,8 +57,7 @@ struct genindex_t : public kernel_base_t {
     }
     void prepare_args_set(const execution_args_set_t *res,
             const std::vector<tensor_t> &inputs,
-            const std::vector<tensor_t> &outputs,
-            const scratchpad_t &scratchpad);
+            const std::vector<tensor_t> &outputs);
     status_t compile_impl(const dnnl_partition_impl_t *part,
             const engine_t *g_engine,
             const std::vector<logical_tensor_t> &inputs,

src/graph/backend/dnnl/op_executable.hpp

+117-14
@@ -46,6 +46,20 @@
 #include "graph/backend/dnnl/fusion_info.hpp"
 #include "graph/backend/dnnl/internal_attrs.hpp"

+#if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
+        && (DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL)
+
+#include "gpu/intel/compute/compute_engine.hpp"
+#include "gpu/intel/compute/compute_stream.hpp"
+#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
+#include "gpu/intel/ocl/stream.hpp"
+#endif
+
+#ifdef DNNL_WITH_SYCL
+#include "gpu/intel/sycl/stream.hpp"
+#endif
+
+#endif
 namespace dnnl {
 namespace impl {
 namespace graph {
@@ -2467,19 +2481,17 @@ struct groupnorm_executable_t : public op_executable_t {
     dnnl::group_normalization_forward prim_;
 };

+#if DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE
+using namespace dnnl::impl::gpu::intel;
+#define MAX_NDIMS 6
+#endif
 struct genindex_executable_t : public op_executable_t {
     DECLARE_ARG_INDICES_GETTER;

     genindex_executable_t(std::shared_ptr<op_t> &op,
             const dnnl::engine &p_engine, fusion_info_mgr_t &mgr,
             pd_cache_t &pd_cache) {
-        if (p_engine.get_kind() == engine::kind::gpu) {
-            assertm(false,
-                    "genindex opexcutable is unimplemented "
-                    "under SYCL and OCL "
-                    "runtime!");
-            throw std::runtime_error("Unimplement");
-        }
+
         using ltw = logical_tensor_wrapper_t;
         const auto &input_lt = op->get_input_value(0)->get_logical_tensor();
         nelems_ = ltw(input_lt).nelems();
@@ -2490,6 +2502,26 @@ struct genindex_executable_t : public op_executable_t {
             output_dims_[i] = output_lt.dims[i];
             output_strides_[i] = output_lt.layout.strides[i];
         }
+#if DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE \
+        && DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
+        if (p_engine.get_kind() == engine::kind::gpu) {
+            compute::kernel_ctx_t kernel_ctx;
+            kernel_ctx.define_int("NDIMS", ndims_);
+            for (int d = 0; d < MAX_NDIMS; ++d) {
+                dim_t dim = (d < ndims_) ? output_dims_[d] : 1;
+                dim_t stride = (d < ndims_) ? output_strides_[d] : 0;
+                kernel_ctx.define_int(dnnl::impl::utils::format("D%d", d), dim);
+                kernel_ctx.define_int(
+                        dnnl::impl::utils::format("S%d", d), stride);
+            }
+            auto *compute_engine
+                    = dnnl::impl::utils::downcast<compute::compute_engine_t *>(
+                            p_engine.get());
+            std::vector<compute::kernel_t> kernels(1);
+            compute_engine->create_kernels(&kernels, {"gen_index"}, kernel_ctx);
+            kernel_ = kernels[0];
+        }
+#endif
     }
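To make the build-time specialization concrete: for a hypothetical dense 2x3x4 s32 output (strides {12, 4, 1}), the loop above pads the unused axes with dim 1 and stride 0, so the kernel would be compiled with options equivalent to:

// Hypothetical example, not taken from the commit:
// -DNDIMS=3
// -DD0=2 -DS0=12 -DD1=3 -DS1=4 -DD2=4 -DS2=1
// -DD3=1 -DS3=0 -DD4=1 -DS4=0 -DD5=1 -DS5=0
//
// Padding with D=1 makes `id % D` always 0 and leaves `id / D` unchanged,
// and S=0 keeps the unused axes from contributing to the destination offset.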
@@ -2498,26 +2530,97 @@ struct genindex_executable_t : public op_executable_t {
 #ifdef DNNL_WITH_SYCL
     ::sycl::event execute_sycl(const stream &stream,
             const std::unordered_map<int, memory> &args,
-            const std::vector<::sycl::event> &deps) const override {
-        execute(stream, args);
-        return {};
+            const std::vector<::sycl::event> &deps = {}) const override {
+#if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
+        && (DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL)
+        auto compute_stream
+                = dnnl::impl::utils::downcast<compute::compute_stream_t *>(
+                        stream.get());
+        compute::range_t gws = {static_cast<size_t>(nelems_)};
+        auto nd_range = compute::nd_range_t(gws);
+        compute::kernel_arg_list_t arg_list;
+        const auto &dst = *(args.at(DNNL_ARG_DST).get()->memory_storage());
+        arg_list.set(0, dst);
+        arg_list.set(1, axis_);
+        auto *sycl_stream
+                = dnnl::impl::utils::downcast<sycl::stream_t *>(compute_stream);
+        sycl_stream->before_exec_hook();
+        if (!deps.empty()) sycl_stream->sycl_ctx().set_deps(deps);
+
+        kernel_.parallel_for(*compute_stream, nd_range, arg_list,
+                sycl_stream->sycl_ctx().get_deps(),
+                sycl_stream->sycl_ctx().get_deps());
+        auto return_event = sycl_stream->get_output_event();
+
+        sycl_stream->after_exec_hook();
+        return return_event;
+#else
+        assertm(false,
+                "genindex op executable is only implemented for the Intel "
+                "vendor under the SYCL runtime");
+        throw std::runtime_error("Unimplemented");
+#endif
     }
 #endif

 #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
     cl_event execute_ocl(const stream &stream,
             const std::unordered_map<int, memory> &args,
-            const std::vector<cl_event> &deps) const override {
+            const std::vector<cl_event> &deps = {}) const override {
+#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
+        auto compute_stream
+                = dnnl::impl::utils::downcast<compute::compute_stream_t *>(
+                        stream.get());
+
+        compute::range_t gws = {static_cast<size_t>(nelems_)};
+
+        auto nd_range = compute::nd_range_t(gws);
+        compute::kernel_arg_list_t arg_list;
+        const auto &dst = *(args.at(DNNL_ARG_DST).get()->memory_storage());
+        arg_list.set(0, dst);
+        arg_list.set(1, axis_);
+        auto *ocl_stream
+                = dnnl::impl::utils::downcast<gpu::intel::ocl::stream_t *>(
+                        compute_stream);
+
+        ocl_stream->before_exec_hook();
+
+        if (!deps.empty()) {
+            std::vector<xpu::ocl::wrapper_t<cl_event>> events(deps.size());
+            for (size_t i = 0; i < deps.size(); i++)
+                events[i] = xpu::ocl::wrapper_t<cl_event>(deps[i], true);
+            ocl_stream->ocl_ctx().set_deps(events);
+        }
+
+        kernel_.parallel_for(*compute_stream, nd_range, arg_list,
+                compute_stream->ctx().get_deps(),
+                compute_stream->ctx().get_deps());
+
+        cl_event return_event = nullptr;
+        if ((ocl_stream->flags() & stream_flags::in_order) == 0) {
+            auto last = ocl_stream->get_output_event();
+            return_event = last.release();
+        }
+
+        ocl_stream->after_exec_hook();
+        return return_event;
+#else
         assertm(false,
-                "genindex op excutable is unimplemented "
-                "under OCL runtime!");
-        return {};
+                "genindex op executable is only implemented for the Intel "
+                "vendor under the OCL runtime");
+        throw std::runtime_error("Unimplemented");
+#endif
     }
 #endif

 private:
     int axis_, nelems_, ndims_;
     dims_t output_dims_, output_strides_;
+
+#if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
+        && (DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL)
+    compute::kernel_t kernel_;
+#endif
 };

 } // namespace dnnl_impl
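For readers more familiar with raw OpenCL than with oneDNN's internal compute abstraction: the execute_ocl() body above amounts to a one-dimensional launch with one work-item per destination element. A rough plain-OpenCL equivalent, assuming `kernel`, `queue`, `dst_buf`, `axis`, and `nelems` already exist (sketch only, error handling omitted):

size_t gws = (size_t)nelems; // one work-item per tensor element
clSetKernelArg(kernel, 0, sizeof(cl_mem), &dst_buf);
clSetKernelArg(kernel, 1, sizeof(int), &axis);
cl_event ev;
clEnqueueNDRangeKernel(queue, kernel, /*work_dim=*/1, /*global_offset=*/NULL,
        &gws, /*local_size=*/NULL, 0, NULL, &ev);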

src/graph/backend/dnnl/patterns/single_op_pattern.cpp

-2
@@ -425,10 +425,8 @@ DNNL_BACKEND_REGISTER_PATTERN_MATCHER_PASS(dnnl, reduce_pass)
             return std::make_shared<float_reduction>();
         });

-// GenIndex currently is CPU only
 DNNL_BACKEND_REGISTER_PATTERN_MATCHER_PASS(dnnl, gen_index_pass)
         .set_priority(DEFAULT_P)
-        .set_engine_kind(engine_kind::cpu)
         .set_kind(partition_kind_t::misc_post_ops)
         .set_attr<FCreatePattern>("FCreatePattern",
                 [](const std::shared_ptr<pb_graph_t> &pgraph) -> void {
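With the engine-kind restriction dropped, GenIndex single-op partitions can now be offered on GPU engines as well. A hedged sketch of how a user might exercise this through the public graph API (ids, shapes, and the axis value are arbitrary examples; exact API details may vary by oneDNN version):

#include "oneapi/dnnl/dnnl_graph.hpp"

using namespace dnnl::graph;

// Build a one-op graph containing GenIndex on a GPU engine.
graph g(dnnl::engine::kind::gpu);
logical_tensor src {0, logical_tensor::data_type::f32, {2, 3, 4},
        logical_tensor::layout_type::strided};
logical_tensor dst {1, logical_tensor::data_type::s32, {2, 3, 4},
        logical_tensor::layout_type::strided};
op genindex(0, op::kind::GenIndex, {src}, {dst}, "genindex");
genindex.set_attr<int64_t>(op::attr::axis, 1); // emit indices along dim 1
g.add_op(genindex);
g.finalize();
auto parts = g.get_partitions(); // should now yield a supported GPU partition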
