generic:sycl: Inner Product FWD #2248

Merged · 2 commits · Dec 19, 2024
8 changes: 8 additions & 0 deletions src/gpu/generic/sycl/README.md
@@ -94,6 +94,14 @@ The implementation supports both forward and backward directions.
* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`, `N`
* Supported data types: `f32`, `bf16`, `f16`, `s32`, `s8`, `u8`

## Inner Product

The implementation supports the forward direction only.

* Supported formats: all plain memory formats
* Supported data types: all data type combinations listed in the oneDNN specification
* Supported post-ops: all post-ops listed in the oneDNN specification

## Layer Normalization

The implementation supports both forward and backward directions.
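For context, here is a minimal sketch of reaching the new implementation through the public C++ API (not part of the diff). The engine index, shapes, and `f32` data type are illustrative assumptions:

```cpp
#include "oneapi/dnnl/dnnl.hpp"
using namespace dnnl;

int main() {
    engine eng(engine::kind::gpu, 0); // assumes a SYCL GPU device is present
    stream strm(eng);

    // Plain, row-major 2D layouts; the implementation accepts any plain format.
    memory::desc src_md({32, 1024}, memory::data_type::f32, memory::format_tag::ab);
    memory::desc wei_md({512, 1024}, memory::data_type::f32, memory::format_tag::ab);
    memory::desc dst_md({32, 512}, memory::data_type::f32, memory::format_tag::ab);

    auto pd = inner_product_forward::primitive_desc(
            eng, prop_kind::forward_inference, src_md, wei_md, dst_md);

    memory src(src_md, eng), wei(wei_md, eng), dst(dst_md, eng);
    inner_product_forward(pd).execute(strm,
            {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, wei}, {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}
```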
55 changes: 55 additions & 0 deletions src/gpu/generic/sycl/ref_inner_product.cpp
@@ -0,0 +1,55 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
* Copyright 2024 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/generic/sycl/ref_inner_product.hpp"
#include "common/primitive_desc_iterator.hpp"

namespace dnnl::impl::gpu::generic::sycl {

status_t ref_inner_product_fwd_t::pd_t::init_matmul(impl::engine_t *engine) {
matmul_desc_t matmul_desc;
CHECK(matmul_desc_init(&matmul_desc, &src_md_reshaped, &weights_md_reshaped,
&bias_md_reshaped, arg_md(DNNL_ARG_DST)));
primitive_attr_t matmul_attr(*attr());

primitive_desc_iterator_t it(engine,
reinterpret_cast<op_desc_t *>(&matmul_desc), &matmul_attr, nullptr);
if (!it.is_initialized()) return status::out_of_memory;
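    // Walk the registered matmul implementations and keep the first one that
    // successfully creates a pd for the reshaped descriptors.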
while (++it != it.end()) {
matmul_pd = *it;
if (matmul_pd) { break; }
}
if (!matmul_pd) { return status::invalid_arguments; }
return status::success;
}

status_t ref_inner_product_fwd_t::init(impl::engine_t *engine) {
std::pair<std::shared_ptr<impl::primitive_t>, cache_state_t> p;
CHECK(pd()->matmul_pd->create_primitive_nested(p, engine));
matmul_primitive = p.first;
return status::success;
}

status_t ref_inner_product_fwd_t::execute(const exec_ctx_t &ctx) const {
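    // Forward everything to the nested matmul, handing it the scratchpad
    // space that was booked for it in pd_t::init().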
nested_scratchpad_t nested_scratchpad(
ctx, memory_tracking::names::key_nested, matmul_primitive);
exec_ctx_t copied_ctx(ctx);
copied_ctx.set_scratchpad_grantor(nested_scratchpad.grantor());
return matmul_primitive->execute(copied_ctx);
}

} // namespace dnnl::impl::gpu::generic::sycl
175 changes: 175 additions & 0 deletions src/gpu/generic/sycl/ref_inner_product.hpp
@@ -0,0 +1,175 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
* Copyright 2024 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_GENERIC_SYCL_REF_INNER_PRODUCT_HPP
#define GPU_GENERIC_SYCL_REF_INNER_PRODUCT_HPP

#include "gpu/generic/sycl/ref_matmul.hpp"
#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
#include "gpu/generic/sycl/sycl_post_ops.hpp"
#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
#include "gpu/generic/sycl/sycl_utils.hpp"
#include "gpu/gpu_inner_product_pd.hpp"
#include "gpu/gpu_primitive.hpp"

namespace dnnl::impl::gpu::generic::sycl {
struct ref_inner_product_fwd_t : public gpu::generic::sycl::primitive_t {
using gpu::generic::sycl::primitive_t::primitive_t;

struct pd_t : public gpu_inner_product_fwd_pd_t {
using gpu_inner_product_fwd_pd_t::gpu_inner_product_fwd_pd_t;
using sm = primitive_attr_t::skip_mask_t;

DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_inner_product_fwd_t);

status_t init(impl::engine_t *engine) {
auto src_dt = arg_md(DNNL_ARG_SRC)->data_type;
auto weights_dt = arg_md(DNNL_ARG_WEIGHTS)->data_type;
auto dst_dt = arg_md(DNNL_ARG_DST)->data_type;
auto bias_dt = with_bias() ? arg_md(DNNL_ARG_BIAS)->data_type
: data_type::undef;

const bool ok = (set_default_params() == status::success)
&& is_fwd()
&& check_if_dtypes_valid(
src_dt, dst_dt, bias_dt, weights_dt)
&& sycl_post_ops_t::post_ops_ok(attr())
&& (attr_.set_default_formats(dst_md()) == status::success)
// Blocked memory formats are not supported
&& memory_desc_wrapper(src_md()).is_plain()
&& memory_desc_wrapper(dst_md()).is_plain()
&& memory_desc_wrapper(weights_md()).is_plain();

if (!ok) { return status::unimplemented; }
CHECK(create_ip_mds());
CHECK(init_matmul(engine));

// book scratchpad for the matmul
auto scratchpad = scratchpad_registry().registrar();
scratchpad.book(memory_tracking::names::key_nested,
matmul_pd->scratchpad_registry());
return status::success;
}

std::shared_ptr<primitive_desc_t> matmul_pd;

private:
bool check_if_dtypes_valid(const data_type_t &src_dt,
const data_type_t &dst_dt, const data_type_t &bias_dt,
const data_type_t &weight_dt) const {
using namespace data_type;
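            // Mirrors the inner product spec: f32, f16, int8 (u8/s8 src with
            // s8 weights), and bf16 configurations are accepted.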
return (utils::one_of(src_dt, f32) && utils::one_of(weight_dt, f32)
&& utils::one_of(dst_dt, f32)
&& utils::one_of(bias_dt, f32, undef))
|| (utils::one_of(src_dt, f16)
&& utils::one_of(weight_dt, f16)
&& utils::one_of(dst_dt, f16, f32, s8, u8)
&& utils::one_of(bias_dt, f16, f32, undef))
|| (utils::one_of(src_dt, u8, s8)
&& utils::one_of(weight_dt, s8)
&& utils::one_of(dst_dt, u8, s8, s32, bf16, f32)
&& utils::one_of(
bias_dt, u8, s8, s32, bf16, f32, undef))
|| (utils::one_of(src_dt, bf16)
&& utils::one_of(weight_dt, bf16)
&& utils::one_of(dst_dt, f32, bf16)
&& utils::one_of(bias_dt, f32, bf16, undef));
}

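        // Returns dimension indices sorted by increasing stride, i.e. the
        // axis order of a plain layout from innermost to outermost.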
std::vector<int> get_dim_order(int ndims, const dims_t strides) {
std::vector<int> order(ndims);
for (int i = 0; i < ndims; ++i) {
order[i] = i;
}

std::sort(
order.begin(), order.end(), [&strides](size_t i, size_t j) {
return strides[i] < strides[j];
});

return order;
}

status_t create_ip_mds() {
auto accumulate_dimensions = [](const dims_t dimensions, int start,
int end) -> int64_t {
int64_t accum = 1;
for (int i = start; i < end; i++) {
accum *= dimensions[i];
}
return accum;
};

const auto src_md_ = arg_md(DNNL_ARG_SRC);
const auto weights_md_ = arg_md(DNNL_ARG_WEIGHTS);
const auto bias_md_ = arg_md(DNNL_ARG_BIAS);
auto src_wrap = memory_desc_wrapper(src_md_);
auto w_wrap = memory_desc_wrapper(weights_md_);

// src and weights dims need to be in the same order
if (get_dim_order(src_wrap.ndims(), src_wrap.strides())
!= get_dim_order(w_wrap.ndims(), w_wrap.strides())) {
return status::unimplemented;
}

            // Reshape src from (MB, d1, ..., dk) to 2D (MB, d1 * ... * dk)
if (src_md_->ndims == 2) {
src_md_reshaped = *src_md_;
} else {
int64_t src_flattened_dimension = accumulate_dimensions(
src_md_->dims, 1, src_md_->ndims);
dims_t src_reshaped_dims {
src_md_->dims[0], src_flattened_dimension};
CHECK(memory_desc_init_by_tag(src_md_reshaped, 2,
src_reshaped_dims, src_md_->data_type, format_tag::ab));
}

            // Reshape weights from (OC, d1, ..., dk) to the transposed 2D
            // shape (d1 * ... * dk, OC); the `ba` tag below expresses the
            // transpose through strides, without moving data.
            int64_t weights_flattened_dimensions = accumulate_dimensions(
                    weights_md_->dims, 1, weights_md_->ndims);
dims_t weights_reshaped_dims {
weights_flattened_dimensions, weights_md_->dims[0]};
CHECK(memory_desc_init_by_tag(weights_md_reshaped, 2,
weights_reshaped_dims, weights_md_->data_type,
format_tag::ba));
if (with_bias()) {
dims_t bias_reshaped_dims {1, bias_md_->dims[0]};
CHECK(memory_desc_init_by_tag(bias_md_reshaped, 2,
bias_reshaped_dims, bias_md_->data_type,
format_tag::ab));
}
return status::success;
}

status_t init_matmul(impl::engine_t *engine);
// Memory descriptors to contain reshaped tensors from nD to 2D for IP
memory_desc_t src_md_reshaped;
memory_desc_t weights_md_reshaped;
memory_desc_t bias_md_reshaped;
};

status_t init(impl::engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
kernel_t kernel_;
std::shared_ptr<impl::primitive_t> matmul_primitive;
};
} // namespace dnnl::impl::gpu::generic::sycl

#endif
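The descriptor reshaping in `create_ip_mds` is plain shape algebra: src `(MB, d1, ..., dk)` collapses to `MB x (d1 * ... * dk)`, weights `(OC, d1, ..., dk)` become the transposed `(d1 * ... * dk) x OC`, and the matmul then yields `MB x OC`. A self-contained sketch of that computation; the helper names are illustrative, not oneDNN API:

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

struct Shape2D {
    std::int64_t rows, cols;
};

// Product of every dim past the first: d1 * ... * dk.
static std::int64_t flatten_tail(const std::vector<std::int64_t> &dims) {
    return std::accumulate(dims.begin() + 1, dims.end(), std::int64_t {1},
            std::multiplies<std::int64_t>());
}

// src (MB, d1, ..., dk) -> (MB, d1 * ... * dk), row-major ("ab").
static Shape2D reshape_src(const std::vector<std::int64_t> &dims) {
    return {dims[0], flatten_tail(dims)};
}

// weights (OC, d1, ..., dk) -> (d1 * ... * dk, OC); "ba" expresses the
// transpose through strides, so no data actually moves.
static Shape2D reshape_weights(const std::vector<std::int64_t> &dims) {
    return {flatten_tail(dims), dims[0]};
}

int main() {
    Shape2D a = reshape_src({32, 3, 28, 28}); // (32, 2352)
    Shape2D b = reshape_weights({512, 3, 28, 28}); // (2352, 512)
    // matmul: (32 x 2352) * (2352 x 512) -> dst (32 x 512)
    std::printf("(%lld x %lld) * (%lld x %lld)\n", (long long)a.rows,
            (long long)a.cols, (long long)b.rows, (long long)b.cols);
    return 0;
}
```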
5 changes: 5 additions & 0 deletions src/gpu/gpu_inner_product_list.cpp
@@ -32,6 +32,10 @@
#include "gpu/amd/miopen_gemm_inner_product.hpp"
#endif

#ifdef GENERIC_SYCL_KERNELS_ENABLED
#include "gpu/generic/sycl/ref_inner_product.hpp"
#endif

namespace dnnl {
namespace impl {
namespace gpu {
@@ -49,6 +53,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_fwd_t)
GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_fwd_t)
GPU_INSTANCE_AMD(amd::miopen_gemm_inner_product_fwd_t)
GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_inner_product_fwd_t)
nullptr,
}},
{{backward}, REG_BWD_PK({
8 changes: 4 additions & 4 deletions src/gpu/nvidia/cudnn_matmul_executor.hpp
@@ -392,12 +392,12 @@ struct cudnn_matmul_lt_exec_t final : public cudnn_matmul_lt_base_exec_t {
memory_tracking::names::key_matmul_dst_in_acc_dt)
: xpu::sycl::interop_memory_arg_t<
::sycl::access::mode::read_write>();
auto arg_block_a_scratch = params->source_size_ != 0
auto arg_block_a_scratch = params->weight_size_ != 0
? CTX_SCRATCH_SYCL_MEMORY(
memory_tracking::names::key_gemm_blocked_a)
: xpu::sycl::interop_memory_arg_t<
::sycl::access::mode::read_write>();
auto arg_block_b_scratch = params->weight_size_ != 0
auto arg_block_b_scratch = params->source_size_ != 0
? CTX_SCRATCH_SYCL_MEMORY(
memory_tracking::names::key_gemm_blocked_b)
: xpu::sycl::interop_memory_arg_t<
@@ -457,10 +457,10 @@ struct cudnn_matmul_lt_runtime_args_exec_t final
matmul_params->reorder_scratch_size_, cuda_stream->queue());

uint8_t *block_a_scratch_ptr
= alloc_ptr(matmul_params->source_size_, cuda_stream->queue());
= alloc_ptr(matmul_params->weight_size_, cuda_stream->queue());

uint8_t *block_b_scratch_ptr
= alloc_ptr(matmul_params->weight_size_, cuda_stream->queue());
= alloc_ptr(matmul_params->source_size_, cuda_stream->queue());

uint8_t *block_c_scratch_ptr
= alloc_ptr(matmul_params->dest_size_, cuda_stream->queue());
2 changes: 1 addition & 1 deletion src/gpu/nvidia/cudnn_matmul_lt_impl.hpp
@@ -717,7 +717,7 @@ struct cudnn_matmul_lt_impl_t {
}
if (!params->w_blocked_) {
transform_matrix(lt_handle, params, a_layout, a,
blocked_a_layout, block_a_scratch, !params->trans_a_,
blocked_a_layout, block_a_scratch, params->trans_a_,
streamId);
a = block_a_scratch;
}
38 changes: 31 additions & 7 deletions tests/gtests/test_inner_product_forward.cpp
@@ -88,16 +88,18 @@ class inner_product_test_t
protected:
void SetUp() override {
auto p = ::testing::TestWithParam<inprod_test_params_t>::GetParam();
SKIP_IF_CUDA(!cuda_check_format_tags(p.src_format, p.weights_format,
p.bias_format, p.dst_format),
SKIP_IF_CUDA(!cuda_generic_check_format_tags(p.src_format,
p.weights_format, p.bias_format, p.dst_format),
"Unsupported format tag");
SKIP_IF_GENERIC(!cuda_generic_check_format_tags(p.src_format,
p.weights_format, p.bias_format, p.dst_format),
"Unsupported format tag");
SKIP_IF_CUDA(p.ndims > 5, "Unsupported number of dimensions");
SKIP_IF_GENERIC(true, "Primitive not implemented");
catch_expected_failures(
[&]() { Test(); }, p.expect_to_fail, p.expected_status);
}

bool cuda_check_format_tags(memory::format_tag src_format,
bool cuda_generic_check_format_tags(memory::format_tag src_format,
memory::format_tag wei_format, memory::format_tag bia_format,
memory::format_tag dst_format) {
bool src_ok = src_format == memory::format_tag::ncdhw
@@ -130,6 +132,20 @@ class inner_product_test_t
return src_ok && wei_ok && bia_ok && dst_ok;
}

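    // Same stride-order helper as the SYCL generic pd: used below to skip
    // cases where src and weights use different plain layout orders.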
std::vector<int> get_dim_order(const memory::dims &strides) {
size_t ndims = strides.size();
std::vector<int> order(ndims);
for (size_t i = 0; i < ndims; ++i) {
order[i] = i;
}

std::sort(order.begin(), order.end(), [&strides](size_t i, size_t j) {
return strides[i] < strides[j];
});

return order;
}

void Test() {
auto p = ::testing::TestWithParam<inprod_test_params_t>::GetParam();
test_inner_product_descr_t ipd = p.test_ipd;
@@ -169,18 +185,26 @@ class inner_product_test_t
: create_md({}, data_type, p.bias_format);
auto ip_dst_desc = create_md({ipd.mb, ipd.oc}, data_type, p.dst_format);

SKIP_IF_GENERIC(get_dim_order(ip_src_desc.get_strides())
!= get_dim_order(ip_weights_desc.get_strides()),
"Unsupported case for generic");

auto ip_primitive_desc = with_bias
? pd_t(eng, p.aprop_kind, ip_src_desc, ip_weights_desc,
ip_bias_desc, ip_dst_desc)
: pd_t(eng, p.aprop_kind, ip_src_desc, ip_weights_desc,
ip_dst_desc);

auto aa = allows_attr_t {false};
aa.po_binary = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
aa.po_eltwise = true;
aa.po_prelu = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
aa.po_sum = true;

#ifdef DNNL_SYCL_GENERIC
aa.po_binary = true;
aa.po_prelu = true;
#else
aa.po_binary = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
aa.po_prelu = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
#endif
test_fwd_pd_constructors<pd_t>(ip_primitive_desc, aa, p.aprop_kind,
ip_src_desc, ip_weights_desc, ip_bias_desc, ip_dst_desc);
