generic: sycl: matmul: implemented #2054

Merged: 1 commit, Sep 3, 2024
731 changes: 731 additions & 0 deletions src/gpu/generic/sycl/matmul_kernels.hpp

Large diffs are not rendered by default.
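The file above defines matmul_kernel_fwd_t, which the reference implementation below launches. Since the diff is not rendered, here is a minimal host-side sketch of the register-blocking idea implied by the register_block_M / register_block_N constants referenced in ref_matmul.cpp; the names, tile sizes, and structure are illustrative assumptions, not the actual kernel.

#include <vector>

constexpr int register_block_M = 4; // assumed tile sizes, for illustration only
constexpr int register_block_N = 4;

// Each "work item" accumulates a register_block_M x register_block_N tile of
// C = A * B in a small local array before writing it back. The real kernel
// additionally handles broadcasting, scales, zero-points, dropout and post-ops.
void matmul_tile_sketch(const std::vector<float> &A, const std::vector<float> &B,
        std::vector<float> &C, int M, int N, int K, int tile_m, int tile_n) {
    float acc[register_block_M][register_block_N] = {};
    for (int k = 0; k < K; ++k)
        for (int m = 0; m < register_block_M; ++m)
            for (int n = 0; n < register_block_N; ++n) {
                const int row = tile_m * register_block_M + m;
                const int col = tile_n * register_block_N + n;
                if (row < M && col < N)
                    acc[m][n] += A[row * K + k] * B[k * N + col];
            }
    for (int m = 0; m < register_block_M; ++m)
        for (int n = 0; n < register_block_N; ++n) {
            const int row = tile_m * register_block_M + m;
            const int col = tile_n * register_block_N + n;
            if (row < M && col < N) C[row * N + col] = acc[m][n];
        }
}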

162 changes: 162 additions & 0 deletions src/gpu/generic/sycl/ref_matmul.cpp
@@ -0,0 +1,162 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/generic/sycl/ref_matmul.hpp"
#include "gpu/generic/sycl/matmul_kernels.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

status_t ref_matmul_t::pd_t::init_conf() {
conf_ = sycl_matmul_conf_t();

int matmul_dim_1 = ndims() - 2;
int matmul_dim_2 = ndims() - 1;

memory_desc_t data_md_copy = *src_md();
auto &data_strides = data_md_copy.format_desc.blocking.strides;
if (data_strides[matmul_dim_1] < data_strides[matmul_dim_2]) {
std::swap(data_strides[matmul_dim_1], data_strides[matmul_dim_2]);
std::swap(data_md_copy.dims[matmul_dim_1],
data_md_copy.dims[matmul_dim_2]);
conf_.transpose_data = true;
}
conf_.data_md = xpu::sycl::md_t(&data_md_copy);

memory_desc_t weights_md_copy = *weights_md();
auto &weights_strides = weights_md_copy.format_desc.blocking.strides;
if (weights_strides[matmul_dim_1] < weights_strides[matmul_dim_2]) {
std::swap(weights_strides[matmul_dim_1], weights_strides[matmul_dim_2]);
std::swap(weights_md_copy.dims[matmul_dim_1],
weights_md_copy.dims[matmul_dim_2]);
conf_.transpose_weights = true;
}
conf_.weights_md = xpu::sycl::md_t(&weights_md_copy);

memory_desc_t dst_md_copy = *dst_md();
auto &dst_strides = dst_md_copy.format_desc.blocking.strides;
if (dst_strides[matmul_dim_1] < dst_strides[matmul_dim_2]) {
std::swap(dst_strides[matmul_dim_1], dst_strides[matmul_dim_2]);
std::swap(
dst_md_copy.dims[matmul_dim_1], dst_md_copy.dims[matmul_dim_2]);
conf_.transpose_dst = true;
}
conf_.dst_md = xpu::sycl::md_t(&dst_md_copy);

if (with_bias()) {
memory_desc_t bias_md_copy = *weights_md(1);
auto &bias_strides = bias_md_copy.format_desc.blocking.strides;
if (bias_strides[matmul_dim_1] < bias_strides[matmul_dim_2]) {
std::swap(bias_strides[matmul_dim_1], bias_strides[matmul_dim_2]);
std::swap(bias_md_copy.dims[matmul_dim_1],
bias_md_copy.dims[matmul_dim_2]);
conf_.transpose_bias = true;
}
conf_.bias_md = xpu::sycl::md_t(&bias_md_copy);
}

dims_t dst_blocks;
for (int i = 0; i < matmul_kernel_fwd_t::max_supported_ndims; i++) {
if (i < conf_.dst_md.ndims()) {
dst_blocks[i] = conf_.dst_md.dims()[i];
} else {
dst_blocks[i] = 1;
}
}
dst_blocks[matmul_dim_1] = math::div_up(
dst_blocks[matmul_dim_1], matmul_kernel_fwd_t::register_block_N);
dst_blocks[matmul_dim_2] = math::div_up(
dst_blocks[matmul_dim_2], matmul_kernel_fwd_t::register_block_M);
int n_blocks = 1;
for (int i = 0; i < matmul_kernel_fwd_t::max_supported_ndims; i++) {
n_blocks *= dst_blocks[i];
}
conf_.wk_size = n_blocks;

int high_two_bits = 3 << (ndims() - 2);
// last two dimensions of data and weights are never broadcast
conf_.data_mask
= utils::get_dims_mask(dst_md()->dims, src_md()->dims, ndims())
| high_two_bits;
conf_.weights_mask
= utils::get_dims_mask(dst_md()->dims, weights_md(0)->dims, ndims())
| high_two_bits;
conf_.bias_mask = utils::get_dims_mask(
dst_md()->dims, weights_md(1)->dims, ndims());

conf_.do_scale_data
= !attr()->scales_.get(DNNL_ARG_SRC_0).has_default_values();
conf_.do_scale_weights
= !attr()->scales_.get(DNNL_ARG_WEIGHTS).has_default_values();
conf_.do_scale_dst
= !attr()->scales_.get(DNNL_ARG_DST).has_default_values();
conf_.single_weights_scale
= attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ == 0;

conf_.use_data_zeropoints
= !attr()->zero_points_.has_default_values(DNNL_ARG_SRC_0);
conf_.use_weights_zeropoints
= !attr()->zero_points_.has_default_values(DNNL_ARG_WEIGHTS_0);
conf_.use_dst_zeropoints
= !attr()->zero_points_.has_default_values(DNNL_ARG_DST);

conf_.use_dropout = !attr()->dropout_.has_default_values();

conf_.post_ops = sycl_post_ops_t(attr());

for (auto i = 0; i < conf_.post_ops.get_post_op(); ++i) {
const auto &e = attr()->post_ops_.entry_[i];
if (e.is_binary() || e.is_prelu()) {
conf_.binary_src_arr[i] = xpu::sycl::md_t(
arg_md(DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1));
}
}
return status::success;
}

status_t ref_matmul_t::init(impl::engine_t *engine) {
const auto kid = ::sycl::get_kernel_id<matmul_kernel_fwd_t>();
CHECK(create_kernel(engine, kid, &kernel_));
return status::success;
}

status_t ref_matmul_t::execute(const exec_ctx_t &ctx) const {

parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
matmul_kernel_fwd_t matmul_kernel(pd()->conf_, cgh, ctx);

const int block_size = 32;
const int wg_size = 32;

const int t_work = pd()->conf_.wk_size;
const int wg_work = wg_size * block_size;
const int wg_cnt = utils::div_up(t_work, wg_work);

cgh.parallel_for(
::sycl::nd_range<1>(wg_cnt * wg_size, wg_size), matmul_kernel);
});

return status::success;
}

} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl
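A note on the mask logic in init_conf(): conf_.data_mask and conf_.weights_mask record, per dimension, whether the tensor broadcasts into dst, and high_two_bits forces the two innermost dimensions on because the M/N/K dimensions of a matmul never broadcast. A standalone sketch of that bit arithmetic follows; dims_mask_sketch is a hypothetical stand-in for utils::get_dims_mask, assuming it sets bit i when the two sizes agree in dimension i.

#include <cstdio>

// Bit i is set when the tensor matches dst in dimension i (no broadcast there).
int dims_mask_sketch(const int *dst_dims, const int *t_dims, int ndims) {
    int mask = 0;
    for (int i = 0; i < ndims; ++i)
        if (dst_dims[i] == t_dims[i]) mask |= 1 << i;
    return mask;
}

int main() {
    // dst = 8 x 16 x 32, src = 1 x 16 x 32: the batch dimension broadcasts.
    int dst[] = {8, 16, 32}, src[] = {1, 16, 32};
    const int ndims = 3;
    const int high_two_bits = 3 << (ndims - 2); // always mark the last two dims
    const int data_mask = dims_mask_sketch(dst, src, ndims) | high_two_bits;
    std::printf("data_mask = %d\n", data_mask); // 6 == 0b110: bit 0 (batch) off
    return 0;
}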
158 changes: 158 additions & 0 deletions src/gpu/generic/sycl/ref_matmul.hpp
@@ -0,0 +1,158 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_GENERIC_SYCL_REF_MATMUL_HPP
#define GPU_GENERIC_SYCL_REF_MATMUL_HPP

#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
#include "gpu/generic/sycl/sycl_io_helper.hpp"
#include "gpu/generic/sycl/sycl_post_ops.hpp"
#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
#include "gpu/generic/sycl/sycl_q10n.hpp"
#include "gpu/generic/sycl/sycl_utils.hpp"
#include "gpu/gpu_matmul_pd.hpp"
#include "xpu/sycl/types.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

struct ref_matmul_t : public gpu::generic::sycl::primitive_t {
using gpu::generic::sycl::primitive_t::primitive_t;

struct pd_t : public gpu_matmul_pd_t {
using gpu_matmul_pd_t::gpu_matmul_pd_t;

DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_matmul_t);

status_t init(impl::engine_t *engine) {
using namespace data_type;
using sm = primitive_attr_t::skip_mask_t;

const memory_desc_wrapper src_d(src_md());
const memory_desc_wrapper weights_d(weights_md(0));
const memory_desc_wrapper bias_d(weights_md(1));
const memory_desc_wrapper dst_d(dst_md());

const bool ok = set_default_params() == status::success
&& attr_.set_default_formats(dst_md()) == status::success
&& check_data_types(src_d, weights_d, dst_d)
&& check_formats(src_d, weights_d, dst_d)
&& attr()->has_default_values(sm::scales_runtime
| sm::zero_points_runtime | sm::post_ops
| sm::dropout | sm::scales_runtime_data_type
| sm::zero_points_runtime_data_type)
&& IMPLICATION(!attr()->scales_.has_default_values(),
check_scales_mask())
&& post_ops_ok() && md_dims_in_range(src_md())
&& md_dims_in_range(weights_md());
if (!ok) return status::unimplemented;

return init_conf();
}

sycl_matmul_conf_t conf_;

private:
status_t init_conf();

status_t set_default_params() {
if (src_md_.format_kind == format_kind::any) {
auto src_tag = utils::pick(ndims() - 2, format_tag::ab,
format_tag::abc, format_tag::abcd);
CHECK(memory_desc_init_by_tag(src_md_, src_tag));
}
const memory_desc_wrapper src_d(src_md());
if (src_d.is_blocking_desc()) {
if (weights_md_.format_kind == format_kind::any) {
CHECK(memory_desc_init_by_blocking_desc(
weights_md_, src_d.blocking_desc()));
}
if (dst_md_.format_kind == format_kind::any) {
CHECK(memory_desc_init_by_blocking_desc(
dst_md_, src_d.blocking_desc()));
}
}
const memory_desc_wrapper dst_d(dst_md());
if (dst_d.is_blocking_desc()) {
if (bias_md_.format_kind == format_kind::any) {
CHECK(memory_desc_init_by_blocking_desc(
bias_md_, dst_d.blocking_desc()));
}
}
return status::success;
}

bool check_scales_mask() const {
const std::vector<int> supported_args
= {DNNL_ARG_SRC_0, DNNL_ARG_WEIGHTS_0, DNNL_ARG_DST};
return attr_scales_ok(supported_args);
}

bool post_ops_ok() const {
// Dw conv post-ops are not supported.
return attr()->post_ops_.len() <= sycl_post_ops_t::max_post_ops
&& attr()->post_ops_.has_default_values(
{primitive_kind::eltwise, primitive_kind::binary,
primitive_kind::sum});
}

static bool check_data_types(const memory_desc_wrapper &src,
const memory_desc_wrapper &weights,
const memory_desc_wrapper &dst) {
using namespace data_type;

const auto src_dt = src.data_type();
const auto weights_dt = weights.data_type();
const auto dst_dt = dst.data_type();

for (auto t : {src_dt, weights_dt, dst_dt}) {
if (!utils::one_of(t, f32, bf16, f16, s8, u8, s32))
return false;
}

return true;
}

static bool check_formats(const memory_desc_wrapper &src,
const memory_desc_wrapper &weights,
const memory_desc_wrapper &dst) {
using namespace format_tag;

for (const auto &mdw : {src, weights, dst}) {
if (!mdw.is_plain() || mdw.has_runtime_dims()) { return false; }
}
return true;
}
};

status_t init(impl::engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
kernel_t kernel_;
};

} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
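For orientation, this reference kernel is reached through the ordinary matmul primitive dispatch. A minimal usage sketch against the public oneDNN C++ API, assuming a SYCL-capable GPU engine at index 0 and v3-style primitive_desc constructors; the dispatcher would select dpcpp:ref:any only when no more specialized implementation applies.

#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::gpu, 0); // SYCL GPU engine (assumed available)
    stream strm(eng);

    // 2 x 16 x 32 times 2 x 32 x 64 -> 2 x 16 x 64, plain row-major layouts.
    memory::desc src_md({2, 16, 32}, memory::data_type::f32, memory::format_tag::abc);
    memory::desc wei_md({2, 32, 64}, memory::data_type::f32, memory::format_tag::abc);
    memory::desc dst_md({2, 16, 64}, memory::data_type::f32, memory::format_tag::abc);

    matmul::primitive_desc pd(eng, src_md, wei_md, dst_md);
    matmul mm(pd);

    memory src(src_md, eng), wei(wei_md, eng), dst(dst_md, eng);
    mm.execute(strm,
            {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, wei}, {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}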
2 changes: 2 additions & 0 deletions src/gpu/generic/sycl/sycl_io_helper.hpp
@@ -239,6 +239,8 @@ struct memory_tensor_t {
store(val, md_.off_v(offsets));
}

inline void *ptr() const { return mem_.get_pointer(); }

private:
xpu::sycl::memory_arg_t<mode> mem_;
xpu::sycl::md_t md_;
12 changes: 11 additions & 1 deletion src/gpu/generic/sycl/sycl_post_ops.hpp
@@ -34,7 +34,9 @@ struct ref_eltwise_fwd_t {
: alg_(alg), alpha_(alpha), beta_(beta), scale_(scale) {
using namespace alg_kind;
assert(utils::one_of(alg_, eltwise_relu, eltwise_linear, eltwise_clip,
- eltwise_clip_v2, eltwise_hardswish));
+ eltwise_clip_v2, eltwise_hardswish, eltwise_gelu_tanh,
+ eltwise_gelu_erf, eltwise_tanh, eltwise_logistic, eltwise_swish,
+ eltwise_elu));
}

ref_eltwise_fwd_t(const post_ops_t::entry_t::eltwise_t &eltwise)
@@ -81,6 +83,14 @@ struct ref_eltwise_fwd_t {
case eltwise_hardswish:
d = dnnl::impl::math::hardswish_fwd(s, alpha, beta);
break;
case eltwise_gelu_tanh: d = gelu_tanh_fwd(s); break;
case eltwise_gelu_erf: d = gelu_erf_fwd(s); break;
case eltwise_tanh: d = tanh_fwd(s); break;
case eltwise_logistic: d = logistic_fwd(s); break;
case eltwise_swish:
d = dnnl::impl::math::swish_fwd(s, alpha);
break;
case eltwise_elu: d = dnnl::impl::math::elu_fwd(s, alpha); break;
default: d = ::sycl::nan(0u);
}
return d;
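Since this change extends ref_eltwise_fwd_t with gelu_tanh, gelu_erf, tanh, logistic, swish, and elu, those algorithms can now appear as matmul post-ops. A small sketch of attaching one through the public attr API, assuming oneDNN v3 signatures; not taken from this PR.

#include "dnnl.hpp"

// Build a primitive_attr carrying a gelu_tanh eltwise post-op, then pass it to
// matmul::primitive_desc(engine, src_md, wei_md, dst_md, attr).
dnnl::primitive_attr make_gelu_post_op_attr() {
    dnnl::post_ops po;
    po.append_eltwise(dnnl::algorithm::eltwise_gelu_tanh, /*alpha=*/0.f, /*beta=*/0.f);
    dnnl::primitive_attr attr;
    attr.set_post_ops(po);
    return attr;
}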
31 changes: 31 additions & 0 deletions src/gpu/generic/sycl/sycl_primitive_conf.hpp
@@ -109,6 +109,37 @@ struct sycl_eltwise_conf_t {
sycl_post_ops_t post_ops;
};

struct sycl_matmul_conf_t {
xpu::sycl::md_t data_md;
xpu::sycl::md_t dst_md;
xpu::sycl::md_t weights_md;
xpu::sycl::md_t bias_md;
alg_kind_t alg_kind;
bool transpose_data; // TODO: can we remove?
bool transpose_dst;
bool transpose_weights;
bool transpose_bias;
dim_t post_po_len;
xpu::sycl::md_t binary_src_arr[sycl::sycl_post_ops_t::max_post_ops];
sycl_post_ops_t post_ops;
int wk_size;

int data_mask;
int weights_mask;
int bias_mask;

bool do_scale_data;
bool do_scale_weights;
bool do_scale_dst;
bool single_weights_scale;

bool use_data_zeropoints;
bool use_weights_zeropoints;
bool use_dst_zeropoints;

bool use_dropout;
};

struct sycl_prelu_conf_t {
prop_kind_t prop_kind;
xpu::sycl::md_t data_md;