generic: sycl: matmul: implemented #2054

Merged: 1 commit, Sep 3, 2024
731 changes: 731 additions & 0 deletions src/gpu/generic/sycl/matmul_kernels.hpp

Large diffs are not rendered by default.
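The file above defines matmul_kernel_fwd_t, which the reference implementation below launches. Since the diff is not rendered, here is a minimal host-side sketch of the register-blocking idea implied by the register_block_M / register_block_N constants referenced in ref_matmul.cpp; the names, tile sizes, and structure are illustrative assumptions, not the actual kernel.

#include <vector>

constexpr int register_block_M = 4; // assumed tile sizes, for illustration only
constexpr int register_block_N = 4;

// Each "work item" accumulates a register_block_M x register_block_N tile of
// C = A * B in a small local array before writing it back. The real kernel
// additionally handles broadcasting, scales, zero-points, dropout and post-ops.
void matmul_tile_sketch(const std::vector<float> &A, const std::vector<float> &B,
        std::vector<float> &C, int M, int N, int K, int tile_m, int tile_n) {
    float acc[register_block_M][register_block_N] = {};
    for (int k = 0; k < K; ++k)
        for (int m = 0; m < register_block_M; ++m)
            for (int n = 0; n < register_block_N; ++n) {
                const int row = tile_m * register_block_M + m;
                const int col = tile_n * register_block_N + n;
                if (row < M && col < N)
                    acc[m][n] += A[row * K + k] * B[k * N + col];
            }
    for (int m = 0; m < register_block_M; ++m)
        for (int n = 0; n < register_block_N; ++n) {
            const int row = tile_m * register_block_M + m;
            const int col = tile_n * register_block_N + n;
            if (row < M && col < N) C[row * N + col] = acc[m][n];
        }
}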

162 changes: 162 additions & 0 deletions src/gpu/generic/sycl/ref_matmul.cpp
@@ -0,0 +1,162 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/generic/sycl/ref_matmul.hpp"
#include "gpu/generic/sycl/matmul_kernels.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

status_t ref_matmul_t::pd_t::init_conf() {
conf_ = sycl_matmul_conf_t();

int matmul_dim_1 = ndims() - 2;
int matmul_dim_2 = ndims() - 1;

memory_desc_t data_md_copy = *src_md();
auto &data_strides = data_md_copy.format_desc.blocking.strides;
if (data_strides[matmul_dim_1] < data_strides[matmul_dim_2]) {
std::swap(data_strides[matmul_dim_1], data_strides[matmul_dim_2]);
std::swap(data_md_copy.dims[matmul_dim_1],
data_md_copy.dims[matmul_dim_2]);
conf_.transpose_data = true;
}
conf_.data_md = xpu::sycl::md_t(&data_md_copy);

memory_desc_t weights_md_copy = *weights_md();
auto &weights_strides = weights_md_copy.format_desc.blocking.strides;
if (weights_strides[matmul_dim_1] < weights_strides[matmul_dim_2]) {
std::swap(weights_strides[matmul_dim_1], weights_strides[matmul_dim_2]);
std::swap(weights_md_copy.dims[matmul_dim_1],
weights_md_copy.dims[matmul_dim_2]);
conf_.transpose_weights = true;
}
conf_.weights_md = xpu::sycl::md_t(&weights_md_copy);

memory_desc_t dst_md_copy = *dst_md();
auto &dst_strides = dst_md_copy.format_desc.blocking.strides;
if (dst_strides[matmul_dim_1] < dst_strides[matmul_dim_2]) {
std::swap(dst_strides[matmul_dim_1], dst_strides[matmul_dim_2]);
std::swap(
dst_md_copy.dims[matmul_dim_1], dst_md_copy.dims[matmul_dim_2]);
conf_.transpose_dst = true;
}
conf_.dst_md = xpu::sycl::md_t(&dst_md_copy);

if (with_bias()) {
memory_desc_t bias_md_copy = *weights_md(1);
auto &bias_strides = bias_md_copy.format_desc.blocking.strides;
if (bias_strides[matmul_dim_1] < bias_strides[matmul_dim_2]) {
std::swap(bias_strides[matmul_dim_1], bias_strides[matmul_dim_2]);
std::swap(bias_md_copy.dims[matmul_dim_1],
bias_md_copy.dims[matmul_dim_2]);
conf_.transpose_bias = true;
}
conf_.bias_md = xpu::sycl::md_t(&bias_md_copy);
}

dims_t dst_blocks;
for (int i = 0; i < matmul_kernel_fwd_t::max_supported_ndims; i++) {
if (i < conf_.dst_md.ndims()) {
dst_blocks[i] = conf_.dst_md.dims()[i];
} else {
dst_blocks[i] = 1;
}
}
dst_blocks[matmul_dim_1] = math::div_up(
dst_blocks[matmul_dim_1], matmul_kernel_fwd_t::register_block_N);
dst_blocks[matmul_dim_2] = math::div_up(
dst_blocks[matmul_dim_2], matmul_kernel_fwd_t::register_block_M);
int n_blocks = 1;
for (int i = 0; i < matmul_kernel_fwd_t::max_supported_ndims; i++) {
n_blocks *= dst_blocks[i];
}
conf_.wk_size = n_blocks;

int high_two_bits = 3 << (ndims() - 2);
// last two dimensions of data and weights are never broadcast
conf_.data_mask
= utils::get_dims_mask(dst_md()->dims, src_md()->dims, ndims())
| high_two_bits;
conf_.weights_mask
= utils::get_dims_mask(dst_md()->dims, weights_md(0)->dims, ndims())
| high_two_bits;
conf_.bias_mask = utils::get_dims_mask(
dst_md()->dims, weights_md(1)->dims, ndims());

conf_.do_scale_data
= !attr()->scales_.get(DNNL_ARG_SRC_0).has_default_values();
conf_.do_scale_weights
= !attr()->scales_.get(DNNL_ARG_WEIGHTS).has_default_values();
conf_.do_scale_dst
= !attr()->scales_.get(DNNL_ARG_DST).has_default_values();
conf_.single_weights_scale
= attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ == 0;

conf_.use_data_zeropoints
= !attr()->zero_points_.has_default_values(DNNL_ARG_SRC_0);
conf_.use_weights_zeropoints
= !attr()->zero_points_.has_default_values(DNNL_ARG_WEIGHTS_0);
conf_.use_dst_zeropoints
= !attr()->zero_points_.has_default_values(DNNL_ARG_DST);

conf_.use_dropout = !attr()->dropout_.has_default_values();

conf_.post_ops = sycl_post_ops_t(attr());

for (auto i = 0; i < conf_.post_ops.get_post_op(); ++i) {
const auto &e = attr()->post_ops_.entry_[i];
if (e.is_binary() || e.is_prelu()) {
conf_.binary_src_arr[i] = xpu::sycl::md_t(
arg_md(DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1));
}
}
return status::success;
}

status_t ref_matmul_t::init(impl::engine_t *engine) {
const auto kid = ::sycl::get_kernel_id<matmul_kernel_fwd_t>();
CHECK(create_kernel(engine, kid, &kernel_));
return status::success;
}

status_t ref_matmul_t::execute(const exec_ctx_t &ctx) const {

parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
matmul_kernel_fwd_t matmul_kernel(pd()->conf_, cgh, ctx);

const int block_size = 32;
const int wg_size = 32;

const int t_work = pd()->conf_.wk_size;
const int wg_work = wg_size * block_size;
const int wg_cnt = utils::div_up(t_work, wg_work);

cgh.parallel_for(
::sycl::nd_range<1>(wg_cnt * wg_size, wg_size), matmul_kernel);
});

return status::success;
}

} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl
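A note on the mask logic in init_conf(): conf_.data_mask and conf_.weights_mask record, per dimension, whether the tensor broadcasts into dst, and high_two_bits forces the two innermost dimensions on because the M/N/K dimensions of a matmul never broadcast. A standalone sketch of that bit arithmetic follows; dims_mask_sketch is a hypothetical stand-in for utils::get_dims_mask, assuming it sets bit i when the two sizes agree in dimension i.

#include <cstdio>

// Bit i is set when the tensor matches dst in dimension i (no broadcast there).
int dims_mask_sketch(const int *dst_dims, const int *t_dims, int ndims) {
    int mask = 0;
    for (int i = 0; i < ndims; ++i)
        if (dst_dims[i] == t_dims[i]) mask |= 1 << i;
    return mask;
}

int main() {
    // dst = 8 x 16 x 32, src = 1 x 16 x 32: the batch dimension broadcasts.
    int dst[] = {8, 16, 32}, src[] = {1, 16, 32};
    const int ndims = 3;
    const int high_two_bits = 3 << (ndims - 2); // always mark the last two dims
    const int data_mask = dims_mask_sketch(dst, src, ndims) | high_two_bits;
    std::printf("data_mask = %d\n", data_mask); // 6 == 0b110: bit 0 (batch) off
    return 0;
}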
158 changes: 158 additions & 0 deletions src/gpu/generic/sycl/ref_matmul.hpp
@@ -0,0 +1,158 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_GENERIC_SYCL_REF_MATMUL_HPP
#define GPU_GENERIC_SYCL_REF_MATMUL_HPP

#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
#include "gpu/generic/sycl/sycl_io_helper.hpp"
#include "gpu/generic/sycl/sycl_post_ops.hpp"
#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
#include "gpu/generic/sycl/sycl_q10n.hpp"
#include "gpu/generic/sycl/sycl_utils.hpp"
#include "gpu/gpu_matmul_pd.hpp"
#include "xpu/sycl/types.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

struct ref_matmul_t : public gpu::generic::sycl::primitive_t {
using gpu::generic::sycl::primitive_t::primitive_t;

struct pd_t : public gpu_matmul_pd_t {
using gpu_matmul_pd_t::gpu_matmul_pd_t;

DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_matmul_t);

status_t init(impl::engine_t *engine) {
using namespace data_type;
using sm = primitive_attr_t::skip_mask_t;

const memory_desc_wrapper src_d(src_md());
const memory_desc_wrapper weights_d(weights_md(0));
const memory_desc_wrapper bias_d(weights_md(1));
const memory_desc_wrapper dst_d(dst_md());

const bool ok = set_default_params() == status::success
&& attr_.set_default_formats(dst_md()) == status::success
&& check_data_types(src_d, weights_d, dst_d)
&& check_formats(src_d, weights_d, dst_d)
&& attr()->has_default_values(sm::scales_runtime
| sm::zero_points_runtime | sm::post_ops
| sm::dropout | sm::scales_runtime_data_type
| sm::zero_points_runtime_data_type)
&& IMPLICATION(!attr()->scales_.has_default_values(),
check_scales_mask())
&& post_ops_ok() && md_dims_in_range(src_md())
&& md_dims_in_range(weights_md());
if (!ok) return status::unimplemented;

return init_conf();
}

sycl_matmul_conf_t conf_;

private:
status_t init_conf();

status_t set_default_params() {
if (src_md_.format_kind == format_kind::any) {
auto src_tag = utils::pick(ndims() - 2, format_tag::ab,
format_tag::abc, format_tag::abcd);
CHECK(memory_desc_init_by_tag(src_md_, src_tag));
}
const memory_desc_wrapper src_d(src_md());
if (src_d.is_blocking_desc()) {
if (weights_md_.format_kind == format_kind::any) {
CHECK(memory_desc_init_by_blocking_desc(
weights_md_, src_d.blocking_desc()));
}
if (dst_md_.format_kind == format_kind::any) {
CHECK(memory_desc_init_by_blocking_desc(
dst_md_, src_d.blocking_desc()));
}
}
const memory_desc_wrapper dst_d(dst_md());
if (dst_d.is_blocking_desc()) {
if (bias_md_.format_kind == format_kind::any) {
CHECK(memory_desc_init_by_blocking_desc(
bias_md_, dst_d.blocking_desc()));
}
}
return status::success;
}

bool check_scales_mask() const {
const std::vector<int> supported_args
= {DNNL_ARG_SRC_0, DNNL_ARG_WEIGHTS_0, DNNL_ARG_DST};
return attr_scales_ok(supported_args);
}

bool post_ops_ok() const {
// Dw conv post-ops are not supported.
return attr()->post_ops_.len() <= sycl_post_ops_t::max_post_ops
&& attr()->post_ops_.has_default_values(
{primitive_kind::eltwise, primitive_kind::binary,
primitive_kind::sum});
}

static bool check_data_types(const memory_desc_wrapper &src,
const memory_desc_wrapper &weights,
const memory_desc_wrapper &dst) {
using namespace data_type;

const auto src_dt = src.data_type();
const auto weights_dt = weights.data_type();
const auto dst_dt = dst.data_type();

for (auto t : {src_dt, weights_dt, dst_dt}) {
if (!utils::one_of(t, f32, bf16, f16, s8, u8, s32))
return false;
}

return true;
}

static bool check_formats(const memory_desc_wrapper &src,
const memory_desc_wrapper &weights,
const memory_desc_wrapper &dst) {
using namespace format_tag;

for (const auto &mdw : {src, weights, dst}) {
if (!mdw.is_plain() || mdw.has_runtime_dims()) { return false; }
}
return true;
}
};

status_t init(impl::engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
kernel_t kernel_;
};

} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
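For orientation, this reference kernel is reached through the ordinary matmul primitive dispatch. A minimal usage sketch against the public oneDNN C++ API, assuming a SYCL-capable GPU engine at index 0 and v3-style primitive_desc constructors; the dispatcher would select dpcpp:ref:any only when no more specialized implementation applies.

#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::gpu, 0); // SYCL GPU engine (assumed available)
    stream strm(eng);

    // 2 x 16 x 32 times 2 x 32 x 64 -> 2 x 16 x 64, plain row-major layouts.
    memory::desc src_md({2, 16, 32}, memory::data_type::f32, memory::format_tag::abc);
    memory::desc wei_md({2, 32, 64}, memory::data_type::f32, memory::format_tag::abc);
    memory::desc dst_md({2, 16, 64}, memory::data_type::f32, memory::format_tag::abc);

    matmul::primitive_desc pd(eng, src_md, wei_md, dst_md);
    matmul mm(pd);

    memory src(src_md, eng), wei(wei_md, eng), dst(dst_md, eng);
    mm.execute(strm,
            {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, wei}, {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}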
2 changes: 2 additions & 0 deletions src/gpu/generic/sycl/sycl_io_helper.hpp
@@ -239,6 +239,8 @@ struct memory_tensor_t {
store(val, md_.off_v(offsets));
}

inline void *ptr() const { return mem_.get_pointer(); }

private:
xpu::sycl::memory_arg_t<mode> mem_;
xpu::sycl::md_t md_;
12 changes: 11 additions & 1 deletion src/gpu/generic/sycl/sycl_post_ops.hpp
@@ -34,7 +34,9 @@ struct ref_eltwise_fwd_t {
: alg_(alg), alpha_(alpha), beta_(beta), scale_(scale) {
using namespace alg_kind;
assert(utils::one_of(alg_, eltwise_relu, eltwise_linear, eltwise_clip,
- eltwise_clip_v2, eltwise_hardswish));
+ eltwise_clip_v2, eltwise_hardswish, eltwise_gelu_tanh,
+ eltwise_gelu_erf, eltwise_tanh, eltwise_logistic, eltwise_swish,
+ eltwise_elu));
}

ref_eltwise_fwd_t(const post_ops_t::entry_t::eltwise_t &eltwise)
@@ -81,6 +83,14 @@ struct ref_eltwise_fwd_t {
case eltwise_hardswish:
d = dnnl::impl::math::hardswish_fwd(s, alpha, beta);
break;
case eltwise_gelu_tanh: d = gelu_tanh_fwd(s); break;
case eltwise_gelu_erf: d = gelu_erf_fwd(s); break;
case eltwise_tanh: d = tanh_fwd(s); break;
case eltwise_logistic: d = logistic_fwd(s); break;
case eltwise_swish:
d = dnnl::impl::math::swish_fwd(s, alpha);
break;
case eltwise_elu: d = dnnl::impl::math::elu_fwd(s, alpha); break;
default: d = ::sycl::nan(0u);
}
return d;
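Since this change extends ref_eltwise_fwd_t with gelu_tanh, gelu_erf, tanh, logistic, swish, and elu, those algorithms can now appear as matmul post-ops. A small sketch of attaching one through the public attr API, assuming oneDNN v3 signatures; not taken from this PR.

#include "dnnl.hpp"

// Build a primitive_attr carrying a gelu_tanh eltwise post-op, then pass it to
// matmul::primitive_desc(engine, src_md, wei_md, dst_md, attr).
dnnl::primitive_attr make_gelu_post_op_attr() {
    dnnl::post_ops po;
    po.append_eltwise(dnnl::algorithm::eltwise_gelu_tanh, /*alpha=*/0.f, /*beta=*/0.f);
    dnnl::primitive_attr attr;
    attr.set_post_ops(po);
    return attr;
}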
31 changes: 31 additions & 0 deletions src/gpu/generic/sycl/sycl_primitive_conf.hpp
@@ -109,6 +109,37 @@ struct sycl_eltwise_conf_t {
sycl_post_ops_t post_ops;
};

struct sycl_matmul_conf_t {
xpu::sycl::md_t data_md;
xpu::sycl::md_t dst_md;
xpu::sycl::md_t weights_md;
xpu::sycl::md_t bias_md;
alg_kind_t alg_kind;
bool transpose_data; // TODO: can we remove?
bool transpose_dst;
bool transpose_weights;
bool transpose_bias;
dim_t post_po_len;
xpu::sycl::md_t binary_src_arr[sycl::sycl_post_ops_t::max_post_ops];
sycl_post_ops_t post_ops;
int wk_size;

int data_mask;
int weights_mask;
int bias_mask;

bool do_scale_data;
bool do_scale_weights;
bool do_scale_dst;
bool single_weights_scale;

bool use_data_zeropoints;
bool use_weights_zeropoints;
bool use_dst_zeropoints;

bool use_dropout;
};

struct sycl_prelu_conf_t {
prop_kind_t prop_kind;
xpu::sycl::md_t data_md;