Commit ac9c4aa

generic: sycl: Introduce spec constants for sycl matmul

Address comments
Aaand it's back!

1 parent: 63184fe

File tree: 7 files changed, +192 -33 lines changed


src/gpu/generic/sycl/matmul_kernels.hpp (+17 -5)

```diff
@@ -18,6 +18,7 @@
 #define GPU_GENERIC_SYCL_MATMUL_KERNELS_HPP
 
 #include "common/primitive_exec_types.hpp"
+#include "gpu/generic/sycl/specialization_constants.hpp"
 #include "gpu/generic/sycl/sycl_io_helper.hpp"
 #include "gpu/generic/sycl/sycl_math_utils.hpp"
 #include "gpu/generic/sycl/sycl_post_ops.hpp"
@@ -409,16 +410,23 @@ struct matmul_kernel_fwd_t {
                       CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_ATTR_DROPOUT_PROBABILITY))
         , po_args_(cgh, ctx, conf_.post_ops) {}
 
-    void operator()(::sycl::nd_item<1> item) const {
+    void operator()(::sycl::nd_item<1> item, ::sycl::kernel_handler kh) const {
         using data_block_t = register_block<register_block_M, register_block_K>;
         using weights_block_t
                 = register_block<register_block_K, register_block_N>;
         using dst_block_t = register_block<register_block_M, register_block_N>;
 
-        memory_tensor_t data_mem(data_, conf_.data_md);
-        memory_tensor_t weights_mem(weights_, conf_.weights_md);
+        // Get the value of the spec constant.
+        auto md_t_spec_const_pod_val = kh.get_specialization_constant<
+                detail::matmul::md_t_spec_const_id>();
+        auto data_md = md_t_spec_const_pod_val.data_md_t;
+        auto weights_md = md_t_spec_const_pod_val.weights_md_t;
+        auto dst_md = md_t_spec_const_pod_val.dst_md_t;
+
+        memory_tensor_t data_mem(data_, data_md);
+        memory_tensor_t weights_mem(weights_, weights_md);
         memory_tensor_t bias_mem(bias_, conf_.bias_md);
-        memory_tensor_t dst_mem(dst_, conf_.dst_md);
+        memory_tensor_t dst_mem(dst_, dst_md);
         memory_plain_t data_scale_mem(data_scale_, data_scales_dt_);
         memory_plain_t weights_scale_mem(weights_scale_, weights_scales_dt_);
         memory_plain_t dst_scale_mem(dst_scale_, dst_scales_dt_);
@@ -513,7 +521,11 @@ struct matmul_kernel_fwd_t {
         off_dst[matmul_dim_2] *= conf_.transpose_dst ? register_block_M
                                                      : register_block_N;
         int m = off_dst[conf_.transpose_dst ? matmul_dim_2 : matmul_dim_1];
-        int n = off_dst[conf_.transpose_dst ? matmul_dim_1 : matmul_dim_2];
+        // TODO: the following code is changed due to a correctness bug
+        // specific to PVC; needs further investigation and a better fix
+        // or explanation.
+        int n = off_dst[matmul_dim_2];
+        if (conf_.transpose_dst) { n = off_dst[matmul_dim_1]; }
 
         dims_t off_src, off_weights, off_bias;
         for (int i = max_supported_ndims - 1; i >= 0; i--) {
```
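The new `operator()` signature is the standard SYCL 2020 way to read a specialization constant on the device: the kernel takes a trailing `::sycl::kernel_handler` and queries it with `get_specialization_constant`. Below is a minimal, self-contained sketch of that pattern; `params_pod` and all other names are illustrative stand-ins, not oneDNN types.

```cpp
#include <sycl/sycl.hpp>
#include <cstdint>

// Illustrative POD playing the role of md_t_spec_const_pod.
struct params_pod {
    std::int32_t ndims;
    std::int32_t dims[6];
};

static constexpr sycl::specialization_id<params_pod> params_id;

int main() {
    sycl::queue q;
    std::int32_t out = 0;
    {
        sycl::buffer<std::int32_t, 1> buf(&out, 1);
        q.submit([&](sycl::handler &cgh) {
            // Host side: fix the value before the kernel is enqueued.
            params_pod p {2, {16, 32}};
            cgh.set_specialization_constant<params_id>(p);
            sycl::accessor acc(buf, cgh, sycl::write_only);
            // Device side: the trailing kernel_handler gives access to the
            // constant, just like the matmul kernel's operator() above.
            cgh.single_task([=](sycl::kernel_handler kh) {
                auto v = kh.get_specialization_constant<params_id>();
                std::int32_t n = 1;
                for (std::int32_t i = 0; i < v.ndims; i++)
                    n *= v.dims[i];
                acc[0] = n;
            });
        });
    } // buffer destruction copies the result back into `out`
    return out == 16 * 32 ? 0 : 1;
}
```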

src/gpu/generic/sycl/ref_matmul.cpp (+72 -21)

```diff
@@ -15,15 +15,18 @@
 *******************************************************************************/
 
 #include "gpu/generic/sycl/ref_matmul.hpp"
+#include "common/c_types_map.hpp"
 #include "gpu/generic/sycl/matmul_kernels.hpp"
+#include "gpu/generic/sycl/specialization_constants.hpp"
+#include "xpu/sycl/types.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace gpu {
 namespace generic {
 namespace sycl {
 
-void ref_matmul_t::pd_t::init_conf() {
+status_t ref_matmul_t::pd_t::init_conf() {
     conf_ = sycl_matmul_conf_t();
 
     conf_.do_scale_data
@@ -52,16 +55,56 @@ void ref_matmul_t::pd_t::init_conf() {
     for (const auto &mdw : {src_d, weights_d, dst_d, bias_d}) {
         if (mdw.has_runtime_dims()) {
             any_runtime_params_ = true;
-            return;
+            return status::unimplemented;
         }
     }
-    init_rt_conf(conf_, src_d, weights_d, dst_d, bias_d);
+    init_rt_conf(conf_, data_md_t, dst_md_t, weights_md_t, src_d,
+            weights_d, dst_d, bias_d);
+    return status::success;
 }
 
 void ref_matmul_t::pd_t::init_rt_conf(sycl_matmul_conf_t &conf,
+        xpu::sycl::md_t_spec_const &data_md_t_,
+        xpu::sycl::md_t_spec_const &dst_md_t_,
+        xpu::sycl::md_t_spec_const &weights_md_t_,
         const memory_desc_wrapper src_d, const memory_desc_wrapper weights_d,
         const memory_desc_wrapper dst_d,
         const memory_desc_wrapper bias_d) const {
+
+    // Lambda because this function will not be used anywhere else
+    auto init_md_t_sc_from_md = [=](xpu::sycl::md_t_spec_const &md_t_sc,
+                                        const memory_desc_t *md) -> void {
+        constexpr int max_dims = 6;
+        using dim32_t = int32_t;
+
+        memory_desc_wrapper mdw(md);
+
+        assert(mdw.format_kind() == format_kind::blocked);
+        assert(mdw.ndims() <= max_dims);
+
+        const auto &blk = mdw.blocking_desc();
+
+        md_t_sc.data_type_ = mdw.data_type();
+#define CHECK_AND_ASSIGN(lhs, rhs) \
+    assert((rhs) <= INT32_MAX); \
+    (lhs) = static_cast<dim32_t>(rhs)
+
+        CHECK_AND_ASSIGN(md_t_sc.ndims_, mdw.ndims());
+        CHECK_AND_ASSIGN(md_t_sc.offset0_, mdw.offset0());
+        CHECK_AND_ASSIGN(md_t_sc.inner_nblks_, blk.inner_nblks);
+
+        for (int d = 0; d < mdw.ndims(); d++) {
+            CHECK_AND_ASSIGN(md_t_sc.dims_[d], mdw.dims()[d]);
+            CHECK_AND_ASSIGN(md_t_sc.padded_dims_[d], mdw.padded_dims()[d]);
+            CHECK_AND_ASSIGN(
+                    md_t_sc.padded_offsets_[d], mdw.padded_offsets()[d]);
+            CHECK_AND_ASSIGN(md_t_sc.strides_[d], blk.strides[d]);
+            CHECK_AND_ASSIGN(md_t_sc.inner_blks_[d], blk.inner_blks[d]);
+            CHECK_AND_ASSIGN(md_t_sc.inner_idxs_[d], blk.inner_idxs[d]);
+        }
+#undef CHECK_AND_ASSIGN
+    };
+
     int matmul_dim_1 = ndims() - 2;
     int matmul_dim_2 = ndims() - 1;
 
@@ -73,7 +116,7 @@ void ref_matmul_t::pd_t::init_rt_conf(sycl_matmul_conf_t &conf,
                 data_md_copy.dims[matmul_dim_2]);
         conf.transpose_data = true;
     }
-    conf.data_md = xpu::sycl::md_t(&data_md_copy);
+    init_md_t_sc_from_md(data_md_t_, &data_md_copy);
 
     memory_desc_t weights_md_copy = *weights_d.md_;
     auto &weights_strides = weights_md_copy.format_desc.blocking.strides;
@@ -83,7 +126,7 @@ void ref_matmul_t::pd_t::init_rt_conf(sycl_matmul_conf_t &conf,
                 weights_md_copy.dims[matmul_dim_2]);
         conf.transpose_weights = true;
     }
-    conf.weights_md = xpu::sycl::md_t(&weights_md_copy);
+    init_md_t_sc_from_md(weights_md_t_, &weights_md_copy);
 
     memory_desc_t dst_md_copy = *dst_d.md_;
     auto &dst_strides = dst_md_copy.format_desc.blocking.strides;
@@ -93,7 +136,7 @@ void ref_matmul_t::pd_t::init_rt_conf(sycl_matmul_conf_t &conf,
                 dst_md_copy.dims[matmul_dim_1], dst_md_copy.dims[matmul_dim_2]);
         conf.transpose_dst = true;
     }
-    conf.dst_md = xpu::sycl::md_t(&dst_md_copy);
+    init_md_t_sc_from_md(dst_md_t_, &dst_md_copy);
 
     if (with_bias()) {
         memory_desc_t bias_md_copy = *bias_d.md_;
@@ -109,8 +152,8 @@ void ref_matmul_t::pd_t::init_rt_conf(sycl_matmul_conf_t &conf,
 
     dims_t dst_blocks;
     for (int i = 0; i < matmul_kernel_fwd_t::max_supported_ndims; i++) {
-        if (i < conf.dst_md.ndims()) {
-            dst_blocks[i] = conf.dst_md.dims()[i];
+        if (i < dst_md_t.ndims_) {
+            dst_blocks[i] = dst_md_t.dims_[i];
         } else {
             dst_blocks[i] = 1;
         }
@@ -137,30 +180,38 @@ void ref_matmul_t::pd_t::init_rt_conf(sycl_matmul_conf_t &conf,
 
 status_t ref_matmul_t::init(impl::engine_t *engine) {
     const auto kid = ::sycl::get_kernel_id<matmul_kernel_fwd_t>();
-    CHECK(create_kernel(engine, kid, &kernel_));
+    CHECK(create_matmul_kernel(engine, kid, &kernel_,
+            {pd()->data_md_t, pd()->dst_md_t, pd()->weights_md_t}));
+    return status::success;
+}
+
+status_t ref_matmul_t::create_matmul_kernel(impl::engine_t *engine,
+        ::sycl::kernel_id kid, kernel_t *kernel,
+        xpu::sycl::md_t_spec_const_pod pod) {
+
+    auto ctx = utils::downcast<const xpu::sycl::engine_impl_t *>(engine->impl())
+                       ->context();
+    auto input_bundle = ::sycl::get_kernel_bundle<::sycl::bundle_state::input>(
+            ctx, {kid});
+
+    input_bundle.template set_specialization_constant<
+            detail::matmul::md_t_spec_const_id>(pod);
+    try {
+        (*kernel) = kernel_t(::sycl::build(input_bundle));
+    } catch (const ::sycl::exception &e) { return status::runtime_error; }
     return status::success;
 }
 
 status_t ref_matmul_t::execute(const exec_ctx_t &ctx) const {
     if (memory_desc_wrapper(pd()->dst_md()).size() == 0) return status::success;
 
-    sycl_matmul_conf_t conf = pd()->conf_;
-    if (pd()->any_runtime_params_) {
-        const auto src_d = ctx.memory_mdw(DNNL_ARG_SRC, pd()->src_md());
-        const auto weights_d
-                = ctx.memory_mdw(DNNL_ARG_WEIGHTS, pd()->weights_md());
-        const auto dst_d = ctx.memory_mdw(DNNL_ARG_DST, pd()->dst_md());
-        const auto bias_d = ctx.memory_mdw(DNNL_ARG_BIAS, pd()->weights_md(1));
-        pd()->init_rt_conf(conf, src_d, weights_d, dst_d, bias_d);
-    }
-
     parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
-        matmul_kernel_fwd_t matmul_kernel(conf, cgh, ctx);
+        matmul_kernel_fwd_t matmul_kernel(pd()->conf_, cgh, ctx);
 
         const int block_size = 32;
         const int wg_size = 32;
 
-        const int t_work = conf.wk_size;
+        const int t_work = pd()->conf_.wk_size;
         const int wg_work = wg_size * block_size;
         const int wg_cnt = utils::div_up(t_work, wg_work);
 
```
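`create_matmul_kernel` follows the SYCL 2020 ahead-of-time specialization recipe: fetch the kernel's `input`-state bundle, set the constant on the bundle, and `::sycl::build` it once at primitive creation, so `execute` launches a prebuilt kernel and the descriptors no longer travel as per-launch kernel arguments. A self-contained sketch of that recipe follows; the kernel name and constant are illustrative, not oneDNN's.

```cpp
#include <sycl/sycl.hpp>

static constexpr sycl::specialization_id<int> block_id;
class spec_const_kernel; // kernel name type, needed for get_kernel_id

int main() {
    sycl::queue q;

    // Build once, ahead of time, with the constant baked in; this is the
    // same sequence create_matmul_kernel runs at primitive creation.
    auto kid = sycl::get_kernel_id<spec_const_kernel>();
    auto input_bundle = sycl::get_kernel_bundle<sycl::bundle_state::input>(
            q.get_context(), {kid});
    input_bundle.set_specialization_constant<block_id>(32);
    auto exec_bundle = sycl::build(input_bundle); // may throw sycl::exception

    int out = 0;
    {
        sycl::buffer<int, 1> buf(&out, 1);
        q.submit([&](sycl::handler &cgh) {
            // Reuse the prebuilt bundle so no JIT work happens at launch.
            cgh.use_kernel_bundle(exec_bundle);
            sycl::accessor acc(buf, cgh, sycl::write_only);
            cgh.single_task<spec_const_kernel>([=](sycl::kernel_handler kh) {
                acc[0] = kh.get_specialization_constant<block_id>();
            });
        });
    }
    return out == 32 ? 0 : 1;
}
```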

src/gpu/generic/sycl/ref_matmul.hpp (+16 -3)

```diff
@@ -17,6 +17,8 @@
 #ifndef GPU_GENERIC_SYCL_REF_MATMUL_HPP
 #define GPU_GENERIC_SYCL_REF_MATMUL_HPP
 
+#include "common/c_types_map.hpp"
+#include "gpu/generic/sycl/specialization_constants.hpp"
 #include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
 #include "gpu/generic/sycl/sycl_io_helper.hpp"
 #include "gpu/generic/sycl/sycl_post_ops.hpp"
@@ -64,21 +66,30 @@ struct ref_matmul_t : public gpu::generic::sycl::primitive_t {
                     && md_dims_in_range(weights_md());
             if (!ok) return status::unimplemented;
 
-            init_conf();
-            return status::success;
+            auto init_status = init_conf();
+
+            return init_status;
         }
 
         sycl_matmul_conf_t conf_;
+
+        xpu::sycl::md_t_spec_const data_md_t;
+        xpu::sycl::md_t_spec_const dst_md_t;
+        xpu::sycl::md_t_spec_const weights_md_t;
+
        bool any_runtime_params_ = false;
 
         void init_rt_conf(sycl_matmul_conf_t &conf,
+                xpu::sycl::md_t_spec_const &data_md_t_,
+                xpu::sycl::md_t_spec_const &dst_md_t_,
+                xpu::sycl::md_t_spec_const &weights_md_t_,
                 const memory_desc_wrapper src_d,
                 const memory_desc_wrapper weights_d,
                 const memory_desc_wrapper dst_d,
                 const memory_desc_wrapper bias_d) const;
 
     private:
-        void init_conf();
+        status_t init_conf();
 
         status_t set_default_params() {
             if (src_md_.format_kind == format_kind::any) {
@@ -153,6 +164,8 @@ struct ref_matmul_t : public gpu::generic::sycl::primitive_t {
     status_t execute(const exec_ctx_t &ctx) const override;
 
 private:
+    status_t create_matmul_kernel(impl::engine_t *engine, ::sycl::kernel_id kid,
+            kernel_t *kernel, xpu::sycl::md_t_spec_const_pod pod);
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
     kernel_t kernel_;
 };
```
src/gpu/generic/sycl/specialization_constants.hpp (new file, +34)

```diff
@@ -0,0 +1,34 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+* Copyright 2024 Codeplay Software
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_GENERIC_SYCL_SPECIALIZATION_CONSTANTS_HPP
+#define GPU_GENERIC_SYCL_SPECIALIZATION_CONSTANTS_HPP
+
+#include <sycl/sycl.hpp>
+
+#include "xpu/sycl/types.hpp"
+
+namespace dnnl::impl::gpu::generic::sycl {
+namespace detail {
+namespace matmul {
+static constexpr ::sycl::specialization_id<xpu::sycl::md_t_spec_const_pod>
+        md_t_spec_const_id;
+} // namespace matmul
+} // namespace detail
+} // namespace dnnl::impl::gpu::generic::sycl
+
+#endif
```

src/gpu/generic/sycl/sycl_primitive_conf.hpp (-3)

```diff
@@ -112,9 +112,6 @@ struct sycl_eltwise_conf_t {
 };
 
 struct sycl_matmul_conf_t {
-    xpu::sycl::md_t data_md;
-    xpu::sycl::md_t dst_md;
-    xpu::sycl::md_t weights_md;
     xpu::sycl::md_t bias_md;
     alg_kind_t alg_kind;
     bool transpose_data; //TODO can we remove?
```
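Dropping the three `md_t` members is the point of the whole change: `sycl_matmul_conf_t` is copied into the kernel as an argument on every launch, and the descriptors now travel through the specialization constant instead. A rough, illustrative back-of-the-envelope of the saving, using a stand-in struct rather than oneDNN's real `md_t`:

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in with the same field layout as md_t_spec_const above
// (160 bytes); oneDNN's real md_t is in the same ballpark.
struct fake_md_t {
    std::int32_t data_type_, ndims_, offset0_, inner_nblks_;
    std::int32_t dims_[6], padded_dims_[6], padded_offsets_[6], strides_[6],
            inner_blks_[6], inner_idxs_[6];
};

// Before: four descriptors in the conf struct. After: only the bias one;
// data/weights/dst are baked into the prebuilt kernel.
struct conf_before { fake_md_t data_md, dst_md, weights_md, bias_md; };
struct conf_after { fake_md_t bias_md; };

int main() {
    std::printf("kernel-arg bytes, before: %zu, after: %zu\n",
            sizeof(conf_before), sizeof(conf_after));
    return 0;
}
```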

src/xpu/sycl/types.hpp (+46)

```diff
@@ -93,6 +93,37 @@ using in_memory_arg_t = memory_arg_t<::sycl::access::mode::read>;
 using out_memory_arg_t = memory_arg_t<::sycl::access::mode::write>;
 using inout_memory_arg_t = memory_arg_t<::sycl::access::mode::read_write>;
 
+// TODO: This is a work-around for reducing the size of kernel parameters
+// being passed to the matmul kernel. This is to be removed when we shift
+// to sycl-RTC.
+struct md_t_spec_const {
+    static constexpr int max_dims = 6;
+
+    using dim32_t = int32_t;
+    using dims32_t = dim32_t[max_dims];
+
+    // The ordering of elements is important during initialization.
+    // This struct cannot have a non-trivial constructor or any
+    // non-trivial types.
+    data_type_t data_type_;
+
+    dim32_t ndims_;
+
+    dims32_t dims_;
+    dims32_t padded_dims_;
+    dims32_t padded_offsets_;
+    dim32_t offset0_;
+
+    dims32_t strides_;
+    dim32_t inner_nblks_;
+    dims32_t inner_blks_;
+    dims32_t inner_idxs_;
+};
+
+struct md_t_spec_const_pod {
+    struct md_t_spec_const data_md_t;
+    struct md_t_spec_const dst_md_t;
+    struct md_t_spec_const weights_md_t;
+};
+
 // TODO: this class mimics memory_desc_t and makes sure it can be passed
 // to SYCL kernels as a kernel argument. SYCL puts restrictions on kernel
 // arguments, e.g. those cannot contain unions.
@@ -146,6 +177,21 @@ struct md_t {
 #undef CHECK_AND_ASSIGN
     }
 
+    md_t(const md_t_spec_const &mimicker)
+        : data_type_(mimicker.data_type_)
+        , ndims_(mimicker.ndims_)
+        , offset0_(mimicker.offset0_)
+        , inner_nblks_(mimicker.inner_nblks_) {
+        for (dim32_t i = 0; i < ndims_; i++) {
+            dims_[i] = mimicker.dims_[i];
+            padded_dims_[i] = mimicker.padded_dims_[i];
+            padded_offsets_[i] = mimicker.padded_offsets_[i];
+            strides_[i] = mimicker.strides_[i];
+            inner_blks_[i] = mimicker.inner_blks_[i];
+            inner_idxs_[i] = mimicker.inner_idxs_[i];
+        }
+    }
+
     template <typename... Args>
     dim_t off(Args... args) const {
         dims_t pos = {args...};
```
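The reason `md_t_spec_const` is written as a bare aggregate, per its comment, is that a specialization-constant value must be device-copyable, which in practice means trivially copyable with no user-provided constructors. A small sketch of how that constraint could be checked at compile time; the mirror struct is illustrative, not the real header:

```cpp
#include <cstdint>
#include <type_traits>

// Mirror of md_t_spec_const's shape; field names follow the diff above.
struct md_t_spec_const_mirror {
    static constexpr int max_dims = 6;
    using dim32_t = std::int32_t;
    using dims32_t = dim32_t[max_dims];

    int data_type_; // stands in for oneDNN's data_type_t enum
    dim32_t ndims_;
    dims32_t dims_, padded_dims_, padded_offsets_;
    dim32_t offset0_;
    dims32_t strides_;
    dim32_t inner_nblks_;
    dims32_t inner_blks_, inner_idxs_;
};

// "Cannot have a non-trivial constructor or any non-trivial types"
// translates to these two traits holding:
static_assert(std::is_trivially_copyable_v<md_t_spec_const_mirror>,
        "spec-constant values must be trivially copyable");
static_assert(std::is_trivially_default_constructible_v<md_t_spec_const_mirror>,
        "no user-provided constructors allowed");

int main() { return 0; }
```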

tests/benchdnn/dnnl_common.hpp (+7 -1)

```diff
@@ -310,7 +310,13 @@ int check_dnnl_status(dnnl_status_t status, const prb_t *prb, res_t *res) {
         case dnnl_unimplemented: {
             // Unconditionally set all Nvidia backend unimplemented cases as
             // not supported.
-            if (is_nvidia_gpu() || is_amd_gpu()) {
+            if (is_nvidia_gpu()
+                    || is_amd_gpu()
+#ifdef DNNL_SYCL_GENERIC
+                    // skip unimplemented configs for sycl impl
+                    || is_gpu()
+#endif
+            ) {
                 res->state = SKIPPED;
                 res->reason = skip_reason::case_not_supported;
                 return OK;
```
