Skip to content

Commit ee131d3

Browse files
authored
gpu: generic: add simple SYCL reduction implementation (#2249)
1 parent 7e450f8 commit ee131d3

5 files changed

+290
-0
lines changed
+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#include "simple_reduction.hpp"
2+
3+
#include "gpu/generic/sycl/engine.hpp"
4+
#include "gpu/generic/sycl/simple_reduction_kernels.hpp"
5+
6+
namespace dnnl {
7+
namespace impl {
8+
namespace gpu {
9+
namespace generic {
10+
namespace sycl {
11+
12+
// Fills the kernel configuration (conf_) and the launch size (dst_nelems_)
// from the reduction descriptor and the source/destination memory
// descriptors. Always returns status::success.
status_t simple_reduction_t::pd_t::init_conf() {
    auto src_mdw = memory_desc_wrapper(src_md());
    auto dst_mdw = memory_desc_wrapper(dst_md());

    // Scalar parameters copied straight from the descriptor.
    conf_.alg = desc()->alg_kind;
    conf_.p = desc()->p;
    conf_.eps = desc()->eps;

    conf_.src_md = xpu::sycl::md_t(src_md());
    conf_.dst_md = xpu::sycl::md_t(dst_md());

    dst_nelems_ = dst_mdw.nelems();

    // A dimension is reduced when its source and destination extents differ.
    // Every other slot, including those past the tensor rank, gets an extent
    // of 1.
    const auto rank = dst_mdw.ndims();
    for (int dim = 0; dim < xpu::sycl::md_t::max_dims; dim++) {
        const bool is_reduced
                = dim < rank && src_mdw.dims()[dim] != dst_mdw.dims()[dim];

        conf_.reduce_dims[dim] = is_reduced ? src_mdw.dims()[dim] : dim_t {1};
        // reduce_size accumulates the product of all reduced extents.
        if (is_reduced) conf_.reduce_size *= conf_.reduce_dims[dim];
    }

    conf_.post_ops = sycl_post_ops_t(attr(), dst_mdw);

    return status::success;
}
38+
39+
// Creates the SYCL kernel object for reduction_kernel_fwd_t and stores it in
// kernel_. Returns the status reported by create_kernel().
status_t simple_reduction_t::init(impl::engine_t *engine) {
    return create_kernel(engine,
            ::sycl::get_kernel_id<reduction_kernel_fwd_t>(), &kernel_);
}
45+
46+
// Submits the reduction kernel over a 1D range with one work-item per
// destination element (pd()->dst_nelems_).
status_t simple_reduction_t::execute(const exec_ctx_t &ctx) const {
    const auto submit_reduction = [&](::sycl::handler &cgh) {
        reduction_kernel_fwd_t kernel_fn(pd()->conf_, cgh, ctx);
        const ::sycl::range<1> launch_range(pd()->dst_nelems_);
        cgh.parallel_for(launch_range, kernel_fn);
    };

    return parallel_for(ctx, kernel_, submit_reduction);
}
52+
53+
} // namespace sycl
54+
} // namespace generic
55+
} // namespace gpu
56+
} // namespace impl
57+
} // namespace dnnl
+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*******************************************************************************
2+
* Copyright 2024 Intel Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*******************************************************************************/
16+
17+
#ifndef GPU_GENERIC_SYCL_SIMPLE_REDUCTION_HPP
18+
#define GPU_GENERIC_SYCL_SIMPLE_REDUCTION_HPP
19+
20+
#include "common/primitive_desc_iterator.hpp"
21+
#include "common/reorder.hpp"
22+
#include "common/reorder_pd.hpp"
23+
#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
24+
#include "gpu/generic/sycl/sycl_io_helper.hpp"
25+
#include "gpu/generic/sycl/sycl_post_ops.hpp"
26+
#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
27+
#include "gpu/generic/sycl/sycl_utils.hpp"
28+
#include "gpu/gpu_reduction_pd.hpp"
29+
30+
namespace dnnl {
31+
namespace impl {
32+
namespace gpu {
33+
namespace generic {
34+
namespace sycl {
35+
36+
struct simple_reduction_t : public gpu::generic::sycl::primitive_t {
37+
using gpu::generic::sycl::primitive_t::primitive_t;
38+
39+
struct pd_t : public gpu_reduction_pd_t {
40+
using gpu_reduction_pd_t::gpu_reduction_pd_t;
41+
42+
DECLARE_COMMON_PD_T("dpcpp:ref:any", simple_reduction_t);
43+
44+
status_t init(impl::engine_t *engine) {
45+
using sm = primitive_attr_t::skip_mask_t;
46+
47+
memory_desc_wrapper src_wrap(src_md());
48+
memory_desc_wrapper dst_wrap(dst_md());
49+
50+
bool ok = set_default_params() == status::success
51+
&& attr()->has_default_values(sm::post_ops)
52+
&& sycl_post_ops_t::post_ops_ok(attr())
53+
&& attr_.set_default_formats(dst_md()) == status::success
54+
&& src_wrap.is_plain() && dst_wrap.is_plain()
55+
&& src_wrap.ndims() == dst_wrap.ndims()
56+
&& md_dims_in_range(src_md()) && md_dims_in_range(dst_md());
57+
if (!ok) return status::unimplemented;
58+
59+
return init_conf();
60+
}
61+
62+
sycl_simple_reduction_conf_t conf_;
63+
dim_t dst_nelems_;
64+
65+
private:
66+
status_t init_conf();
67+
};
68+
69+
status_t init(impl::engine_t *engine) override;
70+
status_t execute(const exec_ctx_t &ctx) const override;
71+
72+
private:
73+
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
74+
kernel_t kernel_;
75+
std::shared_ptr<impl::primitive_t> reorder_p_;
76+
};
77+
78+
} // namespace sycl
79+
} // namespace generic
80+
} // namespace gpu
81+
} // namespace impl
82+
} // namespace dnnl
83+
84+
#endif
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
2+
#ifndef GPU_GENERIC_SYCL_SIMPLE_REDUCTION_KERNELS_HPP
3+
#define GPU_GENERIC_SYCL_SIMPLE_REDUCTION_KERNELS_HPP
4+
5+
#include "common/c_types_map.hpp"
6+
#include "common/dnnl_thread.hpp"
7+
#include "common/primitive_exec_types.hpp"
8+
#include "common/utils.hpp"
9+
#include "gpu/generic/sycl/sycl_io_helper.hpp"
10+
#include "gpu/generic/sycl/sycl_math_utils.hpp"
11+
#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
12+
#include "xpu/sycl/memory_storage_base.hpp"
13+
#include "xpu/sycl/types.hpp"
14+
15+
namespace dnnl {
16+
namespace impl {
17+
namespace gpu {
18+
namespace generic {
19+
namespace sycl {
20+
21+
struct Reducer {
22+
dnnl_alg_kind_t alg_;
23+
float p_, eps_;
24+
25+
Reducer(dnnl_alg_kind_t alg, float p, float eps)
26+
: alg_(alg), p_(p), eps_(eps) {}
27+
28+
float identity() const {
29+
if (alg_ == dnnl_reduction_min) {
30+
return std::numeric_limits<float>::max();
31+
} else if (alg_ == dnnl_reduction_max) {
32+
return std::numeric_limits<float>::lowest();
33+
} else if (alg_ == dnnl_reduction_mul) {
34+
return 1.f;
35+
}
36+
37+
return 0.f;
38+
}
39+
40+
float reduce(float lhs, float rhs) const {
41+
if (alg_ == dnnl_reduction_sum || alg_ == dnnl_reduction_mean) {
42+
return lhs + rhs;
43+
} else if (alg_ == dnnl_reduction_min) {
44+
return ::sycl::min(lhs, rhs);
45+
} else if (alg_ == dnnl_reduction_max) {
46+
return ::sycl::max(lhs, rhs);
47+
} else if (alg_ == dnnl_reduction_mul) {
48+
return lhs * rhs;
49+
} else if (alg_ == dnnl_reduction_norm_lp_max
50+
|| alg_ == dnnl_reduction_norm_lp_sum
51+
|| alg_ == dnnl_reduction_norm_lp_power_p_max
52+
|| alg_ == dnnl_reduction_norm_lp_power_p_sum) {
53+
return lhs + ::sycl::pow(::sycl::fabs(rhs), p_);
54+
}
55+
56+
return ::sycl::nan(0U);
57+
}
58+
59+
float finalize(float val, int size) const {
60+
if (alg_ == dnnl_reduction_mean) {
61+
return val / size;
62+
} else if (alg_ == dnnl_reduction_norm_lp_max) {
63+
return ::sycl::rootn(::sycl::max(val, eps_), p_);
64+
} else if (alg_ == dnnl_reduction_norm_lp_sum) {
65+
return ::sycl::rootn(val + eps_, p_);
66+
} else if (alg_ == dnnl_reduction_norm_lp_power_p_max) {
67+
return ::sycl::max(val, eps_);
68+
} else if (alg_ == dnnl_reduction_norm_lp_power_p_sum) {
69+
return val + eps_;
70+
}
71+
72+
return val;
73+
}
74+
};
75+
76+
struct reduction_kernel_fwd_t {
77+
sycl_simple_reduction_conf_t conf_;
78+
xpu::sycl::in_memory_arg_t src_;
79+
xpu::sycl::out_memory_arg_t dst_;
80+
post_op_input_args po_args_;
81+
82+
reduction_kernel_fwd_t(const sycl_simple_reduction_conf_t &conf,
83+
::sycl::handler &cgh, const exec_ctx_t &ctx)
84+
: conf_(conf)
85+
, src_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC))
86+
, dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST))
87+
, po_args_(cgh, ctx, conf_.post_ops) {}
88+
89+
void operator()(::sycl::item<1> item) const {
90+
Reducer reducer(conf_.alg, conf_.p, conf_.eps);
91+
92+
memory_tensor_t<::sycl::access_mode::read> src(src_, conf_.src_md);
93+
memory_tensor_t<::sycl::access_mode::write> dst(dst_, conf_.dst_md);
94+
const int id = item.get_linear_id();
95+
96+
const auto &dst_md = conf_.dst_md;
97+
dims_t pos;
98+
int l_offset = id;
99+
for (int i = 0; i < dst_md.ndims(); i++) {
100+
const int d = dst_md.ndims() - 1 - i;
101+
const dim_t cur_dim = dst_md.dims()[d];
102+
pos[d] = l_offset % cur_dim;
103+
l_offset = l_offset / cur_dim;
104+
}
105+
106+
float acc = reducer.identity();
107+
for (off_t d0 = 0; d0 < conf_.reduce_dims[0]; d0++)
108+
for (off_t d1 = 0; d1 < conf_.reduce_dims[1]; d1++)
109+
for (off_t d2 = 0; d2 < conf_.reduce_dims[2]; d2++)
110+
for (off_t d3 = 0; d3 < conf_.reduce_dims[3]; d3++)
111+
for (off_t d4 = 0; d4 < conf_.reduce_dims[4]; d4++)
112+
for (off_t d5 = 0; d5 < conf_.reduce_dims[5];
113+
d5++) {
114+
dims_t src_off = {pos[0] + d0, pos[1] + d1,
115+
pos[2] + d2, pos[3] + d3, pos[4] + d4,
116+
pos[5] + d5};
117+
const float val = src.load_md(src_off);
118+
acc = reducer.reduce(acc, val);
119+
}
120+
121+
float result = reducer.finalize(acc, conf_.reduce_size);
122+
result = conf_.post_ops.apply(result, dst.load_md(pos), po_args_, pos);
123+
dst.store_md(result, pos);
124+
}
125+
};
126+
127+
} // namespace sycl
128+
} // namespace generic
129+
} // namespace gpu
130+
} // namespace impl
131+
} // namespace dnnl
132+
#endif

src/gpu/generic/sycl/sycl_primitive_conf.hpp

+12
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,17 @@ struct sycl_pooling_bwd_conf_t : public sycl_pooling_base_conf_t {
415415
xpu::sycl::md_t diff_dst_md;
416416
};
417417

418+
struct sycl_simple_reduction_conf_t {
419+
dnnl_alg_kind_t alg = dnnl_alg_kind_undef;
420+
xpu::sycl::md_t src_md;
421+
xpu::sycl::md_t dst_md;
422+
float p;
423+
float eps;
424+
sycl_post_ops_t post_ops;
425+
dim_t reduce_dims[xpu::sycl::md_t::max_dims];
426+
int reduce_size = 1;
427+
};
428+
418429
CHECK_SYCL_KERNEL_ARG_TYPE(sycl_binary_conf_t);
419430
CHECK_SYCL_KERNEL_ARG_TYPE(sycl_prelu_conf_t);
420431
CHECK_SYCL_KERNEL_ARG_TYPE(sycl_shuffle_conf_t);
@@ -431,6 +442,7 @@ CHECK_SYCL_KERNEL_ARG_TYPE(sycl_pooling_bwd_conf_t);
431442
CHECK_SYCL_KERNEL_ARG_TYPE(sycl_convolution_fwd_conf_t);
432443
CHECK_SYCL_KERNEL_ARG_TYPE(sycl_convolution_bwd_data_conf_t);
433444
CHECK_SYCL_KERNEL_ARG_TYPE(sycl_convolution_bwd_weights_conf_t);
445+
CHECK_SYCL_KERNEL_ARG_TYPE(sycl_simple_reduction_conf_t);
434446

435447
} // namespace sycl
436448
} // namespace generic

src/gpu/gpu_reduction_list.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
#include "gpu/amd/miopen_reduction.hpp"
3737
#endif
3838

39+
#ifdef GENERIC_SYCL_KERNELS_ENABLED
40+
#include "gpu/generic/sycl/simple_reduction.hpp"
41+
#endif
42+
3943
namespace dnnl {
4044
namespace impl {
4145
namespace gpu {
@@ -51,6 +55,7 @@ constexpr impl_list_item_t impl_list[] = REG_REDUCTION_P({
5155
GPU_INSTANCE_INTEL(intel::ocl::reusable_ref_reduction_t)
5256
GPU_INSTANCE_NVIDIA(nvidia::cudnn_reduction_t)
5357
GPU_INSTANCE_AMD(amd::miopen_reduction_t)
58+
GPU_INSTANCE_GENERIC_SYCL(generic::sycl::simple_reduction_t)
5459
nullptr,
5560
});
5661
// clang-format on

0 commit comments

Comments
 (0)