#ifndef GPU_GENERIC_SYCL_SIMPLE_REDUCTION_KERNELS_HPP
#define GPU_GENERIC_SYCL_SIMPLE_REDUCTION_KERNELS_HPP

#include <limits>

#include "common/c_types_map.hpp"
#include "common/dnnl_thread.hpp"
#include "common/primitive_exec_types.hpp"
#include "common/utils.hpp"
#include "gpu/generic/sycl/sycl_io_helper.hpp"
#include "gpu/generic/sycl/sycl_math_utils.hpp"
#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
#include "xpu/sycl/memory_storage_base.hpp"
#include "xpu/sycl/types.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace generic {
namespace sycl {

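// Helper implementing one reduction in three phases: identity() yields
// the algorithm's neutral starting value, reduce() folds a single source
// element into the accumulator, and finalize() applies the per-algorithm
// epilogue once every element has been visited.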
struct Reducer {
    dnnl_alg_kind_t alg_;
    float p_, eps_;

    Reducer(dnnl_alg_kind_t alg, float p, float eps)
        : alg_(alg), p_(p), eps_(eps) {}

    float identity() const {
        if (alg_ == dnnl_reduction_min) {
            return std::numeric_limits<float>::max();
        } else if (alg_ == dnnl_reduction_max) {
            return std::numeric_limits<float>::lowest();
        } else if (alg_ == dnnl_reduction_mul) {
            return 1.f;
        }

        return 0.f;
    }

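    // Folds one source element (rhs) into the accumulator (lhs). The
    // Lp-norm family accumulates sum(|x|^p) here; the root and eps are
    // applied later in finalize(). Unknown algorithms yield NaN.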
    float reduce(float lhs, float rhs) const {
        if (alg_ == dnnl_reduction_sum || alg_ == dnnl_reduction_mean) {
            return lhs + rhs;
        } else if (alg_ == dnnl_reduction_min) {
            return ::sycl::min(lhs, rhs);
        } else if (alg_ == dnnl_reduction_max) {
            return ::sycl::max(lhs, rhs);
        } else if (alg_ == dnnl_reduction_mul) {
            return lhs * rhs;
        } else if (alg_ == dnnl_reduction_norm_lp_max
                || alg_ == dnnl_reduction_norm_lp_sum
                || alg_ == dnnl_reduction_norm_lp_power_p_max
                || alg_ == dnnl_reduction_norm_lp_power_p_sum) {
            return lhs + ::sycl::pow(::sycl::fabs(rhs), p_);
        }

        return ::sycl::nan(0U);
    }

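    // Per-output epilogue: mean divides by the number of reduced elements,
    // the norm_lp variants fold in eps, and the non-power_p variants
    // additionally take the p-th root.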
    float finalize(float val, int size) const {
        if (alg_ == dnnl_reduction_mean) {
            return val / size;
        } else if (alg_ == dnnl_reduction_norm_lp_max) {
            return ::sycl::rootn(::sycl::max(val, eps_), p_);
        } else if (alg_ == dnnl_reduction_norm_lp_sum) {
            return ::sycl::rootn(val + eps_, p_);
        } else if (alg_ == dnnl_reduction_norm_lp_power_p_max) {
            return ::sycl::max(val, eps_);
        } else if (alg_ == dnnl_reduction_norm_lp_power_p_sum) {
            return val + eps_;
        }

        return val;
    }
};

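// Forward reduction kernel. It is meant to be launched with one work-item
// per destination element; each work-item scans the whole reduction window
// of the source tensor and writes a single output value.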
struct reduction_kernel_fwd_t {
    sycl_simple_reduction_conf_t conf_;
    xpu::sycl::in_memory_arg_t src_;
    xpu::sycl::out_memory_arg_t dst_;
    post_op_input_args po_args_;

    reduction_kernel_fwd_t(const sycl_simple_reduction_conf_t &conf,
            ::sycl::handler &cgh, const exec_ctx_t &ctx)
        : conf_(conf)
        , src_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC))
        , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST))
        , po_args_(cgh, ctx, conf_.post_ops) {}

    void operator()(::sycl::item<1> item) const {
        Reducer reducer(conf_.alg, conf_.p, conf_.eps);

        memory_tensor_t<::sycl::access_mode::read> src(src_, conf_.src_md);
        memory_tensor_t<::sycl::access_mode::write> dst(dst_, conf_.dst_md);
        const int id = item.get_linear_id();

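        // Decode the flat work-item id into logical coordinates of the
        // destination tensor, innermost dimension varying fastest.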
        const auto &dst_md = conf_.dst_md;
        dims_t pos = {}; // zero-init so coordinates past ndims stay 0
        int l_offset = id;
        for (int i = 0; i < dst_md.ndims(); i++) {
            const int d = dst_md.ndims() - 1 - i;
            const dim_t cur_dim = dst_md.dims()[d];
            pos[d] = l_offset % cur_dim;
            l_offset = l_offset / cur_dim;
        }

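        // Walk the reduction window anchored at pos. reduce_dims[d] is
        // assumed to be 1 for dimensions that are not being reduced, so
        // those loops collapse to a single iteration.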
        float acc = reducer.identity();
        for (off_t d0 = 0; d0 < conf_.reduce_dims[0]; d0++)
            for (off_t d1 = 0; d1 < conf_.reduce_dims[1]; d1++)
                for (off_t d2 = 0; d2 < conf_.reduce_dims[2]; d2++)
                    for (off_t d3 = 0; d3 < conf_.reduce_dims[3]; d3++)
                        for (off_t d4 = 0; d4 < conf_.reduce_dims[4]; d4++)
                            for (off_t d5 = 0; d5 < conf_.reduce_dims[5];
                                    d5++) {
                                dims_t src_off = {pos[0] + d0, pos[1] + d1,
                                        pos[2] + d2, pos[3] + d3, pos[4] + d4,
                                        pos[5] + d5};
                                const float val = src.load_md(src_off);
                                acc = reducer.reduce(acc, val);
                            }

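        // Finalize the accumulated value, apply post-ops against the value
        // currently stored in dst, and write the result back.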
        float result = reducer.finalize(acc, conf_.reduce_size);
        result = conf_.post_ops.apply(result, dst.load_md(pos), po_args_, pos);
        dst.store_md(result, pos);
    }
};

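// Illustrative host-side launch, a sketch only: `queue`, `conf`, `ctx`, and
// `dst_nelems` are assumed to be provided by the primitive implementation
// that instantiates this kernel and are not defined in this header.
//
//     queue.submit([&](::sycl::handler &cgh) {
//         reduction_kernel_fwd_t reduction_kernel(conf, cgh, ctx);
//         cgh.parallel_for(::sycl::range<1>(dst_nelems), reduction_kernel);
//     });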
} // namespace sycl
} // namespace generic
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif // GPU_GENERIC_SYCL_SIMPLE_REDUCTION_KERNELS_HPP