diff --git a/src/gpu/gpu_reorder_list.cpp b/src/gpu/gpu_reorder_list.cpp
index 0d868d40035..cc704d17922 100644
--- a/src/gpu/gpu_reorder_list.cpp
+++ b/src/gpu/gpu_reorder_list.cpp
@@ -28,6 +28,7 @@
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
 #include "gpu/intel/ocl/cross_engine_reorder.hpp"
 #include "gpu/nvidia/cudnn_reorder.hpp"
+#include "gpu/sycl/ref_reorder.hpp"
 #endif
 
 #if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
@@ -55,6 +56,7 @@ constexpr impl_list_item_t impl_list[] = REG_REORDER_P({
         GPU_REORDER_INSTANCE_NVIDIA(nvidia::cudnn_reorder_t::pd_t)
         GPU_REORDER_INSTANCE_AMD(intel::ocl::cross_engine_reorder_t::pd_t)
         GPU_REORDER_INSTANCE_AMD(amd::miopen_reorder_t::pd_t)
+        GPU_REORDER_INSTANCE_GENERIC_SYCL(sycl::ref_reorder_t::pd_t)
         nullptr,
 });
 // clang-format on
diff --git a/src/gpu/sycl/ref_reorder.cpp b/src/gpu/sycl/ref_reorder.cpp
new file mode 100644
index 00000000000..955c780749e
--- /dev/null
+++ b/src/gpu/sycl/ref_reorder.cpp
@@ -0,0 +1,95 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "gpu/sycl/ref_reorder.hpp"
+#include "gpu/sycl/reorder_kernels.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace sycl {
+
+using namespace impl::sycl;
+
+status_t ref_reorder_t::pd_t::init_conf() {
+    conf_ = sycl_reorder_conf_t();
+
+    conf_.src_md = xpu::sycl::md_t(src_md(0));
+    conf_.dst_md = xpu::sycl::md_t(dst_md());
+
+    // XXX: should probably be tuned.
+    conf_.block_size = 16;
+    conf_.wg_size = 32;
+
+    conf_.wk_size = memory_desc_wrapper(src_md(0)).nelems();
+
+    conf_.do_scale_src
+            = !attr()->scales_.get(DNNL_ARG_SRC_0).has_default_values();
+    conf_.scale_src_mask = attr()->scales_.get(DNNL_ARG_SRC_0).mask_;
+    conf_.do_scale_dst
+            = !attr()->scales_.get(DNNL_ARG_DST).has_default_values();
+    conf_.scale_dst_mask = attr()->scales_.get(DNNL_ARG_DST).mask_;
+    conf_.post_ops = sycl_post_ops_t(attr());
+
+    return status::success;
+}
+
+status_t ref_reorder_t::init(engine_t *engine) {
+    const auto kid = ::sycl::get_kernel_id<reorder_kernel_t>();
+    CHECK(create_kernel(engine, kid, &kernel_));
+    return status::success;
+}
+
+status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const {
+    parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
+        auto src_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_0);
+        auto src_scale_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(
+                DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0);
+        auto dst_mem_arg = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST);
+        auto dst_scale_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(
+                DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST);
+
+        auto scales_src_dt = (pd()->conf_.do_scale_src)
+                ? ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0)
+                          .data_type()
+                : data_type_t::dnnl_f32;
+        auto scales_dst_dt = (pd()->conf_.do_scale_dst)
+                ? ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST)
+                          .data_type()
+                : data_type_t::dnnl_f32;
+
+        reorder_kernel_t reorder_kernel(pd()->conf_, src_mem_arg, dst_mem_arg,
+                src_scale_mem_arg, dst_scale_mem_arg, scales_src_dt,
+                scales_dst_dt);
+
+        const int block_size = pd()->conf_.block_size;
+        const int wg_size = pd()->conf_.wg_size;
+
+        const int t_work = pd()->conf_.wk_size;
+        const int wg_work = wg_size * block_size;
+        const int wg_cnt = utils::div_up(t_work, wg_work);
+
+        cgh.parallel_for(
+                ::sycl::nd_range<1>(wg_cnt * wg_size, wg_size), reorder_kernel);
+    });
+
+    return status::success;
+}
+
+} // namespace sycl
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
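
A note on the launch geometry in execute(): each work-item covers block_size consecutive elements, one work-group covers wg_size * block_size elements, and div_up rounds the group count up so every element is reached; the kernel's idx < conf_.wk_size guard masks the excess slots. A minimal standalone sketch of that arithmetic, with illustrative numbers (not part of the patch):

    #include <cstdio>

    // Mirrors the work partitioning in ref_reorder_t::execute() for one
    // hypothetical shape; the constants match the defaults in init_conf().
    int main() {
        const int block_size = 16; // elements per work-item
        const int wg_size = 32; // work-items per work-group
        const int wk_size = 1000; // total elements to reorder (example value)

        const int wg_work = wg_size * block_size; // 512 elements per group
        const int wg_cnt = (wk_size + wg_work - 1) / wg_work; // div_up -> 2

        // 2 groups x 32 work-items x 16 elements = 1024 slots; the kernel's
        // bounds check skips the 24 slots past wk_size.
        std::printf("wg_cnt = %d, global range = %d\n", wg_cnt, wg_cnt * wg_size);
        return 0;
    }
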
diff --git a/src/gpu/sycl/ref_reorder.hpp b/src/gpu/sycl/ref_reorder.hpp
new file mode 100644
index 00000000000..216c8c485a0
--- /dev/null
+++ b/src/gpu/sycl/ref_reorder.hpp
@@ -0,0 +1,117 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_SYCL_REF_REORDER_HPP
+#define GPU_SYCL_REF_REORDER_HPP
+
+#include "gpu/gpu_reorder_pd.hpp"
+#include "gpu/sycl/sycl_gpu_primitive.hpp"
+#include "gpu/sycl/sycl_io_helper.hpp"
+#include "gpu/sycl/sycl_post_ops.hpp"
+#include "gpu/sycl/sycl_primitive_conf.hpp"
+#include "gpu/sycl/sycl_q10n.hpp"
+#include "sycl/sycl_stream.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace sycl {
+
+struct ref_reorder_t : public sycl_gpu_primitive_t {
+    using sycl_gpu_primitive_t::sycl_gpu_primitive_t;
+
+    struct pd_t : public gpu_reorder_pd_t {
+        using gpu_reorder_pd_t::gpu_reorder_pd_t;
+
+        DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_reorder_t);
+
+        status_t init(
+                engine_t *engine, engine_t *src_engine, engine_t *dst_engine) {
+            using namespace data_type;
+            using sm = primitive_attr_t::skip_mask_t;
+
+            const memory_desc_wrapper src_d(src_md());
+            const memory_desc_wrapper dst_d(dst_md());
+
+            const bool ok = check_data_types(src_d, dst_d)
+                    && check_formats(src_d, dst_d)
+                    && attr()->has_default_values(
+                            sm::scales_runtime | sm::post_ops)
+                    && post_ops_ok();
+            if (!ok) return status::unimplemented;
+
+            for (int i = 0; i < dst_d.ndims(); i++) {
+                if (dst_d.dims()[i] > INT_MAX) { return status::unimplemented; }
+            }
+
+            return init_conf();
+        }
+
+        sycl_reorder_conf_t conf_;
+
+    private:
+        DECLARE_GPU_REORDER_CREATE();
+
+        status_t init_conf();
+
+        bool post_ops_ok() const {
+            for (int i = 0; i < attr()->post_ops_.len(); i++) {
+                if (!attr()->post_ops_.entry_[i].is_sum()) { return false; }
+            }
+            return attr()->post_ops_.len() <= sycl_post_ops_t::max_post_ops
+                    && attr()->post_ops_.has_default_values(
+                            {primitive_kind::sum});
+        }
+
+        static bool check_data_types(const memory_desc_wrapper &src,
+                const memory_desc_wrapper &dst) {
+            using namespace data_type;
+
+            const auto src_dt = src.data_type();
+            const auto dst_dt = dst.data_type();
+
+            for (auto t : {src_dt, dst_dt}) {
+                if (!utils::one_of(t, f32, bf16, f16, s8, u8)) return false;
+            }
+
+            return true;
+        }
+
+        static bool check_formats(const memory_desc_wrapper &src,
+                const memory_desc_wrapper &dst) {
+            using namespace format_tag;
+
+            for (const auto &mdw : {src, dst}) {
+                if (!mdw.is_plain()) { return false; }
+            }
+            return true;
+        }
+    };
+
+    status_t init(engine_t *engine) override;
+    status_t execute(const exec_ctx_t &ctx) const override;
+
+private:
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    intel::compute::kernel_t kernel_;
+};
+
+} // namespace sycl
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
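
For context, a hedged sketch of how this pd would be reached through the public oneDNN API: a same-engine GPU reorder with a common (mask == 0) source scale, which satisfies the scales_runtime skip mask checked in pd_t::init(). The engine, stream, and memory objects are assumptions, set up elsewhere:

    #include <unordered_map>

    #include "oneapi/dnnl/dnnl.hpp"

    // Illustrative only: names and setup are assumptions, not part of the patch.
    void scaled_reorder(dnnl::engine &eng, dnnl::stream &strm, dnnl::memory &src,
            dnnl::memory &dst, dnnl::memory &src_scale /* f32, 1 element */) {
        dnnl::primitive_attr attr;
        attr.set_scales_mask(DNNL_ARG_SRC, /* mask = */ 0); // one common scale

        // Same engine on both sides, so dispatch goes through the GPU reorder
        // implementation list patched above.
        dnnl::reorder::primitive_desc pd(
                eng, src.get_desc(), eng, dst.get_desc(), attr);
        dnnl::reorder(pd).execute(strm,
                {{DNNL_ARG_FROM, src}, {DNNL_ARG_TO, dst},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scale}});
        strm.wait();
    }
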
diff --git a/src/gpu/sycl/reorder_kernels.hpp b/src/gpu/sycl/reorder_kernels.hpp
new file mode 100644
index 00000000000..e3ed74baaa5
--- /dev/null
+++ b/src/gpu/sycl/reorder_kernels.hpp
@@ -0,0 +1,169 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef GPU_SYCL_REORDER_KERNELS_HPP
+#define GPU_SYCL_REORDER_KERNELS_HPP
+
+#include "gpu/sycl/sycl_io_helper.hpp"
+#include "gpu/sycl/sycl_post_ops.hpp"
+#include "gpu/sycl/sycl_primitive_conf.hpp"
+#include "gpu/sycl/sycl_q10n.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace gpu {
+namespace sycl {
+
+struct reorder_kernel_t {
+    static constexpr int vec_len = 8;
+    static constexpr int max_supported_ndims = 6;
+
+    reorder_kernel_t(const sycl_reorder_conf_t &conf,
+            xpu::sycl::in_memory_arg_t &src, xpu::sycl::out_memory_arg_t &dst,
+            xpu::sycl::in_memory_arg_t &src_scale,
+            xpu::sycl::in_memory_arg_t &dst_scale, data_type_t scales_src_dt,
+            data_type_t scales_dst_dt)
+        : conf_(conf)
+        , src_(src)
+        , dst_(dst)
+        , src_scale_(src_scale)
+        , dst_scale_(dst_scale)
+        , scales_src_dt_(scales_src_dt)
+        , scales_dst_dt_(scales_dst_dt) {}
+
+    void operator()(::sycl::nd_item<1> item) const {
+        auto sg = item.get_sub_group();
+        size_t wg_offset_t = item.get_group(0) * conf_.wg_size;
+        size_t sg_offset_t = sg.get_group_id()[0] * sg.get_local_range()[0];
+        size_t wi_offset_t = sg.get_local_id();
+        size_t offset_t = wg_offset_t + sg_offset_t + wi_offset_t;
+
+        size_t base_idx = offset_t * conf_.block_size;
+
+        float scale_src = conf_.do_scale_src && conf_.scale_src_mask == 0
+                ? load_float_value(scales_src_dt_, src_scale_ptr(), 0)
+                : 1.f;
+        float scale_dst = conf_.do_scale_dst && conf_.scale_dst_mask == 0
+                ? load_float_value(scales_dst_dt_, dst_scale_ptr(), 0)
+                : 1.f;
+
+        dims_t dims, off, strides;
+        for (int i = 0; i < max_supported_ndims; i++) {
+            dims[i] = (i < src_md().ndims()) ? src_md().dims()[i] : 1;
+            strides[i]
+                    = (i < src_md().ndims()) ? src_md().strides()[i] : INT_MAX;
+        }
+        dims_t dims_scales_src;
+        if (conf_.scale_src_mask != 0) {
+            for (int i = 0; i < max_supported_ndims; i++) {
+                dims_scales_src[i]
+                        = conf_.scale_src_mask >> i & 1 ? dims[i] : 1;
+            }
+        }
+        dims_t dims_scales_dst;
+        if (conf_.scale_dst_mask != 0) {
+            for (int i = 0; i < max_supported_ndims; i++) {
+                dims_scales_dst[i]
+                        = conf_.scale_dst_mask >> i & 1 ? dims[i] : 1;
+            }
+        }
+
+        for (int i = 0; i < conf_.block_size; i++) {
+            int idx = base_idx + i;
+            if (idx < conf_.wk_size) {
+                for (int i = 0; i < max_supported_ndims; i++) {
+                    off[i] = idx / strides[i] % dims[i];
+                }
+
+                int dst_idx = dst_md().off_v(off);
+                auto src = load_float_value(
+                        src_md().data_type(), src_ptr(), idx);
+                auto dst = load_float_value(
+                        dst_md().data_type(), dst_ptr(), dst_idx);
+
+                if (conf_.do_scale_src) {
+                    if (conf_.scale_src_mask != 0) {
+                        int scale_idx = 0;
+                        for (int i = 0; i < max_supported_ndims; i++) {
+                            if (i < src_md().ndims()) {
+                                int off_scales_i = conf_.scale_src_mask >> i & 1
+                                        ? off[i]
+                                        : 0;
+                                scale_idx = scale_idx * dims_scales_src[i]
+                                        + off_scales_i;
+                            }
+                        }
+                        scale_src = load_float_value(
+                                scales_src_dt_, src_scale_ptr(), scale_idx);
+                    }
+                    src *= scale_src;
+                }
+
+                auto acc = src;
+                acc = conf_.post_ops.apply(acc, dst);
+                if (conf_.do_scale_dst) {
+                    if (conf_.scale_dst_mask != 0) {
+                        int scale_idx = 0;
+                        for (int i = 0; i < max_supported_ndims; i++) {
+                            if (i < src_md().ndims()) {
+                                int off_scales_i = conf_.scale_dst_mask >> i & 1
+                                        ? off[i]
+                                        : 0;
+                                scale_idx = scale_idx * dims_scales_dst[i]
+                                        + off_scales_i;
+                            }
+                        }
+
+                        scale_dst = load_float_value(
+                                scales_dst_dt_, dst_scale_ptr(), scale_idx);
+                    }
+                    acc /= scale_dst;
+                }
+                store_float_value(
+                        dst_md().data_type(), acc, dst_ptr(), dst_idx);
+            }
+        }
+    }
+
+private:
+    const xpu::sycl::md_t &src_md() const { return conf_.src_md; }
+    const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; }
+
+    void *src_ptr() const { return src_.get_pointer(); }
+    void *dst_ptr() const { return dst_.get_pointer(); }
+    float *src_scale_ptr() const {
+        return static_cast<float *>(src_scale_.get_pointer());
+    }
+    float *dst_scale_ptr() const {
+        return static_cast<float *>(dst_scale_.get_pointer());
+    }
+
+    sycl_reorder_conf_t conf_;
+
+    xpu::sycl::in_memory_arg_t src_;
+    xpu::sycl::out_memory_arg_t dst_;
+    xpu::sycl::in_memory_arg_t src_scale_;
+    xpu::sycl::in_memory_arg_t dst_scale_;
+    data_type_t scales_src_dt_;
+    data_type_t scales_dst_dt_;
+};
+
+} // namespace sycl
+} // namespace gpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
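
The off[i] = idx / strides[i] % dims[i] step above recovers logical coordinates from a linear index over src's memory, which off_v() then maps into the destination layout; it relies on the plain, dense formats enforced by check_formats(). A standalone illustration for a row-major 2x3x4 tensor (values are examples, not part of the patch):

    #include <cstdio>

    // Coordinate recovery as in reorder_kernel_t::operator(), demonstrated
    // on a dense 2x3x4 tensor with row-major strides {12, 4, 1}.
    int main() {
        const int ndims = 3;
        const int dims[] = {2, 3, 4};
        const int strides[] = {12, 4, 1};

        const int idx = 17; // linear index into src memory
        int off[3];
        for (int i = 0; i < ndims; i++)
            off[i] = idx / strides[i] % dims[i];

        // 17 = 1*12 + 1*4 + 1*1, so the recovered coordinates are (1, 1, 1).
        std::printf("off = (%d, %d, %d)\n", off[0], off[1], off[2]);
        return 0;
    }
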
diff --git a/src/gpu/sycl/sycl_primitive_conf.hpp b/src/gpu/sycl/sycl_primitive_conf.hpp
index d809134905c..7a1e148989f 100644
--- a/src/gpu/sycl/sycl_primitive_conf.hpp
+++ b/src/gpu/sycl/sycl_primitive_conf.hpp
@@ -127,6 +127,25 @@ struct sycl_shuffle_conf_t {
     dim_t work_amount;
 };
 
+struct sycl_reorder_conf_t {
+    xpu::sycl::md_t src_md;
+    xpu::sycl::md_t dst_md;
+    xpu::sycl::md_t scales;
+
+    bool do_scale_src;
+    int scale_src_mask;
+    bool do_scale_dst;
+    int scale_dst_mask;
+
+    int ndims;
+
+    int block_size;
+    int wg_size;
+    int wk_size;
+
+    sycl_post_ops_t post_ops;
+};
+
 struct sycl_resampling_conf_t {
     dim_t MB;
     dim_t C;
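
On the mask fields: bit i of scale_src_mask / scale_dst_mask set means the scale varies along dimension i, and the kernel flattens the mask-selected coordinates row-major to index the scales buffer. A small worked example of that indexing (illustrative, not part of the patch):

    #include <cstdio>

    // Scale indexing as in reorder_kernel_t: mask 1 << 1 on a 2x3 tensor
    // means one scale per element of dim 1 (three scales in total).
    int main() {
        const int ndims = 2;
        const int dims[] = {2, 3};
        const int mask = 1 << 1;

        int dims_scales[2];
        for (int i = 0; i < ndims; i++)
            dims_scales[i] = (mask >> i & 1) ? dims[i] : 1; // {1, 3}

        const int off[] = {1, 2}; // logical coordinates of one element
        int scale_idx = 0;
        for (int i = 0; i < ndims; i++) {
            const int off_i = (mask >> i & 1) ? off[i] : 0; // broadcast dim 0
            scale_idx = scale_idx * dims_scales[i] + off_i;
        }
        std::printf("scale_idx = %d\n", scale_idx); // prints 2
        return 0;
    }
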