Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gpu: sycl: add reorder primitive #1926

Merged
merged 1 commit into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/gpu/gpu_reorder_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
#include "gpu/intel/ocl/cross_engine_reorder.hpp"
#include "gpu/nvidia/cudnn_reorder.hpp"
#include "gpu/sycl/ref_reorder.hpp"
#endif

#if DNNL_GPU_VENDOR == DNNL_VENDOR_AMD
Expand Down Expand Up @@ -55,6 +56,7 @@ constexpr impl_list_item_t impl_list[] = REG_REORDER_P({
GPU_REORDER_INSTANCE_NVIDIA(nvidia::cudnn_reorder_t::pd_t)
GPU_REORDER_INSTANCE_AMD(intel::ocl::cross_engine_reorder_t::pd_t)
GPU_REORDER_INSTANCE_AMD(amd::miopen_reorder_t::pd_t)
GPU_REORDER_INSTANCE_GENERIC_SYCL(sycl::ref_reorder_t::pd_t)
nullptr,
});
// clang-format on
Expand Down
95 changes: 95 additions & 0 deletions src/gpu/sycl/ref_reorder.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/sycl/ref_reorder.hpp"
#include "gpu/sycl/reorder_kernels.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

using namespace impl::sycl;

status_t ref_reorder_t::pd_t::init_conf() {
    // Reset to a known-default state before filling in the fields.
    conf_ = sycl_reorder_conf_t();

    // Capture src/dst memory descriptors as device-copyable md_t objects.
    conf_.src_md = xpu::sycl::md_t(src_md(0));
    conf_.dst_md = xpu::sycl::md_t(dst_md());

    // XXX: should probably be tuned.
    conf_.block_size = 16;
    conf_.wg_size = 32;

    // Total number of elements to move, flattened over all dims.
    conf_.wk_size = memory_desc_wrapper(src_md(0)).nelems();

    // Runtime scaling configuration for both sides of the reorder.
    const auto &src_scales = attr()->scales_.get(DNNL_ARG_SRC_0);
    const auto &dst_scales = attr()->scales_.get(DNNL_ARG_DST);
    conf_.do_scale_src = !src_scales.has_default_values();
    conf_.scale_src_mask = src_scales.mask_;
    conf_.do_scale_dst = !dst_scales.has_default_values();
    conf_.scale_dst_mask = dst_scales.mask_;

    conf_.post_ops = sycl_post_ops_t(attr());

    return status::success;
}

status_t ref_reorder_t::init(engine_t *engine) {
    // Build the runtime kernel object from the statically registered
    // reorder kernel functor; propagate any creation failure directly.
    return create_kernel(
            engine, ::sycl::get_kernel_id<reorder_kernel_t>(), &kernel_);
}

status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const {
    parallel_for(ctx, kernel_, [&](::sycl::handler &cgh) {
        // Bind the kernel's memory arguments.
        auto src_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC_0);
        auto dst_mem_arg = CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST);
        auto src_scale_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(
                DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0);
        auto dst_scale_mem_arg = CTX_IN_SYCL_KERNEL_MEMORY(
                DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST);

        // The scale data type only matters when scaling is enabled;
        // otherwise default it to f32.
        const data_type_t scales_src_dt = pd()->conf_.do_scale_src
                ? ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0)
                          .data_type()
                : data_type_t::dnnl_f32;
        const data_type_t scales_dst_dt = pd()->conf_.do_scale_dst
                ? ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST)
                          .data_type()
                : data_type_t::dnnl_f32;

        reorder_kernel_t reorder_kernel(pd()->conf_, src_mem_arg, dst_mem_arg,
                src_scale_mem_arg, dst_scale_mem_arg, scales_src_dt,
                scales_dst_dt);

        // Launch geometry: each work-group covers wg_size work-items, and
        // each work-item processes block_size elements, so launch enough
        // groups to cover all wk_size elements.
        const int wg_size = pd()->conf_.wg_size;
        const int elems_per_wg = wg_size * pd()->conf_.block_size;
        const int wg_cnt = utils::div_up(pd()->conf_.wk_size, elems_per_wg);

        cgh.parallel_for(
                ::sycl::nd_range<1>(wg_cnt * wg_size, wg_size), reorder_kernel);
    });

    return status::success;
}

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl
117 changes: 117 additions & 0 deletions src/gpu/sycl/ref_reorder.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_SYCL_REF_REORDER_HPP
#define GPU_SYCL_REF_REORDER_HPP

#include "gpu/gpu_reorder_pd.hpp"
#include "gpu/sycl/sycl_gpu_primitive.hpp"
#include "gpu/sycl/sycl_io_helper.hpp"
#include "gpu/sycl/sycl_post_ops.hpp"
#include "gpu/sycl/sycl_primitive_conf.hpp"
#include "gpu/sycl/sycl_q10n.hpp"
#include "sycl/sycl_stream.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

struct ref_reorder_t : public sycl_gpu_primitive_t {
using sycl_gpu_primitive_t::sycl_gpu_primitive_t;

struct pd_t : public gpu_reorder_pd_t {
using gpu_reorder_pd_t::gpu_reorder_pd_t;

DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_reorder_t);

status_t init(
engine_t *engine, engine_t *src_engine, engine_t *dst_engine) {
using namespace data_type;
using sm = primitive_attr_t::skip_mask_t;

const memory_desc_wrapper src_d(src_md());
const memory_desc_wrapper dst_d(dst_md());

const bool ok = check_data_types(src_d, dst_d)
&& check_formats(src_d, dst_d)
&& attr()->has_default_values(
sm::scales_runtime | sm::post_ops)
&& post_ops_ok();
if (!ok) return status::unimplemented;

for (int i = 0; i < dst_d.ndims(); i++) {
if (dst_d.dims()[i] > INT_MAX) { return status::unimplemented; }
}

return init_conf();
}

sycl_reorder_conf_t conf_;

private:
DECLARE_GPU_REORDER_CREATE();

status_t init_conf();

bool post_ops_ok() const {
for (int i = 0; i < attr()->post_ops_.len(); i++) {
if (!attr()->post_ops_.entry_[i].is_sum()) { return false; }
}
return attr()->post_ops_.len() <= sycl_post_ops_t::max_post_ops
&& attr()->post_ops_.has_default_values(
{primitive_kind::sum});
}

static bool check_data_types(const memory_desc_wrapper &src,
const memory_desc_wrapper &dst) {
using namespace data_type;

const auto src_dt = src.data_type();
const auto dst_dt = dst.data_type();

for (auto t : {src_dt, dst_dt}) {
if (!utils::one_of(t, f32, bf16, f16, s8, u8)) return false;
}

return true;
}

static bool check_formats(const memory_desc_wrapper &src,
const memory_desc_wrapper &dst) {
using namespace format_tag;

for (const auto &mdw : {src, dst}) {
if (!mdw.is_plain()) { return false; }
}
return true;
}
};

status_t init(engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
intel::compute::kernel_t kernel_;
};

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
169 changes: 169 additions & 0 deletions src/gpu/sycl/reorder_kernels.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_SYCL_REORDER_KERNELS_HPP
#define GPU_SYCL_REORDER_KERNELS_HPP

#include "gpu/sycl/sycl_io_helper.hpp"
#include "gpu/sycl/sycl_post_ops.hpp"
#include "gpu/sycl/sycl_primitive_conf.hpp"
#include "gpu/sycl/sycl_q10n.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace sycl {

// SYCL functor implementing the reference reorder: copies src to dst with
// optional src/dst scaling (common or per-dim masked) and sum post-ops.
// Each work-item processes `block_size` consecutive source elements.
struct reorder_kernel_t {
    // NOTE(review): vec_len appears unused in this kernel — confirm whether
    // it is dead or reserved for a future vectorized path.
    static constexpr int vec_len = 8;
    static constexpr int max_supported_ndims = 6;

    reorder_kernel_t(const sycl_reorder_conf_t &conf,
            xpu::sycl::in_memory_arg_t &src, xpu::sycl::out_memory_arg_t &dst,
            xpu::sycl::in_memory_arg_t &src_scale,
            xpu::sycl::in_memory_arg_t &dst_scale, data_type_t scales_src_dt,
            data_type_t scales_dst_dt)
        : conf_(conf)
        , src_(src)
        , dst_(dst)
        , src_scale_(src_scale)
        , dst_scale_(dst_scale)
        , scales_src_dt_(scales_src_dt)
        , scales_dst_dt_(scales_dst_dt) {}

    void operator()(::sycl::nd_item<1> item) const {
        auto sg = item.get_sub_group();
        // Linear work-item rank assembled from group, sub-group, and lane.
        // NOTE(review): wg_offset_t uses conf_.wg_size as the per-group
        // stride — confirm it matches the launched local range.
        size_t wg_offset_t = item.get_group(0) * conf_.wg_size;
        size_t sg_offset_t = sg.get_group_id()[0] * sg.get_local_range()[0];
        size_t wi_offset_t = sg.get_local_id();
        size_t offset_t = wg_offset_t + sg_offset_t + wi_offset_t;

        // First flattened element this work-item is responsible for.
        size_t base_idx = offset_t * conf_.block_size;

        // For mask == 0 (common scale) the single scale value is loaded
        // once up front; masked scales are re-loaded per element below.
        float scale_src = conf_.do_scale_src && conf_.scale_src_mask == 0
                ? load_float_value(scales_src_dt_, src_scale_ptr(), 0)
                : 1.f;
        float scale_dst = conf_.do_scale_dst && conf_.scale_dst_mask == 0
                ? load_float_value(scales_dst_dt_, dst_scale_ptr(), 0)
                : 1.f;

        // Pad dims/strides up to max_supported_ndims so the offset math
        // below can loop over a fixed count. INT_MAX as a stride makes
        // `idx / strides[i]` zero for the padded dims.
        dims_t dims, off, strides;
        for (int i = 0; i < max_supported_ndims; i++) {
            dims[i] = (i < src_md().ndims()) ? src_md().dims()[i] : 1;
            strides[i]
                    = (i < src_md().ndims()) ? src_md().strides()[i] : INT_MAX;
        }
        // Per-dim extents of the scale tensors: 1 where the mask bit is
        // unset (broadcast), the full dim where it is set.
        dims_t dims_scales_src;
        if (conf_.scale_src_mask != 0) {
            for (int i = 0; i < max_supported_ndims; i++) {
                dims_scales_src[i]
                        = conf_.scale_src_mask >> i & 1 ? dims[i] : 1;
            }
        }
        dims_t dims_scales_dst;
        if (conf_.scale_dst_mask != 0) {
            for (int i = 0; i < max_supported_ndims; i++) {
                dims_scales_dst[i]
                        = conf_.scale_dst_mask >> i & 1 ? dims[i] : 1;
            }
        }

        for (int i = 0; i < conf_.block_size; i++) {
            int idx = base_idx + i;
            if (idx < conf_.wk_size) {
                // Decompose the flattened src index into logical offsets
                // via the src strides. Assumes a dense plain src layout
                // (enforced by pd_t::check_formats) — TODO confirm for
                // layouts with padding.
                // NOTE(review): the inner `i` shadows the block-loop `i`;
                // legal but easy to misread.
                for (int i = 0; i < max_supported_ndims; i++) {
                    off[i] = idx / strides[i] % dims[i];
                }

                // src is read at its physical (flattened) index; dst is
                // addressed through its own offset function.
                int dst_idx = dst_md().off_v(off);
                auto src = load_float_value(
                        src_md().data_type(), src_ptr(), idx);
                auto dst = load_float_value(
                        dst_md().data_type(), dst_ptr(), dst_idx);

                if (conf_.do_scale_src) {
                    if (conf_.scale_src_mask != 0) {
                        // Row-major index into the (broadcast) scale tensor.
                        int scale_idx = 0;
                        for (int i = 0; i < max_supported_ndims; i++) {
                            if (i < src_md().ndims()) {
                                int off_scales_i = conf_.scale_src_mask >> i & 1
                                        ? off[i]
                                        : 0;
                                scale_idx = scale_idx * dims_scales_src[i]
                                        + off_scales_i;
                            }
                        }
                        scale_src = load_float_value(
                                scales_src_dt_, src_scale_ptr(), scale_idx);
                    }
                    src *= scale_src;
                }

                // Apply post-ops (sum only), then the inverse dst scale.
                auto acc = src;
                acc = conf_.post_ops.apply(acc, dst);
                if (conf_.do_scale_dst) {
                    if (conf_.scale_dst_mask != 0) {
                        int scale_idx = 0;
                        for (int i = 0; i < max_supported_ndims; i++) {
                            if (i < src_md().ndims()) {
                                int off_scales_i = conf_.scale_dst_mask >> i & 1
                                        ? off[i]
                                        : 0;
                                scale_idx = scale_idx * dims_scales_dst[i]
                                        + off_scales_i;
                            }
                        }

                        scale_dst = load_float_value(
                                scales_dst_dt_, dst_scale_ptr(), scale_idx);
                    }
                    acc /= scale_dst;
                }
                store_float_value(
                        dst_md().data_type(), acc, dst_ptr(), dst_idx);
            }
        }
    }

private:
    const xpu::sycl::md_t &src_md() const { return conf_.src_md; }
    const xpu::sycl::md_t &dst_md() const { return conf_.dst_md; }

    void *src_ptr() const { return src_.get_pointer(); }
    void *dst_ptr() const { return dst_.get_pointer(); }
    // NOTE(review): scale pointers are cast to float* but loads go through
    // load_float_value with the actual scale data type — the float* is
    // only used as an opaque address.
    float *src_scale_ptr() const {
        return static_cast<float *>(src_scale_.get_pointer());
    }
    float *dst_scale_ptr() const {
        return static_cast<float *>(dst_scale_.get_pointer());
    }

    sycl_reorder_conf_t conf_;

    xpu::sycl::in_memory_arg_t src_;
    xpu::sycl::out_memory_arg_t dst_;
    xpu::sycl::in_memory_arg_t src_scale_;
    xpu::sycl::in_memory_arg_t dst_scale_;
    data_type_t scales_src_dt_;
    data_type_t scales_dst_dt_;
};

} // namespace sycl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
Loading