generic:sycl: Inner Product FWD #2248

Merged · 2 commits · Dec 19, 2024
8 changes: 8 additions & 0 deletions src/gpu/generic/sycl/README.md
@@ -94,6 +94,14 @@ The implementation supports both forward and backward directions.
* Supported formats: `NCDHW`, `NDHWC`, `NCHW`, `NHWC`, `NCW`, `NWC`, `NC`, `N`
* Supported data types: `f32`, `bf16`, `f16`, `s32`, `s8`, `u8`

## Inner Product

The implementation supports the forward direction only.

* Supported formats: all plain memory formats
* Supported data types: all data type combinations listed in the oneDNN specification
* Supported post-ops: all post-ops listed in the oneDNN specification

## Layer Normalization

The implementation supports both forward and backward directions.
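For context, here is a minimal sketch of reaching the new implementation through the public C++ API (not part of the diff). The engine index, shapes, and `f32` data type are illustrative assumptions:

```cpp
#include "oneapi/dnnl/dnnl.hpp"
using namespace dnnl;

int main() {
    engine eng(engine::kind::gpu, 0); // assumes a SYCL GPU device is present
    stream strm(eng);

    // Plain, row-major 2D layouts; the implementation accepts any plain format.
    memory::desc src_md({32, 1024}, memory::data_type::f32, memory::format_tag::ab);
    memory::desc wei_md({512, 1024}, memory::data_type::f32, memory::format_tag::ab);
    memory::desc dst_md({32, 512}, memory::data_type::f32, memory::format_tag::ab);

    auto pd = inner_product_forward::primitive_desc(
            eng, prop_kind::forward_inference, src_md, wei_md, dst_md);

    memory src(src_md, eng), wei(wei_md, eng), dst(dst_md, eng);
    inner_product_forward(pd).execute(strm,
            {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, wei}, {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}
```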
55 changes: 55 additions & 0 deletions src/gpu/generic/sycl/ref_inner_product.cpp
@@ -0,0 +1,55 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
* Copyright 2024 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/generic/sycl/ref_inner_product.hpp"
#include "common/primitive_desc_iterator.hpp"

namespace dnnl::impl::gpu::generic::sycl {

status_t ref_inner_product_fwd_t::pd_t::init_matmul(impl::engine_t *engine) {
matmul_desc_t matmul_desc;
CHECK(matmul_desc_init(&matmul_desc, &src_md_reshaped, &weights_md_reshaped,
&bias_md_reshaped, arg_md(DNNL_ARG_DST)));
primitive_attr_t matmul_attr(*attr());

primitive_desc_iterator_t it(engine,
reinterpret_cast<op_desc_t *>(&matmul_desc), &matmul_attr, nullptr);
if (!it.is_initialized()) return status::out_of_memory;
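    // Walk the registered matmul implementations and keep the first one that
    // successfully creates a pd for the reshaped descriptors.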
while (++it != it.end()) {
matmul_pd = *it;
if (matmul_pd) { break; }
}
if (!matmul_pd) { return status::invalid_arguments; }
return status::success;
}

status_t ref_inner_product_fwd_t::init(impl::engine_t *engine) {
std::pair<std::shared_ptr<impl::primitive_t>, cache_state_t> p;
CHECK(pd()->matmul_pd->create_primitive_nested(p, engine));
matmul_primitive = p.first;
return status::success;
}

status_t ref_inner_product_fwd_t::execute(const exec_ctx_t &ctx) const {
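    // Forward everything to the nested matmul, handing it the scratchpad
    // space that was booked for it in pd_t::init().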
nested_scratchpad_t nested_scratchpad(
ctx, memory_tracking::names::key_nested, matmul_primitive);
exec_ctx_t copied_ctx(ctx);
copied_ctx.set_scratchpad_grantor(nested_scratchpad.grantor());
return matmul_primitive->execute(copied_ctx);
}

} // namespace dnnl::impl::gpu::generic::sycl
175 changes: 175 additions & 0 deletions src/gpu/generic/sycl/ref_inner_product.hpp
@@ -0,0 +1,175 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
* Copyright 2024 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_GENERIC_SYCL_REF_INNER_PRODUCT_HPP
#define GPU_GENERIC_SYCL_REF_INNER_PRODUCT_HPP

#include "gpu/generic/sycl/ref_matmul.hpp"
#include "gpu/generic/sycl/sycl_gpu_primitive.hpp"
#include "gpu/generic/sycl/sycl_post_ops.hpp"
#include "gpu/generic/sycl/sycl_primitive_conf.hpp"
#include "gpu/generic/sycl/sycl_utils.hpp"
#include "gpu/gpu_inner_product_pd.hpp"
#include "gpu/gpu_primitive.hpp"

namespace dnnl::impl::gpu::generic::sycl {
struct ref_inner_product_fwd_t : public gpu::generic::sycl::primitive_t {
using gpu::generic::sycl::primitive_t::primitive_t;

struct pd_t : public gpu_inner_product_fwd_pd_t {
using gpu_inner_product_fwd_pd_t::gpu_inner_product_fwd_pd_t;
using sm = primitive_attr_t::skip_mask_t;

DECLARE_COMMON_PD_T("dpcpp:ref:any", ref_inner_product_fwd_t);

status_t init(impl::engine_t *engine) {
auto src_dt = arg_md(DNNL_ARG_SRC)->data_type;
auto weights_dt = arg_md(DNNL_ARG_WEIGHTS)->data_type;
auto dst_dt = arg_md(DNNL_ARG_DST)->data_type;
auto bias_dt = with_bias() ? arg_md(DNNL_ARG_BIAS)->data_type
: data_type::undef;

const bool ok = (set_default_params() == status::success)
&& is_fwd()
&& check_if_dtypes_valid(
src_dt, dst_dt, bias_dt, weights_dt)
&& sycl_post_ops_t::post_ops_ok(attr())
&& (attr_.set_default_formats(dst_md()) == status::success)
// Blocked memory formats are not supported
&& memory_desc_wrapper(src_md()).is_plain()
&& memory_desc_wrapper(dst_md()).is_plain()
&& memory_desc_wrapper(weights_md()).is_plain();

if (!ok) { return status::unimplemented; }
CHECK(create_ip_mds());
CHECK(init_matmul(engine));

// book scratchpad for the matmul
auto scratchpad = scratchpad_registry().registrar();
scratchpad.book(memory_tracking::names::key_nested,
matmul_pd->scratchpad_registry());
return status::success;
}

std::shared_ptr<primitive_desc_t> matmul_pd;

private:
bool check_if_dtypes_valid(const data_type_t &src_dt,
const data_type_t &dst_dt, const data_type_t &bias_dt,
const data_type_t &weight_dt) const {
using namespace data_type;
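            // Mirrors the inner product spec: f32, f16, int8 (u8/s8 src with
            // s8 weights), and bf16 configurations are accepted.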
return (utils::one_of(src_dt, f32) && utils::one_of(weight_dt, f32)
&& utils::one_of(dst_dt, f32)
&& utils::one_of(bias_dt, f32, undef))
|| (utils::one_of(src_dt, f16)
&& utils::one_of(weight_dt, f16)
&& utils::one_of(dst_dt, f16, f32, s8, u8)
&& utils::one_of(bias_dt, f16, f32, undef))
|| (utils::one_of(src_dt, u8, s8)
&& utils::one_of(weight_dt, s8)
&& utils::one_of(dst_dt, u8, s8, s32, bf16, f32)
&& utils::one_of(
bias_dt, u8, s8, s32, bf16, f32, undef))
|| (utils::one_of(src_dt, bf16)
&& utils::one_of(weight_dt, bf16)
&& utils::one_of(dst_dt, f32, bf16)
&& utils::one_of(bias_dt, f32, bf16, undef));
}

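        // Returns dimension indices sorted by increasing stride, i.e. the
        // axis order of a plain layout from innermost to outermost.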
std::vector<int> get_dim_order(int ndims, const dims_t strides) {
std::vector<int> order(ndims);
for (int i = 0; i < ndims; ++i) {
order[i] = i;
}

std::sort(
order.begin(), order.end(), [&strides](size_t i, size_t j) {
return strides[i] < strides[j];
});

return order;
}

status_t create_ip_mds() {
auto accumulate_dimensions = [](const dims_t dimensions, int start,
int end) -> int64_t {
int64_t accum = 1;
for (int i = start; i < end; i++) {
accum *= dimensions[i];
}
return accum;
};

const auto src_md_ = arg_md(DNNL_ARG_SRC);
const auto weights_md_ = arg_md(DNNL_ARG_WEIGHTS);
const auto bias_md_ = arg_md(DNNL_ARG_BIAS);
auto src_wrap = memory_desc_wrapper(src_md_);
auto w_wrap = memory_desc_wrapper(weights_md_);

// src and weights dims need to be in the same order
if (get_dim_order(src_wrap.ndims(), src_wrap.strides())
!= get_dim_order(w_wrap.ndims(), w_wrap.strides())) {
return status::unimplemented;
}

            // Reshape src from (MB, d1, ..., dk) to 2D (MB, d1 * ... * dk)
if (src_md_->ndims == 2) {
src_md_reshaped = *src_md_;
} else {
int64_t src_flattened_dimension = accumulate_dimensions(
src_md_->dims, 1, src_md_->ndims);
dims_t src_reshaped_dims {
src_md_->dims[0], src_flattened_dimension};
CHECK(memory_desc_init_by_tag(src_md_reshaped, 2,
src_reshaped_dims, src_md_->data_type, format_tag::ab));
}

            // Reshape weights from (OC, d1, ..., dk) to the transposed 2D
            // shape (d1 * ... * dk, OC); the `ba` tag below expresses the
            // transpose through strides, without moving data.
            int64_t weights_flattened_dimensions = accumulate_dimensions(
                    weights_md_->dims, 1, weights_md_->ndims);
dims_t weights_reshaped_dims {
weights_flattened_dimensions, weights_md_->dims[0]};
CHECK(memory_desc_init_by_tag(weights_md_reshaped, 2,
weights_reshaped_dims, weights_md_->data_type,
format_tag::ba));
if (with_bias()) {
dims_t bias_reshaped_dims {1, bias_md_->dims[0]};
CHECK(memory_desc_init_by_tag(bias_md_reshaped, 2,
bias_reshaped_dims, bias_md_->data_type,
format_tag::ab));
}
return status::success;
}

status_t init_matmul(impl::engine_t *engine);
// Memory descriptors to contain reshaped tensors from nD to 2D for IP
memory_desc_t src_md_reshaped;
memory_desc_t weights_md_reshaped;
memory_desc_t bias_md_reshaped;
};

status_t init(impl::engine_t *engine) override;
status_t execute(const exec_ctx_t &ctx) const override;

private:
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
kernel_t kernel_;
std::shared_ptr<impl::primitive_t> matmul_primitive;
};
} // namespace dnnl::impl::gpu::generic::sycl

#endif
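The descriptor reshaping in `create_ip_mds` is plain shape algebra: src `(MB, d1, ..., dk)` collapses to `MB x (d1 * ... * dk)`, weights `(OC, d1, ..., dk)` become the transposed `(d1 * ... * dk) x OC`, and the matmul then yields `MB x OC`. A self-contained sketch of that computation; the helper names are illustrative, not oneDNN API:

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

struct Shape2D {
    std::int64_t rows, cols;
};

// Product of every dim past the first: d1 * ... * dk.
static std::int64_t flatten_tail(const std::vector<std::int64_t> &dims) {
    return std::accumulate(dims.begin() + 1, dims.end(), std::int64_t {1},
            std::multiplies<std::int64_t>());
}

// src (MB, d1, ..., dk) -> (MB, d1 * ... * dk), row-major ("ab").
static Shape2D reshape_src(const std::vector<std::int64_t> &dims) {
    return {dims[0], flatten_tail(dims)};
}

// weights (OC, d1, ..., dk) -> (d1 * ... * dk, OC); "ba" expresses the
// transpose through strides, so no data actually moves.
static Shape2D reshape_weights(const std::vector<std::int64_t> &dims) {
    return {flatten_tail(dims), dims[0]};
}

int main() {
    Shape2D a = reshape_src({32, 3, 28, 28}); // (32, 2352)
    Shape2D b = reshape_weights({512, 3, 28, 28}); // (2352, 512)
    // matmul: (32 x 2352) * (2352 x 512) -> dst (32 x 512)
    std::printf("(%lld x %lld) * (%lld x %lld)\n", (long long)a.rows,
            (long long)a.cols, (long long)b.rows, (long long)b.cols);
    return 0;
}
```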
5 changes: 5 additions & 0 deletions src/gpu/gpu_inner_product_list.cpp
@@ -32,6 +32,10 @@
#include "gpu/amd/miopen_gemm_inner_product.hpp"
#endif

#ifdef GENERIC_SYCL_KERNELS_ENABLED
#include "gpu/generic/sycl/ref_inner_product.hpp"
#endif

namespace dnnl {
namespace impl {
namespace gpu {
@@ -49,6 +53,7 @@ const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
GPU_INSTANCE_NVIDIA(nvidia::cudnn_gemm_inner_product_fwd_t)
GPU_INSTANCE_NVIDIA(nvidia::cudnn_conv_inner_product_fwd_t)
GPU_INSTANCE_AMD(amd::miopen_gemm_inner_product_fwd_t)
GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_inner_product_fwd_t)
nullptr,
}},
{{backward}, REG_BWD_PK({
8 changes: 4 additions & 4 deletions src/gpu/nvidia/cudnn_matmul_executor.hpp
@@ -392,12 +392,12 @@ struct cudnn_matmul_lt_exec_t final : public cudnn_matmul_lt_base_exec_t {
memory_tracking::names::key_matmul_dst_in_acc_dt)
: xpu::sycl::interop_memory_arg_t<
::sycl::access::mode::read_write>();
auto arg_block_a_scratch = params->source_size_ != 0
auto arg_block_a_scratch = params->weight_size_ != 0
? CTX_SCRATCH_SYCL_MEMORY(
memory_tracking::names::key_gemm_blocked_a)
: xpu::sycl::interop_memory_arg_t<
::sycl::access::mode::read_write>();
auto arg_block_b_scratch = params->weight_size_ != 0
auto arg_block_b_scratch = params->source_size_ != 0
? CTX_SCRATCH_SYCL_MEMORY(
memory_tracking::names::key_gemm_blocked_b)
: xpu::sycl::interop_memory_arg_t<
@@ -457,10 +457,10 @@ struct cudnn_matmul_lt_runtime_args_exec_t final
matmul_params->reorder_scratch_size_, cuda_stream->queue());

uint8_t *block_a_scratch_ptr
= alloc_ptr(matmul_params->source_size_, cuda_stream->queue());
= alloc_ptr(matmul_params->weight_size_, cuda_stream->queue());

uint8_t *block_b_scratch_ptr
= alloc_ptr(matmul_params->weight_size_, cuda_stream->queue());
= alloc_ptr(matmul_params->source_size_, cuda_stream->queue());

uint8_t *block_c_scratch_ptr
= alloc_ptr(matmul_params->dest_size_, cuda_stream->queue());
2 changes: 1 addition & 1 deletion src/gpu/nvidia/cudnn_matmul_lt_impl.hpp
@@ -717,7 +717,7 @@ struct cudnn_matmul_lt_impl_t {
}
if (!params->w_blocked_) {
transform_matrix(lt_handle, params, a_layout, a,
blocked_a_layout, block_a_scratch, !params->trans_a_,
blocked_a_layout, block_a_scratch, params->trans_a_,
streamId);
a = block_a_scratch;
}
38 changes: 31 additions & 7 deletions tests/gtests/test_inner_product_forward.cpp
@@ -88,16 +88,18 @@ class inner_product_test_t
protected:
void SetUp() override {
auto p = ::testing::TestWithParam<inprod_test_params_t>::GetParam();
SKIP_IF_CUDA(!cuda_check_format_tags(p.src_format, p.weights_format,
p.bias_format, p.dst_format),
SKIP_IF_CUDA(!cuda_generic_check_format_tags(p.src_format,
p.weights_format, p.bias_format, p.dst_format),
"Unsupported format tag");
SKIP_IF_GENERIC(!cuda_generic_check_format_tags(p.src_format,
p.weights_format, p.bias_format, p.dst_format),
"Unsupported format tag");
SKIP_IF_CUDA(p.ndims > 5, "Unsupported number of dimensions");
SKIP_IF_GENERIC(true, "Primitive not implemented");
catch_expected_failures(
[&]() { Test(); }, p.expect_to_fail, p.expected_status);
}

bool cuda_check_format_tags(memory::format_tag src_format,
bool cuda_generic_check_format_tags(memory::format_tag src_format,
memory::format_tag wei_format, memory::format_tag bia_format,
memory::format_tag dst_format) {
bool src_ok = src_format == memory::format_tag::ncdhw
@@ -130,6 +132,20 @@ class inner_product_test_t
return src_ok && wei_ok && bia_ok && dst_ok;
}

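    // Same stride-order helper as the SYCL generic pd: used below to skip
    // cases where src and weights use different plain layout orders.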
std::vector<int> get_dim_order(const memory::dims &strides) {
size_t ndims = strides.size();
std::vector<int> order(ndims);
for (size_t i = 0; i < ndims; ++i) {
order[i] = i;
}

std::sort(order.begin(), order.end(), [&strides](size_t i, size_t j) {
return strides[i] < strides[j];
});

return order;
}

void Test() {
auto p = ::testing::TestWithParam<inprod_test_params_t>::GetParam();
test_inner_product_descr_t ipd = p.test_ipd;
@@ -169,18 +185,26 @@ class inner_product_test_t
: create_md({}, data_type, p.bias_format);
auto ip_dst_desc = create_md({ipd.mb, ipd.oc}, data_type, p.dst_format);

SKIP_IF_GENERIC(get_dim_order(ip_src_desc.get_strides())
!= get_dim_order(ip_weights_desc.get_strides()),
"Unsupported case for generic");

auto ip_primitive_desc = with_bias
? pd_t(eng, p.aprop_kind, ip_src_desc, ip_weights_desc,
ip_bias_desc, ip_dst_desc)
: pd_t(eng, p.aprop_kind, ip_src_desc, ip_weights_desc,
ip_dst_desc);

auto aa = allows_attr_t {false};
aa.po_binary = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
aa.po_eltwise = true;
aa.po_prelu = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
aa.po_sum = true;

#ifdef DNNL_SYCL_GENERIC
aa.po_binary = true;
aa.po_prelu = true;
#else
aa.po_binary = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
aa.po_prelu = !is_nvidia_gpu(eng) && !is_amd_gpu(eng);
#endif
test_fwd_pd_constructors<pd_t>(ip_primitive_desc, aa, p.aprop_kind,
ip_src_desc, ip_weights_desc, ip_bias_desc, ip_dst_desc);
