From 804f76aceb47b1e66b9eeefadb79bbd129c81e1d Mon Sep 17 00:00:00 2001
From: John Osorio
Date: Mon, 26 Aug 2024 08:49:41 +0100
Subject: [PATCH] generic: sycl: resampling: Avoid using the cuDNN resampling
 in favor of the SYCL impl

---
 src/gpu/gpu_resampling_list.cpp          |  10 +-
 src/gpu/nvidia/README.md                 |  37 ----
 src/gpu/nvidia/cudnn_resampling.cpp      |  95 --------
 src/gpu/nvidia/cudnn_resampling.hpp      | 270 -----------------------
 src/gpu/nvidia/cudnn_resampling_impl.hpp | 171 --------------
 5 files changed, 2 insertions(+), 581 deletions(-)
 delete mode 100644 src/gpu/nvidia/cudnn_resampling.cpp
 delete mode 100644 src/gpu/nvidia/cudnn_resampling.hpp
 delete mode 100644 src/gpu/nvidia/cudnn_resampling_impl.hpp

diff --git a/src/gpu/gpu_resampling_list.cpp b/src/gpu/gpu_resampling_list.cpp
index 0e6e6d1d64f..60766f2e831 100644
--- a/src/gpu/gpu_resampling_list.cpp
+++ b/src/gpu/gpu_resampling_list.cpp
@@ -21,10 +21,6 @@
 #include "gpu/intel/ocl/vectorized_resampling.hpp"
 #endif

-#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/nvidia/cudnn_resampling.hpp"
-#endif
-
 #ifdef GENERIC_SYCL_KERNELS_ENABLED
 #include "gpu/generic/sycl/ref_resampling.hpp"
 #endif
@@ -40,15 +36,13 @@ using namespace dnnl::impl::prop_kind;
 const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         impl_list_map REG_RESAMPLING_P({
         {{forward}, {
-            GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_fwd_t)
-            GPU_INSTANCE_NVIDIA(nvidia::cudnn_resampling_fwd_t)
+            GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_fwd_t)
             GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_resampling_fwd_t)
             nullptr,
         }},
         {{backward}, REG_BWD_PK({
             GPU_INSTANCE_INTEL(intel::ocl::vectorized_resampling_bwd_t)
-            GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_bwd_t)
-            GPU_INSTANCE_NVIDIA(nvidia::cudnn_resampling_bwd_t)
+            GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_bwd_t)
             GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_resampling_bwd_t)
             nullptr,
         })},
diff --git a/src/gpu/nvidia/README.md b/src/gpu/nvidia/README.md
index 597adfaef37..0e7f8b8aa42 100644
--- a/src/gpu/nvidia/README.md
+++ b/src/gpu/nvidia/README.md
@@ -350,43 +350,6 @@ GPU:
 * Forward pass supports `f32`, `f16`, `bf16` and `s8` data types.
 * Backward pass supports `f32` and `bf16` data types.

-### Resampling
-
-#### Using cuDNN
-
-The `cudnnSpatialTfSamplerForward` and `cudnnSpatialTfSamplerBackward`
-functions are used to implement the resampling primitive.
-
-Nvidia's spatial sampling is based on the
-[Spatial Transformer Networks](https://papers.nips.cc/paper/5854-spatial-transformer-networks.pdf)
-paper, where all data locations are normalized to `-1 <= (xi, yi) <= 1`.
-
-* The cuDNN backend requires a grid of coordinates that can be sampled
-  up/down based on `theta`. The grid is generated by
-  `cudnnSpatialTfGridGeneratorForward`.
-* `theta` is an `MB * 2 * 3` matrix of scaling factors for each coordinate
-  and is used to generate the grid.
-* The grid values must be normalized to the range [-1, 1]. cuDNN maps
-  out-of-bounds coordinates to zero, so they must be clamped to the edges
-  manually to avoid incorrect results.
-* 3D spatial sampling is not supported in cuDNN.
-* The `nearest neighbour` algorithm is not supported in cuDNN.
-* Since the cuDNN computation differs from oneDNN's, the error threshold is
-  smaller than for other oneDNN implementations, so reduced testing accuracy
-  is required for the `fp32` and `fp16` data types.
-* The backward pass requires an output parameter for `d_grid`, which cannot
-  be `nullptr`.
-  However, since the grid coordinates are not a tunable parameter in oneDNN,
-  a dummy memory for `d_grid` is created and deleted when the destructor of
-  the primitive is called.
-
-##### Forward Direction
-* Supported data types: `f32`, `bf16`, `f16`, `s8`, `u8` and `s32`
-* Supported post-ops: `sum`, `eltwise`, `binary`
-* Supported algorithms: nearest neighbor, bilinear
-
-##### Backward Direction
-* Supported data types: `f32`, `bf16`
-* Supported algorithms: nearest neighbor, bilinear
-
 ### Softmax/LogSoftmax

 #### Using cuDNN
diff --git a/src/gpu/nvidia/cudnn_resampling.cpp b/src/gpu/nvidia/cudnn_resampling.cpp
deleted file mode 100644
index 3fb7cb6288e..00000000000
--- a/src/gpu/nvidia/cudnn_resampling.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
-* Copyright 2020-2022 Codeplay Software Limited
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include "gpu/nvidia/cudnn_resampling.hpp"
-#include "gpu/nvidia/stream.hpp"
-#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
-#include "xpu/sycl/buffer_memory_storage.hpp"
-#include "xpu/sycl/memory_storage_helper.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace gpu {
-namespace nvidia {
-
-status_t cudnn_resampling_fwd_t::execute(const exec_ctx_t &ctx) const {
-    if (memory_desc_wrapper(pd()->src_md()).has_zero_dim())
-        return status::success;
-
-    nvidia::stream_t *cuda_stream
-            = utils::downcast<nvidia::stream_t *>(ctx.stream());
-
-    cuda_stream->interop_task([&](::sycl::handler &cgh) {
-        auto arg_src = CTX_IN_SYCL_MEMORY(DNNL_ARG_SRC);
-        auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST);
-
-        auto grid_acc = buffer(grid_storage_.get())
-                                .get_access<::sycl::access::mode::read>(cgh);
-        compat::host_task(cgh, [=, this](const compat::interop_handle &ih) {
-            auto &sycl_engine = *utils::downcast<nvidia::engine_t *>(
-                    cuda_stream->engine());
-            auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
-            auto handle = cuda_stream->get_cudnn_handle();
-            std::vector<void *> args;
-
-            args.push_back(arg_src.get_native_pointer(ih));
-            args.push_back(sc.memory<void *>(ih, grid_acc));
-            args.push_back(arg_dst.get_native_pointer(ih));
-
-            pd()->resampling_impl_->execute(handle, args);
-        });
-    });
-
-    return status::success;
-}
-
-status_t cudnn_resampling_bwd_t::execute(const exec_ctx_t &ctx) const {
-    if (memory_desc_wrapper(pd()->diff_src_md()).has_zero_dim())
-        return status::success;
-
-    nvidia::stream_t *cuda_stream
-            = utils::downcast<nvidia::stream_t *>(ctx.stream());
-
-    cuda_stream->interop_task([&](::sycl::handler &cgh) {
-        auto arg_diff_src = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SRC);
-        auto arg_diff_dst = CTX_IN_SYCL_MEMORY(DNNL_ARG_DIFF_DST);
-        auto grid_acc = buffer(grid_storage_.get())
-                                .get_access<::sycl::access::mode::read>(cgh);
-        auto arg_diff_grid
-                = CTX_SCRATCH_SYCL_MEMORY(memory_tracking::names::key_none);
-        compat::host_task(cgh, [=, this](const compat::interop_handle &ih) {
-            auto &sycl_engine = *utils::downcast<nvidia::engine_t *>(
-                    cuda_stream->engine());
-            auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
-            auto handle = cuda_stream->get_cudnn_handle();
-            std::vector<void *> args;
-            args.push_back(arg_diff_src.get_native_pointer(ih));
-            args.push_back(arg_diff_dst.get_native_pointer(ih));
-            args.push_back(sc.memory<void *>(ih, grid_acc));
-            args.push_back(arg_diff_grid.get_native_pointer(ih));
-
-            pd()->resampling_impl_->execute(handle, args);
-        });
-    });
-
-    return status::success;
-}
-
-} // namespace nvidia
-} // namespace gpu
-} // namespace impl
-} // namespace dnnl
diff --git a/src/gpu/nvidia/cudnn_resampling.hpp b/src/gpu/nvidia/cudnn_resampling.hpp
deleted file mode 100644
index 32c5cd664c9..00000000000
--- a/src/gpu/nvidia/cudnn_resampling.hpp
+++ /dev/null
@@ -1,270 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
-* Copyright 2020 Codeplay Software Limited
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef GPU_NVIDIA_CUDNN_RESAMPLING_HPP
-#define GPU_NVIDIA_CUDNN_RESAMPLING_HPP
-
-#include <cudnn.h>
-
-#include "common/c_types_map.hpp"
-#include "common/resampling_pd.hpp"
-#include "common/type_helpers.hpp"
-#include "gpu/gpu_primitive.hpp"
-
-#include "xpu/sycl/memory_storage.hpp"
-
-#include "gpu/nvidia/engine.hpp"
-#include "gpu/nvidia/stream.hpp"
-#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
-#include "gpu/nvidia/sycl_cuda_utils.hpp"
-
-#include "gpu/nvidia/cudnn_resampling_impl.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace gpu {
-namespace nvidia {
-
-struct cudnn_resampling_pd_base_t {
-protected:
-    status_t init_mem_by_tag(format_tag_t tag, memory_desc_t &md) {
-        if (tag == format_tag::undef) return status::unimplemented;
-        CHECK(memory_desc_init_by_tag(md, tag));
-        return status::success;
-    }
-};
-
-struct cudnn_resampling_base_t : public gpu::primitive_t {
-protected:
-    using gpu::primitive_t::primitive_t;
-
-    template <typename data_t>
-    struct theta_t {
-        data_t s0_, i_, tx_;
-        data_t j_, s1_, ty_;
-        theta_t(data_t s0, data_t i, data_t tx, data_t j, data_t s1, data_t ty)
-            : s0_(s0), i_(i), tx_(tx), j_(j), s1_(s1), ty_(ty) {}
-    };
-
-    ::sycl::buffer<uint8_t> &buffer(memory_storage_t *mem_storage) {
-        return utils::downcast<xpu::sycl::buffer_memory_storage_t *>(
-                mem_storage)
-                ->buffer();
-    }
-    ::sycl::buffer<uint8_t> &buffer(memory_storage_t *mem_storage) const {
-        return utils::downcast<xpu::sycl::buffer_memory_storage_t *>(
-                mem_storage)
-                ->buffer();
-    }
-
-    template <typename data_t, typename pd_t>
-    status_t prepare_coordinate_grid(impl::engine_t *engine, const pd_t *pd) {
-        using io = cudnn_resampling_impl_base_t::io;
-        int ndims = pd->resampling_impl_->ndims();
-        data_t OW = pd->resampling_impl_->dims_[io::dst][ndims - 1],
-               IW = pd->resampling_impl_->dims_[io::src][ndims - 1],
-               OH = pd->resampling_impl_->dims_[io::dst][ndims - 2],
-               IH = pd->resampling_impl_->dims_[io::src][ndims - 2];
-        // cuDNN uses coordinates normalized to -1 <= (xsi, ysi) <= 1 for
-        // building the grid. Therefore, the scaling parameters for tau_theta
-        // must be adjusted to produce the normalized value per grid point.
-        data_t w = 1;
-        if (IW != 1 && IW != OW) w = IW * (OW - 1) / (OW * (IW - 1));
-
-        data_t h = 1;
-        if (IH != 1 && IH != OH) h = IH * (OH - 1) / (OH * (IH - 1));
-
-        // the size of tau_theta (2 x 3) is fixed in cuDNN
-        int tau_theta_size = 2 * 3;
-        auto theta_size = pd->MB();
-        auto tau_theta = theta_t<data_t> {w, 0.f, 0.f, 0.f, h, 0.f};
-        std::vector<theta_t<data_t>> theta_data(theta_size, tau_theta);
-
-        auto grid_size = pd->MB() * pd->OH() * pd->OW() * 2;
-        auto sycl_engine = utils::downcast<nvidia::engine_t *>(engine);
-
-        auto theta_size_in_byte = tau_theta_size * theta_size * sizeof(data_t);
-        auto grid_size_in_byte = grid_size * sizeof(data_t);
-
-        memory_storage_t *mem_grid_ptr;
-        CHECK(sycl_engine->create_memory_storage(&mem_grid_ptr,
-                memory_flags_t::alloc, grid_size_in_byte, nullptr));
-        grid_storage_.reset(mem_grid_ptr);
-
-        memory_storage_t *mem_theta_ptr;
-        CHECK(sycl_engine->create_memory_storage(&mem_theta_ptr,
-                memory_flags_t::alloc, theta_size_in_byte, nullptr));
-        theta_storage_.reset(mem_theta_ptr);
-
-        impl::stream_t *service_stream;
-        CHECK(sycl_engine->get_service_stream(service_stream));
-
-        auto cuda_stream = utils::downcast<nvidia::stream_t *>(service_stream);
-        auto event = copy(cuda_stream->queue(),
-                reinterpret_cast<uint8_t *>(theta_data.data()),
-                buffer(theta_storage_.get()));
-
-        auto &st_desc_ = pd->resampling_impl_->st_desc_;
-        cuda_stream->interop_task([&](::sycl::handler &cgh) {
-            cgh.depends_on(event);
-            auto theta_acc
-                    = buffer(theta_storage_.get())
-                              .get_access<::sycl::access::mode::read>(cgh);
-            auto grid_acc
-                    = buffer(grid_storage_.get())
-                              .get_access<::sycl::access::mode::write>(cgh);
-
-            compat::host_task(cgh, [=](const compat::interop_handle &ih) {
-                // The scoped context makes sure the top of the stack context
-                // is the engine context while creating the cuDNN handle.
-                auto &s_engine = *utils::downcast<nvidia::engine_t *>(engine);
-                cuda_sycl_scoped_context_handler_t sc(s_engine);
-                auto handle = cuda_stream->get_cudnn_handle();
-                auto theta = sc.memory<void *>(ih, theta_acc);
-                auto grid = sc.memory<void *>(ih, grid_acc);
-                CUDNN_EXECUTE_FUNC(cudnnSpatialTfGridGeneratorForward, handle,
-                        st_desc_, theta, grid);
-            });
-        });
-
-        // cuDNN requires the grid data to be normalized to (-1, -1) <=
-        // (xsi, ysi) <= (1, 1). When a value falls outside this boundary,
-        // cuDNN assumes it is 0, while oneDNN uses the boundary values. So
-        // we clamp the values outside the boundary to the boundary. This
-        // fixes the upsampling issue.
-        std::vector<data_t> unbound_raw_grid(grid_size);
-        auto event2 = copy(cuda_stream->queue(), buffer(grid_storage_.get()),
-                reinterpret_cast<uint8_t *>(unbound_raw_grid.data()));
-        event2.wait();
-        for (int i = 0; i < grid_size; i++) {
-            if (std::fabs(unbound_raw_grid[i]) > 1)
-                unbound_raw_grid[i] = unbound_raw_grid[i]
-                        / (std::fabs(unbound_raw_grid[i]));
-        }
-
-        auto event3 = copy(cuda_stream->queue(),
-                reinterpret_cast<uint8_t *>(unbound_raw_grid.data()),
-                buffer(grid_storage_.get()));
-        event3.wait();
-
-        return status::success;
-    }
-
-    std::unique_ptr<memory_storage_t> grid_storage_;
-    std::unique_ptr<memory_storage_t> theta_storage_;
-};
-
-struct cudnn_resampling_fwd_t : public cudnn_resampling_base_t {
-    using cudnn_resampling_base_t::cudnn_resampling_base_t;
-
-    struct pd_t : public resampling_fwd_pd_t,
-                  public cudnn_resampling_pd_base_t {
-        using cudnn_resampling_pd_base_t::cudnn_resampling_pd_base_t;
-        using resampling_fwd_pd_t::resampling_fwd_pd_t;
-
-        DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_resampling_fwd_t);
-
-        status_t init(impl::engine_t *engine) {
-            using namespace data_type;
-            using namespace format_tag;
-
-            assert(engine->kind() == engine_kind::gpu);
-
-            bool ok = desc()->alg_kind == alg_kind::resampling_linear
-                    && is_fwd() && utils::one_of(src_md()->data_type, f32, f16)
-                    && src_md()->data_type == dst_md()->data_type
-                    && set_default_params() == status::success
-                    && attr()->has_default_values();
-            if (!ok) return status::unimplemented;
-
-            // src must have a tag and dst must follow the same tag
-            format_tag_t dat_tag = memory_desc_matches_one_of_tag(
-                    *src_md(), ncw, nchw, nwc, nhwc);
-            if (dat_tag == format_tag::undef) return status::unimplemented;
-            if (!memory_desc_matches_tag(*dst_md(), dat_tag)) {
-                return status::unimplemented;
-            }
-
-            resampling_impl_.reset(new cudnn_resampling_fwd_impl_t());
-            return resampling_impl_->init(this);
-        }
-
-        std::shared_ptr<cudnn_resampling_impl_base_t> resampling_impl_;
-    };
-
-    status_t init(impl::engine_t *engine) override {
-        status_t status;
-        auto wrap = memory_desc_wrapper(pd()->src_md());
-        switch (wrap.data_type()) {
-            case data_type::f32:
-                status = prepare_coordinate_grid<float>(engine, pd());
-                break;
-            case data_type::f16:
-                status = prepare_coordinate_grid<float16_t>(engine, pd());
-                break;
-            default: status = status::unimplemented;
-        }
-        return status;
-    }
-
-    status_t execute(const exec_ctx_t &ctx) const override;
-
-private:
-    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-};
-
-struct cudnn_resampling_bwd_t : public cudnn_resampling_base_t {
-    using cudnn_resampling_base_t::cudnn_resampling_base_t;
-
-    struct pd_t : public resampling_bwd_pd_t,
-                  public cudnn_resampling_pd_base_t {
-        using cudnn_resampling_pd_base_t::cudnn_resampling_pd_base_t;
-        using resampling_bwd_pd_t::resampling_bwd_pd_t;
-
-        DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_resampling_bwd_t);
-
-        status_t init(impl::engine_t *engine) {
-            using namespace data_type;
-            using namespace format_tag;
-
-            assert(engine->kind() == engine_kind::gpu);
-
-            bool ok = desc()->alg_kind == alg_kind::resampling_linear
-                    && !is_fwd()
-                    && utils::one_of(diff_src_md()->data_type, f32)
-                    && diff_src_md()->data_type == diff_dst_md()->data_type
-                    && set_default_params() == status::success
-                    && attr()->has_default_values();
-            if (!ok) return status::unimplemented;
-
-            // diff_dst must have a tag and diff_src must follow the same tag
-            format_tag_t dat_tag = memory_desc_matches_one_of_tag(
-                    *diff_dst_md(), ncw, nchw, nwc, nhwc);
-            if (dat_tag == format_tag::undef) return status::unimplemented;
-            if (!memory_desc_matches_tag(*diff_src_md(), dat_tag)) {
-                return status::unimplemented;
-            }
-
-            resampling_impl_.reset(new cudnn_resampling_bwd_impl_t());
-            return resampling_impl_->init(this);
-        }
-
-        std::shared_ptr<cudnn_resampling_impl_base_t> resampling_impl_;
-    };
-
-    status_t init(impl::engine_t *engine) override {
-        return prepare_coordinate_grid<float>(engine, pd());
-    }
-
-    status_t execute(const exec_ctx_t &ctx) const override;
-
-private:
-    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-};
-
-} // namespace nvidia
-} // namespace gpu
-} // namespace impl
-} // namespace dnnl
-
-#endif
diff --git a/src/gpu/nvidia/cudnn_resampling_impl.hpp b/src/gpu/nvidia/cudnn_resampling_impl.hpp
deleted file mode 100644
index cdd62c8290d..00000000000
--- a/src/gpu/nvidia/cudnn_resampling_impl.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
-* Copyright 2020 Codeplay Software Limited
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef GPU_NVIDIA_CUDNN_RESAMPLING_IMPL_HPP
-#define GPU_NVIDIA_CUDNN_RESAMPLING_IMPL_HPP
-
-#include <cudnn.h>
-
-#include "gpu/nvidia/engine.hpp"
-#include "gpu/nvidia/sycl_cuda_utils.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace gpu {
-namespace nvidia {
-
-struct cudnn_resampling_impl_base_t {
-    virtual ~cudnn_resampling_impl_base_t() {
-        for (int i = 0; i < NUM_IO; ++i) {
-            if (tensor_descs_[i]) {
-                CUDNN_EXECUTE_FUNC_V(
-                        cudnnDestroyTensorDescriptor, tensor_descs_[i]);
-            }
-        }
-
-        if (st_desc_) {
-            CUDNN_EXECUTE_FUNC_V(
-                    cudnnDestroySpatialTransformerDescriptor, st_desc_);
-        }
-    }
-
-    virtual status_t init(resampling_pd_t *pd) = 0;
-
-    virtual void execute(cudnnHandle_t handle,
-            const std::vector<void *> &args) const = 0;
-
-    int ndims() { return ndims_; }
-
-    status_t create_and_set_st_desc() {
-        CHECK(CUDNN_EXECUTE_FUNC_S(
-                cudnnCreateSpatialTransformerDescriptor, &st_desc_));
-
-        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetSpatialTransformerNdDescriptor,
-                st_desc_, CUDNN_SAMPLER_BILINEAR, data_types_[dst], ndims_,
-                dims_[dst]));
-
-        return status::success;
-    }
-
-    enum io { src, dst, NUM_IO };
-    int dims_[NUM_IO][DNNL_MAX_NDIMS];
-    int strides_[NUM_IO][DNNL_MAX_NDIMS];
-    cudnnDataType_t data_types_[NUM_IO];
-    cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {};
-    cudnnSpatialTransformerDescriptor_t st_desc_;
-    int ndims_;
-    const float alpha_ = 1.f, beta_ = 0.f;
-};
-
-struct cudnn_resampling_fwd_impl_t : public cudnn_resampling_impl_base_t {
-    status_t init(resampling_pd_t *pd) override {
-        ndims_ = std::max(4, pd->ndims());
-
-        if (ndims_ > 4) return status::unimplemented;
-
-        cudnnTensorFormat_t src_format, dst_format;
-        CHECK(get_format(pd->src_md(), dst_format));
-        CHECK(get_format(pd->dst_md(), src_format));
-        convert_dims(pd->src_md()->padded_dims, dims_[src], pd->ndims());
-        convert_dims(pd->src_md()->format_desc.blocking.strides, strides_[src],
-                pd->ndims(), 4,
-                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[src][1]));
-        convert_dims(pd->dst_md()->padded_dims, dims_[dst], pd->ndims());
-        convert_dims(pd->dst_md()->format_desc.blocking.strides, strides_[dst],
-                pd->ndims(), 4,
-                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[dst][1]));
-
-        CHECK(convert_data_type(pd->src_md(), &data_types_[src]));
-        CHECK(convert_data_type(pd->dst_md(), &data_types_[dst]));
-
-        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[src],
-                data_types_[src], ndims_, dims_[src], strides_[src]));
-        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst],
-                data_types_[dst], ndims_, dims_[dst], strides_[dst]));
-
-        CHECK(create_and_set_st_desc());
-
-        return status::success;
-    }
-
-    void execute(cudnnHandle_t handle,
-            const std::vector<void *> &args) const override {
-
-        CUDNN_EXECUTE_FUNC(cudnnSpatialTfSamplerForward, handle, st_desc_,
-                &alpha_, tensor_descs_[src], args[0], args[1], &beta_,
-                tensor_descs_[dst], args[2]);
-    }
-};
-
-struct cudnn_resampling_bwd_impl_t : public cudnn_resampling_impl_base_t {
-
-    status_t init(resampling_pd_t *pd) override {
-        ndims_ = std::max(4, pd->ndims());
-
-        if (ndims_ > 4) return status::unimplemented;
-
-        cudnnTensorFormat_t src_format, dst_format;
-        CHECK(get_format(pd->diff_src_md(), dst_format));
-        CHECK(get_format(pd->diff_dst_md(), src_format));
-        convert_dims(pd->diff_src_md()->padded_dims, dims_[src], pd->ndims());
-        convert_dims(pd->diff_src_md()->format_desc.blocking.strides,
-                strides_[src], pd->ndims(), 4,
-                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[src][1]));
-        convert_dims(pd->diff_dst_md()->padded_dims, dims_[dst], pd->ndims());
-        convert_dims(pd->diff_dst_md()->format_desc.blocking.strides,
-                strides_[dst], pd->ndims(), 4,
-                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[dst][1]));
-
-        CHECK(convert_data_type(pd->diff_src_md(), &data_types_[src]));
-        CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[dst]));
-
-        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[src],
-                data_types_[src], ndims_, dims_[src], strides_[src]));
-        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst],
-                data_types_[dst], ndims_, dims_[dst], strides_[dst]));
-
-        CHECK(create_and_set_st_desc());
-
-        auto wrap = memory_desc_wrapper(pd->diff_src_md());
-        auto grid_size = pd->MB() * pd->OH() * pd->OW() * 2;
-        auto grid_size_in_byte = grid_size * wrap.data_type_size();
-
-        // cuDNN does not allow dgrid to be a null pointer. Although we do
-        // not need to compute dgrid, since theta does not come from a
-        // localization network, cuDNN still requires the output, so we
-        // allocate a scratchpad for dgrid.
-        pd->scratchpad_registry().registrar().book(
-                memory_tracking::names::key_none, grid_size_in_byte,
-                size_t(1));
-
-        return status::success;
-    }
-
-    void execute(cudnnHandle_t handle,
-            const std::vector<void *> &args) const override {
-        // We are not backpropagating for the grid here, so both alpha and
-        // beta for the grid are zero and the dgrid value will not be used.
-        CUDNN_EXECUTE_FUNC(cudnnSpatialTfSamplerBackward, handle, st_desc_,
-                &alpha_, tensor_descs_[src], args[0], &beta_,
-                tensor_descs_[src], args[0], &beta_, tensor_descs_[dst],
-                args[1], args[2], &beta_, args[3]);
-    }
-};
-
-} // namespace nvidia
-} // namespace gpu
-} // namespace impl
-} // namespace dnnl
-
-#endif
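--
Note: the theta/grid preparation that the removed code performed can be
summarized in a small, self-contained C++ sketch. This is not oneDNN or cuDNN
code: theta_scale mirrors the scaling formula from the deleted
prepare_coordinate_grid(), the pixel-center normalization is only an assumed
convention for what cudnnSpatialTfGridGeneratorForward produces, and main()
exists purely to exercise the helpers.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Mirrors the scaling formula from the deleted prepare_coordinate_grid():
// identity when a dimension is 1 or unchanged, otherwise rescale so the
// normalized cuDNN grid lines up with oneDNN's linear resampling positions.
static float theta_scale(int in, int out) {
    if (in == 1 || in == out) return 1.f;
    return static_cast<float>(in) * (out - 1) / (out * (in - 1));
}

int main() {
    const int MB = 2, IH = 4, IW = 4, OH = 8, OW = 8;

    // theta is an MB x 2 x 3 batch of affine matrices; plain resampling only
    // needs the two diagonal scaling terms (no rotation, no translation).
    const float w = theta_scale(IW, OW);
    const float h = theta_scale(IH, OH);
    std::vector<float> theta(MB * 2 * 3, 0.f);
    for (int mb = 0; mb < MB; ++mb) {
        theta[mb * 6 + 0] = w; // row 0: [w, 0, 0] -> x scale
        theta[mb * 6 + 4] = h; // row 1: [0, h, 0] -> y scale
    }

    // Host-side stand-in for cudnnSpatialTfGridGeneratorForward: apply theta
    // to normalized output coordinates. Pixel-center normalization is an
    // assumption here; cuDNN's exact convention may differ.
    std::vector<float> grid(static_cast<std::size_t>(MB) * OH * OW * 2);
    for (int mb = 0; mb < MB; ++mb)
        for (int oh = 0; oh < OH; ++oh)
            for (int ow = 0; ow < OW; ++ow) {
                const float xs = (2.f * ow + 1.f) / OW - 1.f;
                const float ys = (2.f * oh + 1.f) / OH - 1.f;
                const std::size_t i
                        = ((static_cast<std::size_t>(mb) * OH + oh) * OW + ow)
                        * 2;
                grid[i + 0] = w * xs; // theta row 0 applied to (xs, ys, 1)
                grid[i + 1] = h * ys; // theta row 1 applied to (xs, ys, 1)
            }

    // cuDNN samples zeros for |coordinate| > 1 while oneDNN replicates edge
    // values, so out-of-bounds coordinates are clamped back to the boundary,
    // just as the deleted host-side loop over unbound_raw_grid did.
    for (float &g : grid)
        if (std::fabs(g) > 1.f) g /= std::fabs(g);

    std::printf("w = %g, h = %g, first grid point = (%g, %g)\n", w, h,
            grid[0], grid[1]);
    return 0;
}

For IW = 4 upsampled to OW = 8 this gives w = 4 * 7 / (8 * 3) ~= 1.17, so the
outermost grid points land outside [-1, 1] and the clamp reproduces oneDNN's
edge replication, which is the "upsampling issue" the deleted code worked
around.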