From 804f76aceb47b1e66b9eeefadb79bbd129c81e1d Mon Sep 17 00:00:00 2001
From: John Osorio
Date: Mon, 26 Aug 2024 08:49:41 +0100
Subject: [PATCH] generic: sycl: resampling: Avoid using the cuDNN resampling
 in favor of the SYCL impl

---
 src/gpu/gpu_resampling_list.cpp          |  10 +-
 src/gpu/nvidia/README.md                 |  37 ----
 src/gpu/nvidia/cudnn_resampling.cpp      |  95 --------
 src/gpu/nvidia/cudnn_resampling.hpp      | 270 -----------------------
 src/gpu/nvidia/cudnn_resampling_impl.hpp | 171 --------------
 5 files changed, 2 insertions(+), 581 deletions(-)
 delete mode 100644 src/gpu/nvidia/cudnn_resampling.cpp
 delete mode 100644 src/gpu/nvidia/cudnn_resampling.hpp
 delete mode 100644 src/gpu/nvidia/cudnn_resampling_impl.hpp

diff --git a/src/gpu/gpu_resampling_list.cpp b/src/gpu/gpu_resampling_list.cpp
index 0e6e6d1d64f..60766f2e831 100644
--- a/src/gpu/gpu_resampling_list.cpp
+++ b/src/gpu/gpu_resampling_list.cpp
@@ -21,10 +21,6 @@
 #include "gpu/intel/ocl/vectorized_resampling.hpp"
 #endif

-#if DNNL_GPU_VENDOR == DNNL_VENDOR_NVIDIA
-#include "gpu/nvidia/cudnn_resampling.hpp"
-#endif
-
 #ifdef GENERIC_SYCL_KERNELS_ENABLED
 #include "gpu/generic/sycl/ref_resampling.hpp"
 #endif
@@ -40,15 +36,13 @@ using namespace dnnl::impl::prop_kind;
 const std::map<pk_impl_key_t, std::vector<impl_list_item_t>>
         impl_list_map REG_RESAMPLING_P({
         {{forward}, {
-            GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_fwd_t)
-            GPU_INSTANCE_NVIDIA(nvidia::cudnn_resampling_fwd_t)
+            GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_fwd_t)
             GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_resampling_fwd_t)
             nullptr,
         }},
         {{backward}, REG_BWD_PK({
             GPU_INSTANCE_INTEL(intel::ocl::vectorized_resampling_bwd_t)
-            GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_bwd_t)
-            GPU_INSTANCE_NVIDIA(nvidia::cudnn_resampling_bwd_t)
+            GPU_INSTANCE_INTEL(intel::ocl::ref_resampling_bwd_t)
             GPU_INSTANCE_GENERIC_SYCL(generic::sycl::ref_resampling_bwd_t)
             nullptr,
         })},
diff --git a/src/gpu/nvidia/README.md b/src/gpu/nvidia/README.md
index 597adfaef37..0e7f8b8aa42 100644
--- a/src/gpu/nvidia/README.md
+++ b/src/gpu/nvidia/README.md
@@ -350,43 +350,6 @@ GPU:
 * Forward pass supports `f32`, `f16`, `bf16` and `s8` data types.
 * Backward pass supports `f32` and `bf16` data types.

-### Resampling
-
-#### Using cuDNN
-
-The `cudnnSpatialTfSamplerForward` and `cudnnSpatialTfSamplerBackward`
-functions are used to implement the resampling primitive.
-
-Nvidia's spatial sampling is based on the
-[Spatial Transformer Networks](https://papers.nips.cc/paper/5854-spatial-transformer-networks.pdf)
-paper, where all data locations are normalized to `-1 <= (xi, yi) <= 1`.
-
-* The cuDNN backend requires a grid of coordinates that can be sampled
-  up/down based on `theta`. The grid is generated by
-  `cudnnSpatialTfGridGeneratorForward`.
-* `theta` is an `MB * 2 * 3` matrix of scaling factors for each coordinate
-  and is used to generate the grid.
-* The grid values must be normalized to the range [-1, 1]. cuDNN maps
-  out-of-bounds coordinates to zero, so they must be clamped to the edges
-  manually to avoid incorrect results.
-* 3D spatial sampling is not supported in cuDNN.
-* The `nearest neighbour` algorithm is not supported in cuDNN.
-* Since the cuDNN computation differs from oneDNN's, the error threshold is
-  smaller than for other oneDNN implementations, so reduced testing accuracy
-  is required for the `fp32` and `fp16` data types.
-* The backward pass requires an output parameter for `d_grid`, which cannot
-  be `nullptr`.
-  However, since the grid coordinates are not a tunable parameter in oneDNN,
-  a dummy memory for `d_grid` is created and deleted when the destructor of
-  the primitive is called.
-
-##### Forward Direction
-* Supported data types: `f32`, `bf16`, `f16`, `s8`, `u8` and `s32`
-* Supported post-ops: `sum`, `eltwise`, `binary`
-* Supported algorithms: nearest neighbor, bilinear
-
-##### Backward Direction
-* Supported data types: `f32`, `bf16`
-* Supported algorithms: nearest neighbor, bilinear
-
 ### Softmax/LogSoftmax

 #### Using cuDNN
diff --git a/src/gpu/nvidia/cudnn_resampling.cpp b/src/gpu/nvidia/cudnn_resampling.cpp
deleted file mode 100644
index 3fb7cb6288e..00000000000
--- a/src/gpu/nvidia/cudnn_resampling.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
-* Copyright 2020-2022 Codeplay Software Limited
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include "gpu/nvidia/cudnn_resampling.hpp"
-#include "gpu/nvidia/stream.hpp"
-#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
-#include "xpu/sycl/buffer_memory_storage.hpp"
-#include "xpu/sycl/memory_storage_helper.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace gpu {
-namespace nvidia {
-
-status_t cudnn_resampling_fwd_t::execute(const exec_ctx_t &ctx) const {
-    if (memory_desc_wrapper(pd()->src_md()).has_zero_dim())
-        return status::success;
-
-    nvidia::stream_t *cuda_stream
-            = utils::downcast<nvidia::stream_t *>(ctx.stream());
-
-    cuda_stream->interop_task([&](::sycl::handler &cgh) {
-        auto arg_src = CTX_IN_SYCL_MEMORY(DNNL_ARG_SRC);
-        auto arg_dst = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DST);
-
-        auto grid_acc = buffer(grid_storage_.get())
-                                .get_access<::sycl::access::mode::read>(cgh);
-        compat::host_task(cgh, [=, this](const compat::interop_handle &ih) {
-            auto &sycl_engine = *utils::downcast<nvidia::engine_t *>(
-                    cuda_stream->engine());
-            auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
-            auto handle = cuda_stream->get_cudnn_handle();
-            std::vector<void *> args;
-
-            args.push_back(arg_src.get_native_pointer(ih));
-            args.push_back(sc.memory<void *>(ih, grid_acc));
-            args.push_back(arg_dst.get_native_pointer(ih));
-
-            pd()->resampling_impl_->execute(handle, args);
-        });
-    });
-
-    return status::success;
-}
-
-status_t cudnn_resampling_bwd_t::execute(const exec_ctx_t &ctx) const {
-    if (memory_desc_wrapper(pd()->diff_src_md()).has_zero_dim())
-        return status::success;
-
-    nvidia::stream_t *cuda_stream
-            = utils::downcast<nvidia::stream_t *>(ctx.stream());
-
-    cuda_stream->interop_task([&](::sycl::handler &cgh) {
-        auto arg_diff_src = CTX_OUT_SYCL_MEMORY(DNNL_ARG_DIFF_SRC);
-        auto arg_diff_dst = CTX_IN_SYCL_MEMORY(DNNL_ARG_DIFF_DST);
-        auto grid_acc = buffer(grid_storage_.get())
-                                .get_access<::sycl::access::mode::read>(cgh);
-        auto arg_diff_grid
-                = CTX_SCRATCH_SYCL_MEMORY(memory_tracking::names::key_none);
-        compat::host_task(cgh, [=, this](const compat::interop_handle &ih) {
-            auto &sycl_engine = *utils::downcast<nvidia::engine_t *>(
-                    cuda_stream->engine());
-            auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
-            auto handle = cuda_stream->get_cudnn_handle();
-            std::vector<void *> args;
-            args.push_back(arg_diff_src.get_native_pointer(ih));
-            args.push_back(arg_diff_dst.get_native_pointer(ih));
-            args.push_back(sc.memory<void *>(ih, grid_acc));
-            args.push_back(arg_diff_grid.get_native_pointer(ih));
-
-            pd()->resampling_impl_->execute(handle, args);
-        });
-    });
-
-    return status::success;
-}
-
-} // namespace nvidia
-} // namespace gpu
-} // namespace impl
-} // namespace dnnl
diff --git a/src/gpu/nvidia/cudnn_resampling.hpp b/src/gpu/nvidia/cudnn_resampling.hpp
deleted file mode 100644
index 32c5cd664c9..00000000000
--- a/src/gpu/nvidia/cudnn_resampling.hpp
+++ /dev/null
@@ -1,270 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
-* Copyright 2020 Codeplay Software Limited
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef GPU_NVIDIA_CUDNN_RESAMPLING_HPP
-#define GPU_NVIDIA_CUDNN_RESAMPLING_HPP
-
-#include <cudnn.h>
-
-#include "common/c_types_map.hpp"
-#include "common/resampling_pd.hpp"
-#include "common/type_helpers.hpp"
-#include "gpu/gpu_primitive.hpp"
-
-#include "xpu/sycl/memory_storage.hpp"
-
-#include "gpu/nvidia/engine.hpp"
-#include "gpu/nvidia/stream.hpp"
-#include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
-#include "gpu/nvidia/sycl_cuda_utils.hpp"
-
-#include "gpu/nvidia/cudnn_resampling_impl.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace gpu {
-namespace nvidia {
-
-struct cudnn_resampling_pd_base_t {
-protected:
-    status_t init_mem_by_tag(format_tag_t tag, memory_desc_t &md) {
-        if (tag == format_tag::undef) return status::unimplemented;
-        CHECK(memory_desc_init_by_tag(md, tag));
-        return status::success;
-    }
-};
-
-struct cudnn_resampling_base_t : public gpu::primitive_t {
-protected:
-    using gpu::primitive_t::primitive_t;
-
-    template <typename data_t>
-    struct theta_t {
-        data_t s0_, i_, tx_;
-        data_t j_, s1_, ty_;
-        theta_t(data_t s0, data_t i, data_t tx, data_t j, data_t s1, data_t ty)
-            : s0_(s0), i_(i), tx_(tx), j_(j), s1_(s1), ty_(ty) {}
-    };
-
-    ::sycl::buffer<uint8_t> &buffer(memory_storage_t *mem_storage) {
-        return utils::downcast<xpu::sycl::buffer_memory_storage_t *>(
-                mem_storage)
-                ->buffer();
-    }
-    ::sycl::buffer<uint8_t> &buffer(memory_storage_t *mem_storage) const {
-        return utils::downcast<xpu::sycl::buffer_memory_storage_t *>(
-                mem_storage)
-                ->buffer();
-    }
-
-    template <typename data_t, typename pd_t>
-    status_t prepare_coordinate_grid(impl::engine_t *engine, const pd_t *pd) {
-        using io = cudnn_resampling_impl_base_t::io;
-        int ndims = pd->resampling_impl_->ndims();
-        data_t OW = pd->resampling_impl_->dims_[io::dst][ndims - 1],
-               IW = pd->resampling_impl_->dims_[io::src][ndims - 1],
-               OH = pd->resampling_impl_->dims_[io::dst][ndims - 2],
-               IH = pd->resampling_impl_->dims_[io::src][ndims - 2];
-        // cuDNN uses coordinates normalized to -1 <= (xsi, ysi) <= 1 for
-        // building the grid. Therefore, the scaling parameters for tau_theta
-        // must be adjusted to produce the normalized value per grid point.
-        data_t w = 1;
-        if (IW != 1 && IW != OW) w = IW * (OW - 1) / (OW * (IW - 1));
-
-        data_t h = 1;
-        if (IH != 1 && IH != OH) h = IH * (OH - 1) / (OH * (IH - 1));
-
-        // the size of tau_theta (2 x 3) is fixed in cuDNN
-        int tau_theta_size = 2 * 3;
-        auto theta_size = pd->MB();
-        auto tau_theta = theta_t<data_t> {w, 0.f, 0.f, 0.f, h, 0.f};
-        std::vector<theta_t<data_t>> theta_data(theta_size, tau_theta);
-
-        auto grid_size = pd->MB() * pd->OH() * pd->OW() * 2;
-        auto sycl_engine = utils::downcast<nvidia::engine_t *>(engine);
-
-        auto theta_size_in_byte = tau_theta_size * theta_size * sizeof(data_t);
-        auto grid_size_in_byte = grid_size * sizeof(data_t);
-
-        memory_storage_t *mem_grid_ptr;
-        CHECK(sycl_engine->create_memory_storage(&mem_grid_ptr,
-                memory_flags_t::alloc, grid_size_in_byte, nullptr));
-        grid_storage_.reset(mem_grid_ptr);
-
-        memory_storage_t *mem_theta_ptr;
-        CHECK(sycl_engine->create_memory_storage(&mem_theta_ptr,
-                memory_flags_t::alloc, theta_size_in_byte, nullptr));
-        theta_storage_.reset(mem_theta_ptr);
-
-        impl::stream_t *service_stream;
-        CHECK(sycl_engine->get_service_stream(service_stream));
-
-        auto cuda_stream = utils::downcast<nvidia::stream_t *>(service_stream);
-        auto event = copy(cuda_stream->queue(),
-                reinterpret_cast<uint8_t *>(theta_data.data()),
-                buffer(theta_storage_.get()));
-
-        auto &st_desc_ = pd->resampling_impl_->st_desc_;
-        cuda_stream->interop_task([&](::sycl::handler &cgh) {
-            cgh.depends_on(event);
-            auto theta_acc
-                    = buffer(theta_storage_.get())
-                              .get_access<::sycl::access::mode::read>(cgh);
-            auto grid_acc
-                    = buffer(grid_storage_.get())
-                              .get_access<::sycl::access::mode::write>(cgh);
-
-            compat::host_task(cgh, [=](const compat::interop_handle &ih) {
-                // The scoped context makes sure the top of the stack context
-                // is the engine context while creating the cuDNN handle.
-                auto &s_engine = *utils::downcast<nvidia::engine_t *>(engine);
-                cuda_sycl_scoped_context_handler_t sc(s_engine);
-                auto handle = cuda_stream->get_cudnn_handle();
-                auto theta = sc.memory<void *>(ih, theta_acc);
-                auto grid = sc.memory<void *>(ih, grid_acc);
-                CUDNN_EXECUTE_FUNC(cudnnSpatialTfGridGeneratorForward, handle,
-                        st_desc_, theta, grid);
-            });
-        });
-
-        // cuDNN requires the grid data to be normalized to (-1, -1) <=
-        // (xsi, ysi) <= (1, 1). When a value falls outside this boundary,
-        // cuDNN assumes it is 0, while oneDNN uses the boundary values. So
-        // we clamp the values outside the boundary to the boundary. This
-        // fixes the upsampling issue.
-        std::vector<data_t> unbound_raw_grid(grid_size);
-        auto event2 = copy(cuda_stream->queue(), buffer(grid_storage_.get()),
-                reinterpret_cast<uint8_t *>(unbound_raw_grid.data()));
-        event2.wait();
-        for (int i = 0; i < grid_size; i++) {
-            if (std::fabs(unbound_raw_grid[i]) > 1)
-                unbound_raw_grid[i] = unbound_raw_grid[i]
-                        / (std::fabs(unbound_raw_grid[i]));
-        }
-
-        auto event3 = copy(cuda_stream->queue(),
-                reinterpret_cast<uint8_t *>(unbound_raw_grid.data()),
-                buffer(grid_storage_.get()));
-        event3.wait();
-
-        return status::success;
-    }
-
-    std::unique_ptr<memory_storage_t> grid_storage_;
-    std::unique_ptr<memory_storage_t> theta_storage_;
-};
-
-struct cudnn_resampling_fwd_t : public cudnn_resampling_base_t {
-    using cudnn_resampling_base_t::cudnn_resampling_base_t;
-
-    struct pd_t : public resampling_fwd_pd_t,
-                  public cudnn_resampling_pd_base_t {
-        using cudnn_resampling_pd_base_t::cudnn_resampling_pd_base_t;
-        using resampling_fwd_pd_t::resampling_fwd_pd_t;
-
-        DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_resampling_fwd_t);
-
-        status_t init(impl::engine_t *engine) {
-            using namespace data_type;
-            using namespace format_tag;
-
-            assert(engine->kind() == engine_kind::gpu);
-
-            bool ok = desc()->alg_kind == alg_kind::resampling_linear
-                    && is_fwd() && utils::one_of(src_md()->data_type, f32, f16)
-                    && src_md()->data_type == dst_md()->data_type
-                    && set_default_params() == status::success
-                    && attr()->has_default_values();
-            if (!ok) return status::unimplemented;
-
-            // src must have a tag and dst must follow the same tag
-            format_tag_t dat_tag = memory_desc_matches_one_of_tag(
-                    *src_md(), ncw, nchw, nwc, nhwc);
-            if (dat_tag == format_tag::undef) return status::unimplemented;
-            if (!memory_desc_matches_tag(*dst_md(), dat_tag)) {
-                return status::unimplemented;
-            }
-
-            resampling_impl_.reset(new cudnn_resampling_fwd_impl_t());
-            return resampling_impl_->init(this);
-        }
-
-        std::shared_ptr<cudnn_resampling_impl_base_t> resampling_impl_;
-    };
-
-    status_t init(impl::engine_t *engine) override {
-        status_t status;
-        auto wrap = memory_desc_wrapper(pd()->src_md());
-        switch (wrap.data_type()) {
-            case data_type::f32:
-                status = prepare_coordinate_grid<float>(engine, pd());
-                break;
-            case data_type::f16:
-                status = prepare_coordinate_grid<float16_t>(engine, pd());
-                break;
-            default: status = status::unimplemented;
-        }
-        return status;
-    }
-
-    status_t execute(const exec_ctx_t &ctx) const override;
-
-private:
-    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-};
-
-struct cudnn_resampling_bwd_t : public cudnn_resampling_base_t {
-    using cudnn_resampling_base_t::cudnn_resampling_base_t;
-
-    struct pd_t : public resampling_bwd_pd_t,
-                  public cudnn_resampling_pd_base_t {
-        using cudnn_resampling_pd_base_t::cudnn_resampling_pd_base_t;
-        using resampling_bwd_pd_t::resampling_bwd_pd_t;
-
-        DECLARE_COMMON_PD_T("cuda:cudnn:any", cudnn_resampling_bwd_t);
-
-        status_t init(impl::engine_t *engine) {
-            using namespace data_type;
-            using namespace format_tag;
-
-            assert(engine->kind() == engine_kind::gpu);
-
-            bool ok = desc()->alg_kind == alg_kind::resampling_linear
-                    && !is_fwd()
-                    && utils::one_of(diff_src_md()->data_type, f32)
-                    && diff_src_md()->data_type == diff_dst_md()->data_type
-                    && set_default_params() == status::success
-                    && attr()->has_default_values();
-            if (!ok) return status::unimplemented;
-
-            // diff_dst must have a tag and diff_src must follow the same tag
-            format_tag_t dat_tag = memory_desc_matches_one_of_tag(
-                    *diff_dst_md(), ncw, nchw, nwc, nhwc);
-            if (dat_tag == format_tag::undef) return status::unimplemented;
-            if (!memory_desc_matches_tag(*diff_src_md(), dat_tag)) {
-                return status::unimplemented;
-            }
-
-            resampling_impl_.reset(new cudnn_resampling_bwd_impl_t());
-            return resampling_impl_->init(this);
-        }
-
-        std::shared_ptr<cudnn_resampling_impl_base_t> resampling_impl_;
-    };
-
-    status_t init(impl::engine_t *engine) override {
-        return prepare_coordinate_grid<float>(engine, pd());
-    }
-
-    status_t execute(const exec_ctx_t &ctx) const override;
-
-private:
-    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-};
-
-} // namespace nvidia
-} // namespace gpu
-} // namespace impl
-} // namespace dnnl
-
-#endif
diff --git a/src/gpu/nvidia/cudnn_resampling_impl.hpp b/src/gpu/nvidia/cudnn_resampling_impl.hpp
deleted file mode 100644
index cdd62c8290d..00000000000
--- a/src/gpu/nvidia/cudnn_resampling_impl.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
-* Copyright 2020 Codeplay Software Limited
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef GPU_NVIDIA_CUDNN_RESAMPLING_IMPL_HPP
-#define GPU_NVIDIA_CUDNN_RESAMPLING_IMPL_HPP
-
-#include <cudnn.h>
-
-#include "gpu/nvidia/engine.hpp"
-#include "gpu/nvidia/sycl_cuda_utils.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace gpu {
-namespace nvidia {
-
-struct cudnn_resampling_impl_base_t {
-    virtual ~cudnn_resampling_impl_base_t() {
-        for (int i = 0; i < NUM_IO; ++i) {
-            if (tensor_descs_[i]) {
-                CUDNN_EXECUTE_FUNC_V(
-                        cudnnDestroyTensorDescriptor, tensor_descs_[i]);
-            }
-        }
-
-        if (st_desc_) {
-            CUDNN_EXECUTE_FUNC_V(
-                    cudnnDestroySpatialTransformerDescriptor, st_desc_);
-        }
-    }
-
-    virtual status_t init(resampling_pd_t *pd) = 0;
-
-    virtual void execute(cudnnHandle_t handle,
-            const std::vector<void *> &args) const = 0;
-
-    int ndims() { return ndims_; }
-
-    status_t create_and_set_st_desc() {
-        CHECK(CUDNN_EXECUTE_FUNC_S(
-                cudnnCreateSpatialTransformerDescriptor, &st_desc_));
-
-        CHECK(CUDNN_EXECUTE_FUNC_S(cudnnSetSpatialTransformerNdDescriptor,
-                st_desc_, CUDNN_SAMPLER_BILINEAR, data_types_[dst], ndims_,
-                dims_[dst]));
-
-        return status::success;
-    }
-
-    enum io { src, dst, NUM_IO };
-    int dims_[NUM_IO][DNNL_MAX_NDIMS];
-    int strides_[NUM_IO][DNNL_MAX_NDIMS];
-    cudnnDataType_t data_types_[NUM_IO];
-    cudnnTensorDescriptor_t tensor_descs_[NUM_IO] = {};
-    cudnnSpatialTransformerDescriptor_t st_desc_;
-    int ndims_;
-    const float alpha_ = 1.f, beta_ = 0.f;
-};
-
-struct cudnn_resampling_fwd_impl_t : public cudnn_resampling_impl_base_t {
-    status_t init(resampling_pd_t *pd) override {
-        ndims_ = std::max(4, pd->ndims());
-
-        if (ndims_ > 4) return status::unimplemented;
-
-        cudnnTensorFormat_t src_format, dst_format;
-        CHECK(get_format(pd->src_md(), dst_format));
-        CHECK(get_format(pd->dst_md(), src_format));
-        convert_dims(pd->src_md()->padded_dims, dims_[src], pd->ndims());
-        convert_dims(pd->src_md()->format_desc.blocking.strides, strides_[src],
-                pd->ndims(), 4,
-                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[src][1]));
-        convert_dims(pd->dst_md()->padded_dims, dims_[dst], pd->ndims());
-        convert_dims(pd->dst_md()->format_desc.blocking.strides, strides_[dst],
-                pd->ndims(), 4,
-                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[dst][1]));
-
-        CHECK(convert_data_type(pd->src_md(), &data_types_[src]));
-        CHECK(convert_data_type(pd->dst_md(), &data_types_[dst]));
-
-        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[src],
-                data_types_[src], ndims_, dims_[src], strides_[src]));
-        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst],
-                data_types_[dst], ndims_, dims_[dst], strides_[dst]));
-
-        CHECK(create_and_set_st_desc());
-
-        return status::success;
-    }
-
-    void execute(cudnnHandle_t handle,
-            const std::vector<void *> &args) const override {
-
-        CUDNN_EXECUTE_FUNC(cudnnSpatialTfSamplerForward, handle, st_desc_,
-                &alpha_, tensor_descs_[src], args[0], args[1], &beta_,
-                tensor_descs_[dst], args[2]);
-    }
-};
-
-struct cudnn_resampling_bwd_impl_t : public cudnn_resampling_impl_base_t {
-
-    status_t init(resampling_pd_t *pd) override {
-        ndims_ = std::max(4, pd->ndims());
-
-        if (ndims_ > 4) return status::unimplemented;
-
-        cudnnTensorFormat_t src_format, dst_format;
-        CHECK(get_format(pd->diff_src_md(), dst_format));
-        CHECK(get_format(pd->diff_dst_md(), src_format));
-        convert_dims(pd->diff_src_md()->padded_dims, dims_[src], pd->ndims());
-        convert_dims(pd->diff_src_md()->format_desc.blocking.strides,
-                strides_[src], pd->ndims(), 4,
-                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[src][1]));
-        convert_dims(pd->diff_dst_md()->padded_dims, dims_[dst], pd->ndims());
-        convert_dims(pd->diff_dst_md()->format_desc.blocking.strides,
-                strides_[dst], pd->ndims(), 4,
-                (dst_format != CUDNN_TENSOR_NHWC ? 1 : dims_[dst][1]));
-
-        CHECK(convert_data_type(pd->diff_src_md(), &data_types_[src]));
-        CHECK(convert_data_type(pd->diff_dst_md(), &data_types_[dst]));
-
-        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[src],
-                data_types_[src], ndims_, dims_[src], strides_[src]));
-        CHECK(create_and_set_tensor_descriptor(&tensor_descs_[dst],
-                data_types_[dst], ndims_, dims_[dst], strides_[dst]));
-
-        CHECK(create_and_set_st_desc());
-
-        auto wrap = memory_desc_wrapper(pd->diff_src_md());
-        auto grid_size = pd->MB() * pd->OH() * pd->OW() * 2;
-        auto grid_size_in_byte = grid_size * wrap.data_type_size();
-
-        // cuDNN does not allow dgrid to be a null pointer. Although we do
-        // not need to compute dgrid, since theta does not come from a
-        // localization network, cuDNN still requires the output, so we
-        // allocate a scratchpad for dgrid.
-        pd->scratchpad_registry().registrar().book(
-                memory_tracking::names::key_none, grid_size_in_byte,
-                size_t(1));
-
-        return status::success;
-    }
-
-    void execute(cudnnHandle_t handle,
-            const std::vector<void *> &args) const override {
-        // We are not backpropagating for the grid here, so both alpha and
-        // beta for the grid are zero and the dgrid value will not be used.
-        CUDNN_EXECUTE_FUNC(cudnnSpatialTfSamplerBackward, handle, st_desc_,
-                &alpha_, tensor_descs_[src], args[0], &beta_,
-                tensor_descs_[src], args[0], &beta_, tensor_descs_[dst],
-                args[1], args[2], &beta_, args[3]);
-    }
-};
-
-} // namespace nvidia
-} // namespace gpu
-} // namespace impl
-} // namespace dnnl
-
-#endif
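--
Note: the theta/grid preparation that the removed code performed can be
summarized in a small, self-contained C++ sketch. This is not oneDNN or cuDNN
code: theta_scale mirrors the scaling formula from the deleted
prepare_coordinate_grid(), the pixel-center normalization is only an assumed
convention for what cudnnSpatialTfGridGeneratorForward produces, and main()
exists purely to exercise the helpers.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Mirrors the scaling formula from the deleted prepare_coordinate_grid():
// identity when a dimension is 1 or unchanged, otherwise rescale so the
// normalized cuDNN grid lines up with oneDNN's linear resampling positions.
static float theta_scale(int in, int out) {
    if (in == 1 || in == out) return 1.f;
    return static_cast<float>(in) * (out - 1) / (out * (in - 1));
}

int main() {
    const int MB = 2, IH = 4, IW = 4, OH = 8, OW = 8;

    // theta is an MB x 2 x 3 batch of affine matrices; plain resampling only
    // needs the two diagonal scaling terms (no rotation, no translation).
    const float w = theta_scale(IW, OW);
    const float h = theta_scale(IH, OH);
    std::vector<float> theta(MB * 2 * 3, 0.f);
    for (int mb = 0; mb < MB; ++mb) {
        theta[mb * 6 + 0] = w; // row 0: [w, 0, 0] -> x scale
        theta[mb * 6 + 4] = h; // row 1: [0, h, 0] -> y scale
    }

    // Host-side stand-in for cudnnSpatialTfGridGeneratorForward: apply theta
    // to normalized output coordinates. Pixel-center normalization is an
    // assumption here; cuDNN's exact convention may differ.
    std::vector<float> grid(static_cast<std::size_t>(MB) * OH * OW * 2);
    for (int mb = 0; mb < MB; ++mb)
        for (int oh = 0; oh < OH; ++oh)
            for (int ow = 0; ow < OW; ++ow) {
                const float xs = (2.f * ow + 1.f) / OW - 1.f;
                const float ys = (2.f * oh + 1.f) / OH - 1.f;
                const std::size_t i
                        = ((static_cast<std::size_t>(mb) * OH + oh) * OW + ow)
                        * 2;
                grid[i + 0] = w * xs; // theta row 0 applied to (xs, ys, 1)
                grid[i + 1] = h * ys; // theta row 1 applied to (xs, ys, 1)
            }

    // cuDNN samples zeros for |coordinate| > 1 while oneDNN replicates edge
    // values, so out-of-bounds coordinates are clamped back to the boundary,
    // just as the deleted host-side loop over unbound_raw_grid did.
    for (float &g : grid)
        if (std::fabs(g) > 1.f) g /= std::fabs(g);

    std::printf("w = %g, h = %g, first grid point = (%g, %g)\n", w, h,
            grid[0], grid[1]);
    return 0;
}

For IW = 4 upsampled to OW = 8 this gives w = 4 * 7 / (8 * 3) ~= 1.17, so the
outermost grid points land outside [-1, 1] and the clamp reproduces oneDNN's
edge replication, which is the "upsampling issue" the deleted code worked
around.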