tests/benchdnn/dnnl_memory.cpp

/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <atomic>
#include <cctype>
#include <memory>
#include <numeric>
#include <string>

#include "oneapi/dnnl/dnnl.h"

#ifdef DNNL_WITH_SYCL
#include "oneapi/dnnl/dnnl_sycl.hpp"
#endif

#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "oneapi/dnnl/dnnl_ocl.hpp"
#include "src/xpu/ocl/usm_utils.hpp"
#endif

#include "tests/test_thread.hpp"

#include "dnn_types.hpp"
#include "dnnl_common.hpp"
#include "dnnl_memory.hpp"
#include "utils/cold_cache.hpp"
#include "utils/dnnl_query.hpp"
#include "utils/parallel.hpp"

extern "C" dnnl_status_t dnnl_memory_desc_create_with_string_tag(
        dnnl_memory_desc_t *, int, const dnnl_dims_t, dnnl_data_type_t,
        const char *);

extern "C" dnnl_status_t dnnl_memory_desc_set_data_type(
        dnnl_memory_desc_t memory_desc, dnnl_data_type_t data_type);

dnn_mem_t::dnn_mem_t(const_dnnl_memory_desc_t md, dnnl_engine_t engine,
        const handle_info_t &handle_info) {
    if (query_md_ndims(md) > 0) {
        auto status = dnnl_memory_desc_clone(&md_, md);
        (void)status;
        assert(status == dnnl_success);
        active_ = (initialize(engine, handle_info) == OK);
    }
}

dnn_mem_t::dnn_mem_t(const_dnnl_memory_desc_t md, dnnl_data_type_t dt,
        const std::string &tag, dnnl_engine_t engine) {
    const int ndims = query_md_ndims(md);
    if (ndims > 0) {
        auto md_wrapper = dnn_mem_t::init_md(ndims, query_md_dims(md), dt, tag);
        md_ = md_wrapper.release();
        active_ = (initialize(engine) == OK);
    }
}

dnn_mem_t::dnn_mem_t(int ndims, const dnnl_dims_t dims, dnnl_data_type_t dt,
        const std::string &tag, dnnl_engine_t engine) {
    if (ndims > 0) {
        auto md_wrapper = dnn_mem_t::init_md(ndims, dims, dt, tag);
        md_ = md_wrapper.release();
        active_ = (initialize(engine) == OK);
    }
}

dnn_mem_t::dnn_mem_t(int ndims, const dnnl_dims_t dims, dnnl_data_type_t dt,
        const dnnl_dims_t strides, dnnl_engine_t engine) {
    if (ndims > 0) {
        auto status = dnnl_memory_desc_create_with_strides(
                &md_, ndims, dims, dt, strides);
        (void)status;
        assert(status == dnnl_success);
        active_ = (initialize(engine) == OK);
    }
}

dnn_mem_t::dnn_mem_t(const dnn_mem_t &rhs, dnnl_data_type_t dt,
        const std::string &tag, dnnl_engine_t engine)
    : dnn_mem_t(rhs.md_, dt, tag, engine) {
    if (active_) {
        int status = reorder(rhs);
        if (status != OK) {
            BENCHDNN_PRINT(
                    0, "%s\n", "Error: reorder in memory constructor failed.");
        }
    }
}

int execute_reorder(const dnn_mem_t &src, dnn_mem_t &dst,
        const_dnnl_primitive_attr_t attr) {
    std::shared_ptr<const dnn_mem_t> r_src(&src, [](const dnn_mem_t *) {});
    std::shared_ptr<dnn_mem_t> r_dst(&dst, [](dnn_mem_t *) {});

    dnnl_primitive_desc_t r_pd_ {};
    dnnl_primitive_t prim_ {};

    // Optimization to reduce testing time for GPU.
    //
    // For CPU <-> GPU reorders, the library creates GPU-side kernels.
    // Benchdnn heavily relies on reorders and this greatly increases execution
    // time because of big overhead on building OpenCL kernels.
    //
    // First, try to create CPU reorder for the requested GPU reorder. If
    // succeeded, then create CPU memory object wrapping mapped pointers of
    // source and destination and execute CPU reorder. If CPU reorder can't be
    // create, then just execute a regular GPU reorder.
#if ((DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL) \
        || (DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL)) \
        && DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
    const auto &cpu_engine = get_cpu_engine();
    if (src.engine_kind() == dnnl_gpu || dst.engine_kind() == dnnl_gpu) {

        dnnl_status_t status = dnnl_reorder_primitive_desc_create(
                &r_pd_, src.md_, cpu_engine, dst.md_, cpu_engine, attr);
        if (status == dnnl_success) {
            // Create CPU memory objects wrapping mapped pointers of source and
            // destination
            r_src = std::make_shared<dnn_mem_t>(dnn_mem_t::create_from_host_ptr(
                    src.md_, cpu_engine, (void *)src));
            r_dst = std::make_shared<dnn_mem_t>(dnn_mem_t::create_from_host_ptr(
                    dst.md_, cpu_engine, (void *)dst));
        }
    }
#endif

    while (!r_pd_) {
        // Fallback to GPU reorder.
        auto status = dnnl_reorder_primitive_desc_create(
                &r_pd_, src.md_, src.engine(), dst.md_, dst.engine(), attr);
        if (status == dnnl_success) break;
        if (dnnl_memory_desc_equal(src.md_, dst.md_)) {
            // If fail to create reorder pd, use plain data copy for identical
            // mds.
            BENCHDNN_PRINT(2, "%s\n", "[REORDER] Fallback to plain copy.");
            const int64_t chunk_size = 64;
            const int64_t n_chunks = div_up(src.nelems(), chunk_size);
            benchdnn_parallel_nd(n_chunks, [&](int64_t idx_chunk) {
                int64_t idx_start = idx_chunk * chunk_size;
                int64_t idx_end = MIN2(idx_start + chunk_size, src.nelems());
                for (int64_t idx = idx_start; idx < idx_end; ++idx) {
                    float e = src.get_elem(idx);
                    dst.set_elem(idx, e);
                }
            });
            return OK;
        }
        return FAIL;
    }

    auto r_pd = make_benchdnn_dnnl_wrapper(r_pd_);
    const auto &scratchpad_md = query_md(r_pd, DNNL_ARG_SCRATCHPAD);
    const auto &scratchpad_engine
            = dst.engine_kind() == dnnl_gpu ? dst.engine() : src.engine();
    dnn_mem_t scratchpad(scratchpad_md, scratchpad_engine);

    DNN_SAFE(dnnl_primitive_create(&prim_, r_pd), CRIT);
    auto prim = make_benchdnn_dnnl_wrapper(prim_);

    args_t args;
    args.set(DNNL_ARG_FROM, *r_src);
    args.set(DNNL_ARG_TO, *r_dst);
    args.set(DNNL_ARG_SCRATCHPAD, scratchpad);

    return execute_and_wait(prim, args);
}

// `swap_dt` changes `this` data type which may be needed for
// different sum data type or fpmath mode specified.
int dnn_mem_t::reorder(const dnn_mem_t &rhs, const_dnnl_primitive_attr_t attr,
        dnnl_data_type_t swap_dt) {
    if (this == &rhs) return OK;

    // When `rhs` object is empty, it's illigal to execute a reorder over it.
    // Do nothing, return a good status. Keep here to avoid guarding externally.
    if (query_md_ndims(rhs.md_) == 0) return OK;

    // Assumption is `no_ref_memory` assigned values at construction, and no
    // actual reorder needed. This check is to avoid extra code outside of
    // reorder interface.
    if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) return OK;

    const bool do_swap_dt = swap_dt != dnnl_data_type_undef;
    dnnl_data_type_t orig_dt = this->dt();
    if (do_swap_dt) this->set_dt(swap_dt);
    auto status = execute_reorder(rhs, *this, attr);
    if (do_swap_dt) this->set_dt(orig_dt);
    return status;
}

size_t dnn_mem_t::size() const {
    return dnnl_memory_desc_get_size(md_);
}

bool dnn_mem_t::is_sparse_md() const {
#ifdef DNNL_EXPERIMENTAL_SPARSE
    return query_md_sparse_encoding(md_) != dnnl_sparse_encoding_undef;
#else
    return false;
#endif
}

size_t dnn_mem_t::sizeof_dt() const {
    return dnnl_data_type_size(dt());
}

float dnn_mem_t::get_elem(int64_t idx, int buffer_index) const {
    void *data = get_mapped_pointer<void>(buffer_index);
    float elem = 0.0;

    switch (dt(buffer_index)) {
        case dnnl_s8: elem = static_cast<int8_t *>(data)[idx]; break;
        case dnnl_u8: elem = static_cast<uint8_t *>(data)[idx]; break;
        case dnnl_s32: elem = static_cast<int32_t *>(data)[idx]; break;
        case dnnl_f32: elem = static_cast<float *>(data)[idx]; break;
        case dnnl_f64: elem = static_cast<double *>(data)[idx]; break;
        case dnnl_f16:
            elem = static_cast<dnnl::impl::float16_t *>(data)[idx];
            break;
        case dnnl_bf16:
            elem = static_cast<dnnl::impl::bfloat16_t *>(data)[idx];
            break;
        case dnnl_e8m0:
            elem = static_cast<dnnl::impl::float8_e8m0_t *>(data)[idx];
            break;
        case dnnl_f8_e5m2:
            elem = static_cast<dnnl::impl::float8_e5m2_t *>(data)[idx];
            break;
        case dnnl_f8_e4m3:
            elem = static_cast<dnnl::impl::float8_e4m3_t *>(data)[idx];
            break;
        case dnnl_s4: {
            dnnl::impl::nibble2_t nibble_pair(
                    reinterpret_cast<uint8_t *>(data)[idx / 2]);
            elem = dnnl::impl::int4_t(nibble_pair.get(idx % 2));
            break;
        }
        case dnnl_u4: {
            dnnl::impl::nibble2_t nibble_pair(
                    reinterpret_cast<uint8_t *>(data)[idx / 2]);
            elem = dnnl::impl::uint4_t(nibble_pair.get(idx % 2));
            break;
        }
        case dnnl_f4_e2m1: {
            dnnl::impl::nibble2_t nibble_pair(
                    reinterpret_cast<uint8_t *>(data)[idx / 2]);
            elem = dnnl::impl::float4_e2m1_t(nibble_pair.get(idx % 2));
            break;
        }
        case dnnl_f4_e3m0: {
            dnnl::impl::nibble2_t nibble_pair(
                    reinterpret_cast<uint8_t *>(data)[idx / 2]);
            elem = dnnl::impl::float4_e3m0_t(nibble_pair.get(idx % 2));
            break;
        }
        default: assert(!"bad data type");
    }
    return elem;
}

void dnn_mem_t::set_elem(int64_t idx, float value, int buffer_index) const {
    void *data = get_mapped_pointer<void>(buffer_index);

    switch (dt(buffer_index)) {
        case dnnl_s8: ((int8_t *)data)[idx] = value; break;
        case dnnl_u8: ((uint8_t *)data)[idx] = value; break;
        case dnnl_s32: ((int32_t *)data)[idx] = value; break;
        case dnnl_f32: ((float *)data)[idx] = value; break;
        case dnnl_f64: ((double *)data)[idx] = value; break;
        case dnnl_f16: ((dnnl::impl::float16_t *)data)[idx] = value; break;
        case dnnl_bf16: ((dnnl::impl::bfloat16_t *)data)[idx] = value; break;
        case dnnl_e8m0: ((dnnl::impl::float8_e8m0_t *)data)[idx] = value; break;
        case dnnl_f8_e5m2:
            ((dnnl::impl::float8_e5m2_t *)data)[idx] = value;
            break;
        case dnnl_f8_e4m3:
            ((dnnl::impl::float8_e4m3_t *)data)[idx] = value;
            break;
        case dnnl_s4: {
            auto dst_val = ((dnnl::impl::nibble2_t *)data)[idx / 2];
            dst_val.set(dnnl::impl::int4_t(value).raw_bits_, idx % 2);
            ((dnnl::impl::nibble2_t *)data)[idx / 2] = dst_val;
            break;
        }
        case dnnl_u4: {
            auto dst_val = ((dnnl::impl::nibble2_t *)data)[idx / 2];
            dst_val.set(dnnl::impl::uint4_t(value).raw_bits_, idx % 2);
            ((dnnl::impl::nibble2_t *)data)[idx / 2] = dst_val;
            break;
        }
        case dnnl_f4_e2m1: {
            auto dst_val = ((dnnl::impl::nibble2_t *)data)[idx / 2];
            dst_val.set(dnnl::impl::float4_e2m1_t(value).raw_bits_, idx % 2);
            ((dnnl::impl::nibble2_t *)data)[idx / 2] = dst_val;
            break;
        }
        case dnnl_f4_e3m0: {
            auto dst_val = ((dnnl::impl::nibble2_t *)data)[idx / 2];
            dst_val.set(dnnl::impl::float4_e3m0_t(value).raw_bits_, idx % 2);
            ((dnnl::impl::nibble2_t *)data)[idx / 2] = dst_val;
            break;
        }
        default: assert(!"bad data type");
    }
}

// Returns an updated logical index based on input `logical_index` and
// `dims_mask`.
// `logical_idx` is a generally composed index where dims[ndims - 1] is most
// dense and dims[0] is least dense dimensions, as tensor in `abx` format.
// `dims_mask` represents dimensions to keep or remove. A value is composed of
// number of bits equal to `ndims` and value of `1` indicates the dimension to
// present in final calculation. E.g., mask=0 means a single value, and mask=2,
// or 1 << 1, means to keep dims[1] only.
// `ndims` allows to reduce the number of dimensions from innermost direction.
// It is used to find an index in batched dimensions. E.g., if ndims() returns
// `4`, but `ndims` argument is passed as `2`, it will count only first two
// dimensions instead of all 4.
// `groups` is an extension of scales when their dimensions are different.
// In this case, it's required to adjust dimensions values according to group
// values to properly compute stride in a smaller tensor. The definition is
// aligned with the API and expects groups of size 2 or empty.
// When `ndims()/ndims` are bigger than 2, groups are applied only to two most
// dense dimensions, which is aligned with 2D matmul definition. E.g., dims=2x6,
// mask=3, ndims=2 and groups=1x3; in this case indices 0,1,2 will return 0 as
// those indices represent the first group. 3,4,5 will return 1. Changing the
// example like dims=6x2, mask=3, ndims=2 and groups=3x1 will change ouput as
// well due to groups are dense not over the last dimension. The match will look
// like this: 0->0, 1->1, 2->0, 3->1, ..., 6->2, ..., 11->4.
//
// Helps to find an index of smaller tensor in bigger one, e.g., scales. Scales
// are usually represented by a 1D array and have a mask indicating dims to be
// applied for. It coincides with `dims_mask`.
// Another usage is to identify an index for operations that support broadcast
// over different dimensions, e.g., matmul batch dimensions, or binary
// primitive.
//
// Example: there's a tensor 3x4 and scales with mask=2, which means to apply
// scales over the dimension of 4. Passing indices from 0 to 3 will return
// values from 0 to 3 correspondently. However, passing `logical_idx` of 4 will
// return 0, as 4 is a logical representation of point 1x0.
int64_t dnn_mem_t::get_idx(int64_t logical_idx, int dims_mask, const int ndims,
        const dims_t &groups) const {
    if (dims_mask == 0) return 0;

    const auto &dims = this->dims();
    int64_t stride = 1;
    int64_t offset = 0;

    assert(groups.empty() || groups.size() == 2);
    assert(groups.size() <= static_cast<size_t>(ndims));
    dims_t groups_ext(ndims, 1);
    if (!groups.empty()) {
        groups_ext[ndims - 2] = groups[0];
        groups_ext[ndims - 1] = groups[1];
    }

    for (int i = 0; i < ndims; ++i) {
        int d = ndims - 1 - i;
        auto pos = logical_idx % dims[d];
        logical_idx /= dims[d];
        if (dims_mask & (1 << d)) {
            offset += (pos / groups_ext[d]) * stride;
            stride *= (dims[d] / groups_ext[d]);
        }
    }

    return offset;
}

// Creates a memory object from the underlying buffer of an existing memory
// object `mem`. The size of `mem` must not be less than the size of `md`.
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || defined(DNNL_WITH_SYCL)
static int init_memory(
        dnnl_memory_t *ret, const dnnl_memory_desc_t &md, dnnl_memory_t mem) {

    dnnl_engine_t engine;
    DNN_SAFE(dnnl_memory_get_engine(mem, &engine), CRIT);

    bool is_sycl = is_sycl_engine(engine);
    bool is_opencl = is_opencl_engine(engine);

    *ret = nullptr;

    const int nhandles = query_md_num_handles(md);
    std::vector<void *> handles(nhandles);
#ifdef DNNL_EXPERIMENTAL_SPARSE
    for (int i = 0; i < nhandles; i++)
        DNN_SAFE(dnnl_memory_get_data_handle_v2(mem, &handles[i], i), CRIT);
#else
    DNN_SAFE(dnnl_memory_get_data_handle(mem, &handles[0]), CRIT);
#endif

    if (is_opencl) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
        dnnl_ocl_interop_memory_kind_t mem_kind;
        DNN_SAFE(dnnl_ocl_interop_memory_get_memory_kind(mem, &mem_kind), CRIT);
#ifdef DNNL_EXPERIMENTAL_SPARSE
        DNN_SAFE(dnnl_ocl_interop_memory_create_v2(ret, md, engine, mem_kind,
                         (int)handles.size(), handles.data()),
                CRIT);
#else
        DNN_SAFE(dnnl_ocl_interop_memory_create(
                         ret, md, engine, mem_kind, handles[0]),
                CRIT);
#endif
#endif
    } else if (is_sycl) {
#ifdef DNNL_WITH_SYCL
        dnnl_sycl_interop_memory_kind_t mem_kind;
        DNN_SAFE(
                dnnl_sycl_interop_memory_get_memory_kind(mem, &mem_kind), CRIT);
#ifdef DNNL_EXPERIMENTAL_SPARSE
        DNN_SAFE(dnnl_sycl_interop_memory_create_v2(ret, md, engine, mem_kind,
                         (int)handles.size(), handles.data()),
                CRIT);
#else
        DNN_SAFE(dnnl_sycl_interop_memory_create(
                         ret, md, engine, mem_kind, handles[0]),
                CRIT);
#endif
#endif
    }

    // Memory must be initialized at this point in some of the branches above.
    if (!*ret) assert(!"not expected");

    return OK;
}
#endif

void dnn_mem_t::map() const {
    assert(!is_mapped_ && "memory is already mapped");
    is_mapped_ = true;

    if (!m_) return;
    auto mem = m_padded_ ? m_padded_ : m_;
    const int nhandles = query_md_num_handles(md_);
    mapped_ptrs_.resize(nhandles);
    for (int i = 0; i < nhandles; i++) {
#ifdef DNNL_EXPERIMENTAL_SPARSE
        auto st = dnnl_memory_map_data_v2(mem, &mapped_ptrs_[i], i);
#else
        auto st = dnnl_memory_map_data(mem, &mapped_ptrs_[i]);
#endif
        if (st != dnnl_success) {
            for (int j = 0; j < i; j++) {
#ifdef DNNL_EXPERIMENTAL_SPARSE
                DNN_SAFE_V(dnnl_memory_unmap_data_v2(mem, mapped_ptrs_[i], i));
#else
                DNN_SAFE_V(dnnl_memory_unmap_data(mem, mapped_ptrs_[i]));
#endif
            }
            DNN_SAFE_V(st);
        }
    }
}

void dnn_mem_t::unmap() const {
    assert(is_mapped_ && "memory is not mapped");
    is_mapped_ = false;

    if (!m_) return;
    auto mem = m_padded_ ? m_padded_ : m_;
    const int nhandles = query_md_num_handles(md_);
    for (int i = 0; i < nhandles; i++) {
#ifdef DNNL_EXPERIMENTAL_SPARSE
        DNN_SAFE_V(dnnl_memory_unmap_data_v2(mem, mapped_ptrs_[i], i));
#else
        DNN_SAFE_V(dnnl_memory_unmap_data(mem, mapped_ptrs_[i]));
#endif
        mapped_ptrs_[i] = nullptr;
    }
}

void dnn_mem_t::memset(int value, size_t size, int buffer_index) const {
    bool is_opencl = is_opencl_engine(engine_);
    bool is_sycl = is_sycl_engine(engine_);
    auto mem = m_padded_ ? m_padded_ : m_;
    void *mem_handle;
#ifdef DNNL_EXPERIMENTAL_SPARSE
    DNN_SAFE_V(dnnl_memory_get_data_handle_v2(mem, &mem_handle, buffer_index));
#else
    DNN_SAFE_V(dnnl_memory_get_data_handle(mem, &mem_handle));
#endif

    if (is_opencl) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
        stream_t stream(engine_);
        switch (memory_kind) {
            case memory_kind_ext_t::buffer: {
                auto buf = static_cast<cl_mem>(mem_handle);
                cl_command_queue queue;
                DNN_SAFE_V(dnnl_ocl_interop_stream_get_command_queue(
                        stream, &queue));
                cl_int err = clEnqueueFillBuffer(queue, buf, &value,
                        sizeof(uint8_t), 0, size, 0, nullptr, nullptr);
                if (err != CL_SUCCESS) SAFE_V(FAIL);
                DNN_SAFE_V(dnnl_stream_wait(stream));
                return;
            }
            case memory_kind_ext_t::usm:
            case memory_kind_ext_t::usm_device:
            case memory_kind_ext_t::usm_shared: {
                DNN_SAFE_V(dnnl::impl::xpu::ocl::usm::memset(
                        stream, mem_handle, value, size));
                DNN_SAFE_V(dnnl_stream_wait(stream));
                return;
            }
        }
#endif
    } else if (is_sycl) {
#ifdef DNNL_WITH_SYCL
        stream_t stream(engine_);
        void *queue_ptr;
        DNN_SAFE_V(dnnl_sycl_interop_stream_get_queue(stream, &queue_ptr));
        auto &queue = *static_cast<::sycl::queue *>(queue_ptr);
        switch (memory_kind) {
            case memory_kind_ext_t::buffer: {
                auto &buf = *static_cast<::sycl::buffer<uint8_t, 1> *>(
                        mem_handle);
                queue.submit([&](::sycl::handler &cgh) {
                    constexpr auto target_device = ::sycl::target::device;
                    ::sycl::accessor<uint8_t, 1, ::sycl::access::mode::write,
                            target_device>
                            acc(buf, cgh);
                    cgh.fill(acc, static_cast<uint8_t>(value));
                });
                DNN_SAFE_V(dnnl_stream_wait(stream));
                return;
            }
            case memory_kind_ext_t::usm:
            case memory_kind_ext_t::usm_device:
            case memory_kind_ext_t::usm_shared: {
                queue.submit([&](::sycl::handler &cgh) {
                    cgh.memset(mem_handle, value, size);
                });
                DNN_SAFE_V(dnnl_stream_wait(stream));
                return;
            }
        }
#endif
    }
    if (is_cpu(engine_)) {
        ::memset(mem_handle, value, size);
        return;
    }
    SAFE_V(FAIL);
}

dnn_mem_t dnn_mem_t::create_from_host_ptr(
        const dnnl_memory_desc_t &md, dnnl_engine_t engine, void *host_ptr) {
    return dnn_mem_t(md, engine, {true, host_ptr});
}

size_t dnn_mem_t::pad_memory_size(
        size_t sz, dnnl_engine_kind_t engine_kind, bool *was_padded) {
    if (was_padded) *was_padded = false;
    if (sz == 0 || !has_bench_mode_bit(mode_bit_t::corr)
            || engine_kind == dnnl_cpu)
        return sz;

    const size_t page_size = 4096;
    auto padded_sz = rnd_up(sz, page_size);
    if (was_padded) *was_padded = padded_sz != sz;
    return padded_sz;
}

dnnl_memory_desc_t dnn_mem_t::pad_memory_desc(const_dnnl_memory_desc_t md,
        dnnl_engine_kind_t engine_kind, bool *was_padded) {
    if (was_padded) *was_padded = false;
#ifdef DNNL_EXPERIMENTAL_SPARSE
    // TODO: add padded memory descriptor support for sparse memory.
    if (query_md_format_kind(md) == dnnl_format_kind_sparse) return nullptr;
#endif
    size_t old_sz = dnnl_memory_desc_get_size(md);
    if (old_sz == 0 || !has_bench_mode_bit(mode_bit_t::corr)
            || engine_kind == dnnl_cpu)
        return nullptr;

    size_t sz = pad_memory_size(old_sz, engine_kind, was_padded);
    if (sz == old_sz) return nullptr;

    dnnl_memory_desc_t ret;
    dnnl_dims_t dims = {(dnnl_dim_t)sz};
    DNN_SAFE_V(
            dnnl_memory_desc_create_with_tag(&ret, 1, dims, dnnl_u8, dnnl_x));
    return ret;
}

benchdnn_dnnl_wrapper_t<dnnl_memory_desc_t> dnn_mem_t::init_md(int ndims,
        const dnnl_dims_t dims, dnnl_data_type_t data_type,
        const std::string &tag_, const dims_t &strides_) {
    dnnl_memory_desc_t md {};
    const bool use_strides = !strides_.empty();
    // Ignore tag_ in case strides_ are explicitly provided
    if (use_strides) {
        std::vector<dnnl_dim_t> strides(strides_);
        DNN_SAFE_V(dnnl_memory_desc_create_with_strides(
                &md, ndims, dims, data_type, strides.data()));
        return md;
    }

    auto tag = normalize_tag(tag_, ndims);
    if (tag == tag::undef || tag == tag::any || ndims == 0) {
        dnnl_format_tag_t enum_tag = (tag == tag::undef || ndims == 0)
                ? dnnl_format_tag_undef
                : dnnl_format_tag_any;
        DNN_SAFE_V(dnnl_memory_desc_create_with_tag(
                &md, ndims, dims, data_type, enum_tag));
        return md;
    }

    DNN_SAFE_V(dnnl_memory_desc_create_with_string_tag(
            &md, ndims, dims, data_type, tag.data()));

    return md;
}

#ifdef DNNL_EXPERIMENTAL_SPARSE
benchdnn_dnnl_wrapper_t<dnnl_memory_desc_t> dnn_mem_t::init_csr_md(int ndims,
        const dnnl_dims_t dims, dnnl_data_type_t data_type, dnnl_dim_t nnz,
        dnnl_data_type_t indices_dt, dnnl_data_type_t pointers_dt) {
    dnnl_memory_desc_t md {};
    DNN_SAFE_V(dnnl_memory_desc_create_with_csr_encoding(
            &md, ndims, dims, data_type, nnz, indices_dt, pointers_dt));
    return md;
}

benchdnn_dnnl_wrapper_t<dnnl_memory_desc_t> dnn_mem_t::init_coo_md(int ndims,
        const dnnl_dims_t dims, dnnl_data_type_t data_type, dnnl_dim_t nnz,
        dnnl_data_type_t indices_dt) {
    dnnl_memory_desc_t md {};
    DNN_SAFE_V(dnnl_memory_desc_create_with_coo_encoding(
            &md, ndims, dims, data_type, nnz, indices_dt));
    return md;
}

benchdnn_dnnl_wrapper_t<dnnl_memory_desc_t> dnn_mem_t::init_sparse_packed_md(
        int ndims, const dnnl_dims_t dims, dnnl_data_type_t data_type,
        dnnl_dim_t nnz) {
    dnnl_memory_desc_t md {};
    DNN_SAFE_V(dnnl_memory_desc_create_with_packed_encoding(
            &md, ndims, dims, data_type, nnz));
    return md;
}
#endif

int dnn_mem_t::initialize_memory_create_sycl(const handle_info_t &handle_info) {
#ifdef DNNL_WITH_SYCL
    if (handle_info.is_host_ptr) {
        // Ignore memory_kind with host pointers and force USM.
#ifdef DNNL_EXPERIMENTAL_SPARSE
        const int nhandles = query_md_num_handles(md_);
        std::vector<void *> handles(nhandles, handle_info.ptr);
        DNN_SAFE(dnnl_sycl_interop_memory_create_v2(&m_, md_, engine_,
                         dnnl_sycl_interop_usm, (int)handles.size(),
                         handles.data()),
                CRIT);
#else
        DNN_SAFE(dnnl_sycl_interop_memory_create(&m_, md_, engine_,
                         dnnl_sycl_interop_usm, handle_info.ptr),
                CRIT);
#endif
        return OK;
    }

    auto md_padded = pad_memory_desc(md_, engine_kind_, &is_canary_protected_);
    if (!md_padded) md_padded = md_;

    switch (memory_kind) {
        case memory_kind_ext_t::usm:
        case memory_kind_ext_t::buffer: {
            dnnl_sycl_interop_memory_kind_t mem_kind
                    = (memory_kind == memory_kind_ext_t::usm
                                    ? dnnl_sycl_interop_usm
                                    : dnnl_sycl_interop_buffer);
#ifdef DNNL_EXPERIMENTAL_SPARSE
            const int nhandles = query_md_num_handles(md_);
            std::vector<void *> handles(nhandles, handle_info.ptr);
            DNN_SAFE(dnnl_sycl_interop_memory_create_v2(&m_padded_, md_padded,
                             engine_, mem_kind, (int)handles.size(),
                             handles.data()),
                    CRIT);
#else
            DNN_SAFE(dnnl_sycl_interop_memory_create(&m_padded_, md_padded,
                             engine_, mem_kind, handle_info.ptr),
                    CRIT);
#endif
            SAFE(init_memory(&m_, md_, m_padded_), CRIT);
            break;
        }
        case memory_kind_ext_t::usm_device:
        case memory_kind_ext_t::usm_shared: {
            SAFE(handle_info.is_allocate() ? OK : FAIL, CRIT);
            is_data_owner_ = true;

            auto eng = dnnl::engine(engine_, true);
            auto dev = dnnl::sycl_interop::get_device(eng);
            auto ctx = dnnl::sycl_interop::get_context(eng);

            const int nhandles = query_md_num_handles(md_);
            for (int i = 0; i < nhandles; i++) {
#ifdef DNNL_EXPERIMENTAL_SPARSE
                size_t sz = dnnl_memory_desc_get_size_v2(md_padded, i);
#else
                size_t sz = dnnl_memory_desc_get_size(md_padded);
#endif
                if (memory_kind == memory_kind_ext_t::usm_device) {
                    data_.push_back(::sycl::malloc_device(sz, dev, ctx));
                } else {
                    data_.push_back(::sycl::malloc_shared(sz, dev, ctx));
                }
                if (sz > 0 && !data_[i]) {
                    for (void *p : data_)
                        ::sycl::free(p, ctx);
                    DNN_SAFE(dnnl_out_of_memory, CRIT);
                }
            }
#ifdef DNNL_EXPERIMENTAL_SPARSE
            DNN_SAFE(dnnl_sycl_interop_memory_create_v2(&m_padded_, md_padded,
                             engine_, dnnl_sycl_interop_usm, (int)data_.size(),
                             data_.data()),
                    CRIT);
#else
            DNN_SAFE(dnnl_sycl_interop_memory_create(&m_padded_, md_padded,
                             engine_, dnnl_sycl_interop_usm, data_[0]),
                    CRIT);
#endif
            SAFE(init_memory(&m_, md_, m_padded_), CRIT);
            break;
        }
        default: assert(!"not expected");
    }
    if (md_padded != md_) DNN_SAFE(dnnl_memory_desc_destroy(md_padded), CRIT);

#else
    (void)handle_info;
#endif
    return OK;
}

int dnn_mem_t::initialize_memory_create_opencl(
        const handle_info_t &handle_info) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
    if (handle_info.is_host_ptr) {
        // Ignore memory_kind with host pointers and force USM.
#ifdef DNNL_EXPERIMENTAL_SPARSE
        const int nhandles = query_md_num_handles(md_);
        std::vector<void *> handles(nhandles, handle_info.ptr);
        DNN_SAFE(dnnl_ocl_interop_memory_create_v2(&m_, md_, engine_,
                         dnnl_ocl_interop_usm, (int)handles.size(),
                         handles.data()),
                CRIT);
#else
        DNN_SAFE(dnnl_ocl_interop_memory_create(&m_, md_, engine_,
                         dnnl_ocl_interop_usm, handle_info.ptr),
                CRIT);
#endif
        return OK;
    }

    SAFE(handle_info.is_allocate() ? OK : FAIL, CRIT);

    auto md_padded = pad_memory_desc(md_, engine_kind_, &is_canary_protected_);
    if (!md_padded) md_padded = md_;

    switch (memory_kind) {
        case memory_kind_ext_t::usm:
        case memory_kind_ext_t::buffer: {
            dnnl_ocl_interop_memory_kind_t mem_kind
                    = (memory_kind == memory_kind_ext_t::usm
                                    ? dnnl_ocl_interop_usm
                                    : dnnl_ocl_interop_buffer);
#ifdef DNNL_EXPERIMENTAL_SPARSE
            const int nhandles = query_md_num_handles(md_);
            std::vector<void *> handles(nhandles, handle_info.ptr);
            DNN_SAFE(dnnl_ocl_interop_memory_create_v2(&m_padded_, md_padded,
                             engine_, mem_kind, (int)handles.size(),
                             handles.data()),
                    CRIT);
#else
            DNN_SAFE(dnnl_ocl_interop_memory_create(&m_padded_, md_padded,
                             engine_, mem_kind, handle_info.ptr),
                    CRIT);
#endif
            SAFE(init_memory(&m_, md_, m_padded_), CRIT);
            break;
        }
        case memory_kind_ext_t::usm_device:
        case memory_kind_ext_t::usm_shared: {
            is_data_owner_ = true;

            const int nhandles = query_md_num_handles(md_);
            for (int i = 0; i < nhandles; i++) {
#ifdef DNNL_EXPERIMENTAL_SPARSE
                size_t sz = dnnl_memory_desc_get_size_v2(md_padded, i);
#else
                size_t sz = dnnl_memory_desc_get_size(md_padded);
#endif
                if (memory_kind == memory_kind_ext_t::usm_device) {
                    data_.push_back(dnnl::impl::xpu::ocl::usm::malloc_device(
                            engine_, sz));
                } else {
                    data_.push_back(dnnl::impl::xpu::ocl::usm::malloc_shared(
                            engine_, sz));
                }

                if (sz > 0 && !data_[i]) {
                    for (void *p : data_)
                        dnnl::impl::xpu::ocl::usm::free(engine_, p);
                    DNN_SAFE(dnnl_out_of_memory, CRIT);
                }
            }
#ifdef DNNL_EXPERIMENTAL_SPARSE
            DNN_SAFE(dnnl_ocl_interop_memory_create_v2(&m_padded_, md_padded,
                             engine_, dnnl_ocl_interop_usm, (int)data_.size(),
                             data_.data()),
                    CRIT);
#else
            DNN_SAFE(dnnl_ocl_interop_memory_create(&m_padded_, md_padded,
                             engine_, dnnl_ocl_interop_usm, data_[0]),
                    CRIT);
#endif
            SAFE(init_memory(&m_, md_, m_padded_), CRIT);
            break;
        }
        default: assert(!"not expected");
    }
    if (md_padded != md_) DNN_SAFE(dnnl_memory_desc_destroy(md_padded), CRIT);
#else
    (void)handle_info;
#endif
    return OK;
}

int dnn_mem_t::initialize_memory_create(const handle_info_t &handle_info) {
    bool is_sycl = is_sycl_engine(engine_);
    bool is_opencl = is_opencl_engine(engine_);

    if (handle_info.is_host_ptr) {
        // Host pointer can be used with CPU memory only.
        // XXX: assumption is that SYCL can work with native host pointers.
        SAFE(is_cpu(engine_) ? OK : FAIL, CRIT);
    }

    if (is_cpu(engine_) && handle_info.is_allocate() && !is_sycl) {
        // Allocate memory for native runtime directly.
        is_data_owner_ = true;
        const size_t alignment = 2 * 1024 * 1024;

        const int nhandles = query_md_num_handles(md_);
        for (int i = 0; i < nhandles; i++) {
#ifdef DNNL_EXPERIMENTAL_SPARSE
            size_t sz = dnnl_memory_desc_get_size_v2(md_, i);
#else
            size_t sz = dnnl_memory_desc_get_size(md_);
#endif
            data_.push_back(zmalloc(sz, alignment));
        }
        if (std::any_of(
                    data_.cbegin(), data_.cend(), [](void *p) { return !p; })) {
            for (void *p : data_)
                zfree(p);
            DNN_SAFE(dnnl_out_of_memory, CRIT);
        }
#ifdef DNNL_EXPERIMENTAL_SPARSE
        DNN_SAFE(dnnl_memory_create_v2(
                         &m_, md_, engine_, (int)data_.size(), data_.data()),
                CRIT);
#else
        DNN_SAFE(dnnl_memory_create(&m_, md_, engine_, data_[0]), CRIT);
#endif

    } else if (is_sycl) {
        SAFE(initialize_memory_create_sycl(handle_info), CRIT);
    } else if (is_opencl) {
        SAFE(initialize_memory_create_opencl(handle_info), CRIT);
    } else {
        is_data_owner_ = false;
        const int nhandles = query_md_num_handles(md_);
        std::vector<void *> handles(nhandles, handle_info.ptr);
#ifdef DNNL_EXPERIMENTAL_SPARSE
        DNN_SAFE(dnnl_memory_create_v2(&m_, md_, engine_, (int)handles.size(),
                         handles.data()),
                CRIT);
#else
        DNN_SAFE(dnnl_memory_create(&m_, md_, engine_, handles[0]), CRIT);
#endif
    }
    return OK;
}

int dnn_mem_t::initialize(
        dnnl_engine_t engine, const handle_info_t &handle_info) {
    is_mapped_ = false;
    engine_ = engine;
    engine_kind_ = query_engine_kind(engine_);

    SAFE(initialize_memory_create(handle_info), CRIT);

    if (handle_info.is_allocate()) {
        // Memory objects consisting of several buffers can rely on indirect
        // data access through metadata (e.g., sparse memory objects).
        // Filling metadata buffers with random values can lead to accessing an
        // address location not controlled by the process. Thus, such metadata
        // buffers must be always properly filled according to the driver rules.
        // Filling buffers requires them to be mapped.
        // To save code on updating every case separately, update the logic in
        // this common place.
        const bool mem_has_indirect_access = is_sparse_md();
        if (!has_bench_mode_modifier(mode_modifier_t::no_ref_memory)
                || mem_has_indirect_access)
            map();

        const int nhandles = query_md_num_handles(md_);
        for (int i = 0; i < nhandles; i++) {
#ifdef DNNL_EXPERIMENTAL_SPARSE
            size_t sz = dnnl_memory_desc_get_size_v2(md_, i);
#else
            size_t sz = dnnl_memory_desc_get_size(md_);
#endif
            if (is_canary_protected_) sz = pad_memory_size(sz, engine_kind_);
            // Do not fill a memory if its size is zero. Moreover, memset
            // expects defined pointer, nullptr is not allowed.
            if (sz != 0) {
                // Avoid costy data reorders for cold cache mode when
                // initializing cold cache buffers.
                // TODO: consider enabling broadly for perf mode.
                if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)
                        || cold_cache_input.cold_cache_mode_
                                != default_cold_cache_input()
                                           .cold_cache_mode_) {
                    // Fill memory directly with 0x3F3F3F3F (0.747059f) number.
                    this->memset(dnnl_mem_default_perf_test_value, sz, i);
                } else {
                    // Fill memory with a magic number (NAN for fp data types)
                    // to catch possible uninitialized access.
                    ::memset(mapped_ptrs_[i], dnnl_mem_default_value, sz);
                }
            }
        }
    }

    return OK;
}

static int cleanup_sycl(
        const dnnl_engine_t &engine, const std::vector<void *> &data) {
#ifdef DNNL_WITH_SYCL
    switch (memory_kind) {
        case memory_kind_ext_t::usm_device:
        case memory_kind_ext_t::usm_shared: {
            auto eng = dnnl::engine(engine, true);
            auto ctx = dnnl::sycl_interop::get_context(eng);
            for (void *p : data)
                ::sycl::free(p, ctx);
            break;
        }
        default: break;
    }
#endif
    return OK;
}

static int cleanup_opencl(
        const dnnl_engine_t &engine, const std::vector<void *> &data) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
    switch (memory_kind) {
        case memory_kind_ext_t::usm_device:
        case memory_kind_ext_t::usm_shared:
            for (void *p : data)
                dnnl::impl::xpu::ocl::usm::free(engine, p);
            break;
        default: break;
    }
#endif
    return OK;
}

int dnn_mem_t::cleanup() {
    if (!active_) return OK;
    if (!has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) unmap();
    DNN_SAFE(dnnl_memory_desc_destroy(md_), CRIT);
    DNN_SAFE(dnnl_memory_destroy(m_), CRIT);
    if (is_data_owner_) {
        if (is_sycl_engine(engine_)) {
            SAFE(cleanup_sycl(engine_, data_), CRIT);
        } else if (is_opencl_engine(engine_)) {
            SAFE(cleanup_opencl(engine_, data_), CRIT);
        } else {
            for (void *p : data_)
                zfree(p);
        }
    }
    DNN_SAFE(dnnl_memory_destroy(m_padded_), CRIT);
    return OK;
}

void dnn_mem_t::set_dt(dnnl_data_type_t dt) const {
    // NOLINTNEXTLINE(readability-make-member-function-const)
    dnnl_memory_desc_set_data_type(md_, dt);
}

// Queries from memory descriptor.
int dnn_mem_t::ndims() const {
    return query_md_ndims(md_);
}

// Can't merge two below because compiler doesn't like conversion from
// pointer to reference type.
const dnnl_dims_t &dnn_mem_t::dims() const {
    return query_md_dims(md_);
}

const dnnl_dims_t &dnn_mem_t::padded_dims() const {
    return query_md_padded_dims(md_);
}

dnnl_data_type_t dnn_mem_t::dt(int buffer_index) const {
    return query_md_data_type(md_, buffer_index);
}

const dnnl_dims_t &dnn_mem_t::padded_offsets() const {
    return query_md_padded_offsets(md_);
}

dnnl_dim_t dnn_mem_t::offset0() const {
    return query_md_submemory_offset(md_);
}

dnnl_format_kind_t dnn_mem_t::format_kind() const {
    return query_md_format_kind(md_);
}

const dnnl_dims_t &dnn_mem_t::strides() const {
    return query_md_strides(md_);
}

int dnn_mem_t::inner_nblks() const {
    return query_md_inner_nblks(md_);
}

const dnnl_dims_t &dnn_mem_t::inner_blks() const {
    return query_md_inner_blks(md_);
}

const dnnl_dims_t &dnn_mem_t::inner_idxs() const {
    return query_md_inner_idxs(md_);
}

// Returns physical offset by logical one. logical offset is represented by a
// scalar l_offset. If is_pos_padded is true, l_offset represents logical
// offset in already padded area.
static dnnl_dim_t md_off_l(dnnl_dims_t _pos, const dnn_mem_t &mem,
        dnnl_dim_t l_offset, bool is_pos_padded = false) {
    dnnl_dims_t pos;
    const auto &_dims = is_pos_padded ? mem.padded_dims() : mem.dims();
    for (int rd = 0; rd < mem.ndims(); ++rd) {
        const int d = mem.ndims() - 1 - rd;
        const dnnl_dim_t cur_dim = _dims[d];
        pos[d] = l_offset % cur_dim;
        if (_pos) _pos[d] = pos[d];
        l_offset /= cur_dim;
    }
    return md_off_v(mem, pos, is_pos_padded);
}

template <typename T>
static int check_zero_padding_impl(
        const dnn_mem_t &mem, int arg, res_t *res, int *error_count) {
    const int ndims = mem.ndims();
    const auto &dims = mem.dims();
    const auto &pdims = mem.padded_dims();

    if (ndims == 0) return OK;
    if (mem.format_kind() != dnnl_blocked) return OK;

    auto product = [](const dnnl_dim_t *beg, const dnnl_dim_t *end) {
        return std::accumulate(
                beg, end, (dnnl_dim_t)1, std::multiplies<dnnl_dim_t>());
    };

    int errors = 0;
    std::atomic<int> ok(true);

    for (int dim_m_idx = 0; dim_m_idx < ndims; ++dim_m_idx) {
        if (dims[dim_m_idx] == pdims[dim_m_idx]) continue;

        auto dim_l = product(pdims, pdims + dim_m_idx);
        auto dim_r = product(pdims + dim_m_idx + 1, pdims + ndims);

        benchdnn_parallel_nd(dim_l, dim_r, [&](dnnl_dim_t l, dnnl_dim_t r) {
            for (dnnl_dim_t m = dims[dim_m_idx]; m < pdims[dim_m_idx]; ++m) {
                auto l_idx = (l * pdims[dim_m_idx] + m) * dim_r + r;
                auto idx = md_off_l(nullptr, mem, l_idx, true);
                if (!(mem.get_elem(idx) == 0)) ok = false;
            }
        });

        // Run the check one more time to report incorrect elements. This check
        // is sequential.
        if (!ok) {
            for_(dnnl_dim_t l = 0; l < dim_l; ++l)
            for_(dnnl_dim_t m = dims[dim_m_idx]; m < pdims[dim_m_idx]; ++m)
            for (dnnl_dim_t r = 0; r < dim_r; ++r) {
                auto l_idx = (l * pdims[dim_m_idx] + m) * dim_r + r;
                dnnl_dims_t pos = {};
                auto idx = md_off_l(pos, mem, l_idx, true);

                bool idx_ok = (mem.get_elem(idx) == 0);
                if (!idx_ok) errors++;

                const bool dump = (!idx_ok && (errors < 10 || verbose >= 10))
                        || (verbose >= 99);
                if (dump) {
                    BENCHDNN_PRINT(0,
                            "[%4ld][arg:%d]"
                            "[" IFMT "," IFMT "," IFMT "," IFMT "," IFMT
                            "," IFMT "] fp:  0.f dt:% 9.6g \n",
                            (long)idx, arg, pos[0], pos[1], pos[2], pos[3],
                            pos[4], pos[5], mem.get_elem(idx));
                }
            }
        }
    }

    if (!ok) {
        BENCHDNN_PRINT(0, "@@@ [arg:%d] check_zero_padding failed\n", arg);
        if (res) res->state = FAILED;
    }

    if (error_count != nullptr) *error_count = errors;

    return ok ? OK : FAIL;
}

int check_zero_padding(
        const dnn_mem_t &mem, int arg, res_t *res, int *error_count) {
#define CASE(dt, type) \
    case dt: return check_zero_padding_impl<type>(mem, arg, res, error_count);

    switch (mem.dt()) {
        case dnnl_data_type_undef:
            return OK;

            CASE(dnnl_e8m0, dnnl::impl::float8_e8m0_t);
            CASE(dnnl_f8_e5m2, dnnl::impl::float8_e5m2_t);
            CASE(dnnl_f8_e4m3, dnnl::impl::float8_e4m3_t);
            CASE(dnnl_bf16, dnnl::impl::bfloat16_t);
            CASE(dnnl_f16, dnnl::impl::float16_t);
            CASE(dnnl_f32, float);
            CASE(dnnl_f64, double);
            CASE(dnnl_s32, int32_t);
            CASE(dnnl_s8, int8_t);
            CASE(dnnl_u8, uint8_t);
            CASE(dnnl_s4, dnnl::impl::int4_t);
            CASE(dnnl_u4, dnnl::impl::uint4_t);
            CASE(dnnl_f4_e2m1, dnnl::impl::float4_e2m1_t);
            CASE(dnnl_f4_e3m0, dnnl::impl::float4_e3m0_t);
        default: assert(!"bad data_type");
    };
#undef CASE

    return FAIL;
}

int check_buffer_overwrite(const dnn_mem_t &mem, int arg, res_t *res) {
    if (!mem.is_canary_protected()) return OK;

    size_t sz = mem.size();
    size_t sz_padded = dnn_mem_t::pad_memory_size(sz, mem.engine_kind());

    auto *mem_ptr = (const uint8_t *)mem;
    for (size_t i = sz; i < sz_padded; i++) {
        if (mem_ptr[i] == dnnl_mem_default_value) continue;

        BENCHDNN_PRINT(0,
                "@@@ [arg:%d] check_buffer_overwrite failed. Expected: %d at "
                "byte: %lld but found: %d\n",
                arg, dnnl_mem_default_value, (long long)i, mem_ptr[i]);
        if (res) res->state = FAILED;
        return FAIL;
    }
    return OK;
}

// Returns physical offset by logical one. Logical offset is represented by an
// array pos. If is_pos_padded is true pos represents the position in already
// padded area.
dnnl_dim_t md_off_v(
        const dnn_mem_t &mem, const dnnl_dims_t pos, bool is_pos_padded) {
    assert(mem.format_kind() == dnnl_blocked);

    dnnl_dims_t pos_copy = {0};
    for (int d = 0; d < mem.ndims(); ++d)
        pos_copy[d] = pos[d] + (is_pos_padded ? 0 : mem.padded_offsets()[d]);

    dnnl_dim_t phys_offset = mem.offset0();

    const int nblks = mem.inner_nblks();
    if (nblks > 0) {
        const auto &inner_idxs = mem.inner_idxs();
        const auto &inner_blks = mem.inner_blks();
        dnnl_dim_t blk_stride = 1;
        for (int iblk = nblks - 1; iblk >= 0; --iblk) {
            const int d = inner_idxs[iblk];

            dnnl_dim_t p = pos_copy[d] % inner_blks[iblk];
            pos_copy[d] /= inner_blks[iblk];

            phys_offset += p * blk_stride;
            blk_stride *= inner_blks[iblk];
        }
    }

    for (int d = 0; d < mem.ndims(); ++d) {
        const dnnl_dim_t p = pos_copy[d];
        phys_offset += p * mem.strides()[d];
    }

    return phys_offset;
}

bool has_sparse_md(const dnn_mem_map_t &dnn_mem_map) {
    for (const auto &e : dnn_mem_map) {
        const auto &m = e.second;
        if (m.is_sparse_md()) return true;
    }
    return false;
}

dnnl_memory_desc_t clone_md(const_dnnl_memory_desc_t md) {
    dnnl_memory_desc_t cloned_md;
    auto status = dnnl_memory_desc_clone(&cloned_md, md);
    if (status != dnnl_success) return nullptr;
    return cloned_md;
}