Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bunch of functional fixes (fixes MFDNN-13193, fixes MFDNN-13282, fixes MFDNN-13236) #2825

Merged
merged 13 commits into from
Mar 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/cpu/reorder/cpu_reorder_comp_s8_s8.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Intel Corporation
* Copyright 2020-2025 Intel Corporation
* Copyright 2023 FUJITSU LIMITED
*
* Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -28,6 +28,7 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
// s8 -> s8
{{s8, s8, 2}, {
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_copy_reorder_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
DNNL_NON_X64_ONLY(REG_SR(s8, oi, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp))
Expand All @@ -50,6 +51,7 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
// s8 -> s8
{{s8, s8, 3}, {
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_copy_reorder_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, wio, fmt_order::keep, spec::conv_req_comp))
Expand Down Expand Up @@ -88,6 +90,7 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
nullptr,
}},
{{s8, s8, 4}, {
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, hwio, fmt_order::keep, spec::conv_req_comp))
Expand Down Expand Up @@ -137,6 +140,7 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
nullptr,
}},
{{s8, s8, 5}, {
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, hwigo, fmt_order::keep, spec::conv_req_comp))
Expand Down Expand Up @@ -183,6 +187,7 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
nullptr,
}},
{{s8, s8, 6}, {
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, dhwigo, fmt_order::keep, spec::conv_req_comp))
Expand Down
5 changes: 1 addition & 4 deletions src/cpu/scale_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,10 @@ const float *precompute_scales(const memory_tracking::grantor_t &scratchpad,

const float *scales = nullptr;
if (req_copy_scales(attr, scale_adjust_factor)) {
const int wei_scale_mask = attr_scales.get_mask(DNNL_ARG_WEIGHTS);
assert(wei_scale_mask >= 0);

size_t size = 0;
auto loc_scales
= scratchpad.template get<float>(key_precomputed_scales, &size);
if (wei_scale_mask == 0 || wei_scale_count == 1) {
if (wei_scale_count == 1) {
const size_t count = nstl::min(size / sizeof(float), scales_simd_w);
utils::array_set(loc_scales,
src_scales[0] * wei_scales[0] * scale_adjust_factor, count);
Expand Down
8 changes: 5 additions & 3 deletions src/cpu/x64/jit_avx512_core_x8s8s32x_conv_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1771,9 +1771,11 @@ status_t jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
void jit_avx512_core_x8s8s32x_fwd_kernel::init_scratchpad(
memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp,
const primitive_attr_t &attr) {
const int wei_mask = attr.scales_.get_mask(DNNL_ARG_WEIGHTS);
const dim_t scales_count = wei_mask == 0 ? 1 : jcp.oc * jcp.ngroups;
dim_t count = wei_mask == 0 ? (dim_t)16 : scales_count;
dim_t count = 16;
if (!attr.scales_.has_default_values(DNNL_ARG_WEIGHTS)) {
const int wei_mask = attr.scales_.get_mask(DNNL_ARG_WEIGHTS);
if (wei_mask > 0) count = jcp.oc * jcp.ngroups;
}
scratchpad.book<float>(key_conv_adjusted_scales, count);
}

Expand Down
21 changes: 18 additions & 3 deletions src/cpu/x64/jit_uni_reorder_direct_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,9 +309,13 @@ status_t jit_uni_reorder_direct_copy_t::pd_t::init(
VDISPATCH_REORDER(src_d.similar_to(dst_d, true, false, 0),
VERBOSE_TENSOR_FORMAT_MISMATCH, "src", "dst");

VDISPATCH_REORDER(
utils::everyone_is(0UL, src_d.extra().flags, dst_d.extra().flags),
VERBOSE_UNSUPPORTED_MD_FLAG);
VDISPATCH_REORDER(src_d.extra().flags == dst_d.extra().flags,
VERBOSE_UNSUPPORTED_MD_FLAG, "src or dst");

VDISPATCH_REORDER(IMPLICATION(src_d.extra().flags > 0UL,
src_d.additional_buffer_size()
== dst_d.additional_buffer_size()),
VERBOSE_UNSUPPORTED_MD_FLAG, "src or dst");

VDISPATCH_REORDER(attr()->has_default_values(), VERBOSE_UNSUPPORTED_ATTR);

Expand Down Expand Up @@ -374,6 +378,17 @@ status_t jit_uni_reorder_direct_copy_t::execute(const exec_ctx_t &ctx) const {
out + (start + dst_d.offset0()) * dst_dt_size, end - start);
});

if (src_d.is_additional_buffer()) {
// Verified in pd_t::init();
assert(src_d.extra().flags == dst_d.extra().flags);

const auto additional_size = src_d.additional_buffer_size();
const auto data_size = src_d.size(/* index = */ 0,
/* include_additional_size = */ false);
std::memcpy(out + data_size * dst_dt_size, in + data_size * src_dt_size,
additional_size);
}

return status::success;
}

Expand Down
10 changes: 8 additions & 2 deletions tests/benchdnn/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <algorithm>
#include <cctype>
#include <cerrno>
#include <fstream>
#include <functional>
#include <string>
Expand Down Expand Up @@ -206,6 +207,7 @@ static void *zmalloc_protect(size_t size) {
// Protect one page right after the block of size bytes
int err = mprotect(ptr_protect, page_sz, PROT_NONE);
if (err != 0) {
printf("Error: mprotect returned \'%s\'.\n", strerror(errno));
::free(ptr_start);
return nullptr;
}
Expand Down Expand Up @@ -239,7 +241,10 @@ static void zfree_protect(void *ptr) {

void *zmalloc(size_t size, size_t align) {
#ifdef BENCHDNN_MEMORY_CHECK
if (has_bench_mode_bit(mode_bit_t::exec)) { return zmalloc_protect(size); }
if (has_bench_mode_bit(mode_bit_t::exec)
&& !has_bench_mode_bit(mode_bit_t::perf)) {
return zmalloc_protect(size);
}
#endif

void *ptr;
Expand All @@ -264,7 +269,8 @@ void *zmalloc(size_t size, size_t align) {
void zfree(void *ptr) {
if (!ptr) return;
#ifdef BENCHDNN_MEMORY_CHECK
if (has_bench_mode_bit(mode_bit_t::exec)) {
if (has_bench_mode_bit(mode_bit_t::exec)
&& !has_bench_mode_bit(mode_bit_t::perf)) {
zfree_protect(ptr);
return;
}
Expand Down
38 changes: 35 additions & 3 deletions tests/benchdnn/dnnl_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,14 @@ size_t dnn_mem_t::size() const {
return dnnl_memory_desc_get_size(md_);
}

// Reports whether the underlying memory descriptor uses a sparse encoding.
// Sparse memories are accessed indirectly through metadata buffers and thus
// need dedicated handling (see the `no_ref_memory` modifier logic).
bool dnn_mem_t::is_sparse_md() const {
#ifdef DNNL_EXPERIMENTAL_SPARSE
    const auto encoding = query_md_sparse_encoding(md_);
    return encoding != dnnl_sparse_encoding_undef;
#else
    // Without experimental sparse support no memory object can be sparse.
    return false;
#endif
}

// Size in bytes of a single element of this memory's data type.
size_t dnn_mem_t::sizeof_dt() const {
    const auto data_type = dt();
    return dnnl_data_type_size(data_type);
}
Expand Down Expand Up @@ -478,12 +486,17 @@ void dnn_mem_t::unmap() const {
}
}

void dnn_mem_t::memset(int value, size_t size) const {
void dnn_mem_t::memset(int value, size_t size, int buffer_index) const {
bool is_opencl = is_opencl_engine(engine_);
bool is_sycl = is_sycl_engine(engine_);
auto mem = m_padded_ ? m_padded_ : m_;
void *mem_handle;
#ifdef DNNL_EXPERIMENTAL_SPARSE
DNN_SAFE_V(dnnl_memory_get_data_handle_v2(mem, &mem_handle, buffer_index));
#else
DNN_SAFE_V(dnnl_memory_get_data_handle(mem, &mem_handle));
#endif

if (is_opencl) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
stream_t stream(engine_);
Expand Down Expand Up @@ -900,7 +913,18 @@ int dnn_mem_t::initialize(
SAFE(initialize_memory_create(handle_info), CRIT);

if (handle_info.is_allocate()) {
if (!has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) map();
// Memory objects consisting of several buffers can rely on indirect
// data access through metadata (e.g., sparse memory objects).
// Filling metadata buffers with random values can lead to accessing an
// address location not controlled by the process. Thus, such metadata
// buffers must be always properly filled according to the driver rules.
// Filling buffers requires them to be mapped.
// To save code on updating every case separately, update the logic in
// this common place.
const bool mem_has_indirect_access = is_sparse_md();
if (!has_bench_mode_modifier(mode_modifier_t::no_ref_memory)
|| mem_has_indirect_access)
map();

const int nhandles = query_md_num_handles(md_);
for (int i = 0; i < nhandles; i++) {
Expand All @@ -921,7 +945,7 @@ int dnn_mem_t::initialize(
!= default_cold_cache_input()
.cold_cache_mode_) {
// Fill memory directly with 0x3F3F3F3F (0.747059f) number.
this->memset(dnnl_mem_default_perf_test_value, sz);
this->memset(dnnl_mem_default_perf_test_value, sz, i);
} else {
// Fill memory with a magic number (NAN for fp data types)
// to catch possible uninitialized access.
Expand Down Expand Up @@ -1211,6 +1235,14 @@ dnnl_dim_t md_off_v(
return phys_offset;
}

bool has_sparse_md(const dnn_mem_map_t &dnn_mem_map) {
for (const auto &e : dnn_mem_map) {
const auto &m = e.second;
if (m.is_sparse_md()) return true;
}
return false;
}

dnnl_memory_desc_t clone_md(const_dnnl_memory_desc_t md) {
dnnl_memory_desc_t cloned_md;
auto status = dnnl_memory_desc_clone(&cloned_md, md);
Expand Down
11 changes: 9 additions & 2 deletions tests/benchdnn/dnnl_memory.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2017-2024 Intel Corporation
* Copyright 2017-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -115,6 +115,11 @@ struct dnn_mem_t {
const dnnl_dims_t &inner_blks() const;
const dnnl_dims_t &inner_idxs() const;

// Sparse memories require special handling for `no_ref_memory` modifier
// because of indirect access. Thus, filling should apply to metadata and
// it must be meaningful. It implies unconditional mapping.
bool is_sparse_md() const;

size_t sizeof_dt() const;

template <typename T>
Expand Down Expand Up @@ -150,7 +155,7 @@ struct dnn_mem_t {

void map() const;
void unmap() const;
void memset(int value, size_t size) const;
void memset(int value, size_t size, int buffer_index) const;

static dnn_mem_t create_from_host_ptr(
const dnnl_memory_desc_t &md, dnnl_engine_t engine, void *host_ptr);
Expand Down Expand Up @@ -216,6 +221,8 @@ struct dnn_mem_t {

using dnn_mem_map_t = std::unordered_map<int, dnn_mem_t>;

bool has_sparse_md(const dnn_mem_map_t &dnn_mem_map);

dnnl_memory_desc_t clone_md(const_dnnl_memory_desc_t md);

// Checks that zero padding is preserved.
Expand Down
2 changes: 1 addition & 1 deletion tests/benchdnn/doc/knobs_common.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ only. This mode targets forward and backward by data propagation kinds. When
This targets any propagation kind but mostly bandwidth-limited functionality
to emulate first access to data or branching cases. When `MODE` is set to
`custom`, cold cache is enabled for specified arguments, but it requires source
code adjustments. Refer to [cold cache](cold_cache.md) for more information.
code adjustments. Refer to [cold cache](knob_cold_cache.md) for more information.

### --fix-times-per-prb
`--fix-times-per-prb=N` specifies the `N` number of rounds per problem to run,
Expand Down
11 changes: 11 additions & 0 deletions tests/benchdnn/inputs/reorder/harness_reorder_large
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# test if jit kernels properly handle corner cases:
# * large stride problems
# * huge dimensions (UINT_MAX + 1)
--reset
--skip-impl=ref,simple # run only jit impl, won't iterate
--sdt=f32
--ddt=f32
--stag=abx
--dtag=aBx8b
2x16x19200x19200
1x4294967296x1
13 changes: 3 additions & 10 deletions tests/benchdnn/inputs/reorder/test_reorder_all
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,6 @@
--stag=aBx4b,aBx8b --dtag=aBx16b 2x71x16x16 2x72x16x16 2x73x16x16
--stag=aBx16b --dtag=aBx8b 2x71x16x16 2x72x16x16 2x73x16x16

# test if jit kernels properly handle corner cases:
# * large stride problems
# * huge dimensions (UINT_MAX + 1)
--reset
--skip-impl=ref,simple # ! test jit version only
--sdt=f32 --ddt=f32
--stag=abx --dtag=aBx8b 2x16x19200x19200
--skip-impl=
1x4294967296x1

# f16
--batch=test_reorder_float16

Expand Down Expand Up @@ -102,3 +92,6 @@

# Decompression quantization
--batch=harness_reorder_decompression

# large problems
--batch=harness_reorder_large
1 change: 1 addition & 0 deletions tests/benchdnn/inputs/reorder/test_reorder_gpu
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@

# Catch overflows
--reset
--skip-impl=ref
2147483648_n"int_overflow"
4294967296_n"uint_overflow"
2147483869_n"nd_range_overflow"
23 changes: 20 additions & 3 deletions tests/benchdnn/matmul/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,9 @@ int fill_sparse_data(data_kind_t kind, const prb_t *prb, dnn_mem_t &mem_dt,
mem_dt.set_elem(i, index, indices_idx);
});

// Don't fill data for `no_ref_memory` as it will be filled by benchdnn.
if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) return OK;

// Generate values.
cfg_t cfg(prb, {SRC, WEI, BIA, DST});

Expand Down Expand Up @@ -783,7 +786,16 @@ std::vector<int> supported_exec_args(dir_t dir) {
int init_ref_memory_args(dnn_mem_map_t &ref_mem_map, dnn_mem_map_t &mem_map,
dnnl_primitive_t prim, const prb_t *prb, res_t *res,
dnnl_primitive_t prim_ref) {
if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) return OK;
// Sparse functionality relies on indirect access to the data. While the
// data itself can be anything for `no_ref_memory` modifier, metadata values
// must be meaningful, otherwise a jump to a random memory location outside
// of allocated bytes will happen.
// If there's a sparse memory, non-sparse memory and non-metadata handles
// will not reach the filling.
const bool map_has_sparse_mem = has_sparse_md(mem_map);
if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)
&& !map_has_sparse_mem)
return OK;

const auto &ref_engine = get_cpu_engine();

Expand All @@ -805,13 +817,18 @@ int init_ref_memory_args(dnn_mem_map_t &ref_mem_map, dnn_mem_map_t &mem_map,

const bool is_sparse_src = exec_arg == DNNL_ARG_SRC
&& src_encoding != dnnl_sparse_encoding_undef;

const bool is_sparse_wei = exec_arg == DNNL_ARG_WEIGHTS
&& wei_encoding != dnnl_sparse_encoding_undef;
const bool is_sparse = is_sparse_src || is_sparse_wei;
const bool is_sparse_wei_packed
= is_sparse_wei && wei_encoding == dnnl_packed;

if ((is_sparse_src || is_sparse_wei) && !is_sparse_wei_packed) {
// See the comment at the beginning of the function.
if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)
&& !is_sparse)
continue;

if (is_sparse && !is_sparse_wei_packed) {
if (is_sparse_src) {
auto src_fp_d = create_md(prb, SRC);
ref_mem_map.emplace(exec_arg, dnn_mem_t(src_fp_d, ref_engine));
Expand Down
7 changes: 5 additions & 2 deletions tests/benchdnn/reorder/reorder_aux.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2017-2024 Intel Corporation
* Copyright 2017-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -39,7 +39,10 @@ flag_t str2flag(const char *str) {
else if (sub.compare("zp_comp") == 0)
flag = FLAG_ZP_COMP;
else {
assert(!"unknown flag");
BENCHDNN_PRINT(0,
"Error: unsupported flag value \'%s\'. Supported values are "
"\'s8s8_comp\' and \'zp_comp\'.\n",
sub.c_str());
SAFE_V(FAIL);
}

Expand Down
Loading
Loading