Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bunch of functional fixes (fixes MFDNN-13193, fixes MFDNN-13282, fixes MFDNN-13236) #2825

Merged
merged 13 commits into from
Mar 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/cpu/reorder/cpu_reorder_comp_s8_s8.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Intel Corporation
* Copyright 2020-2025 Intel Corporation
* Copyright 2023 FUJITSU LIMITED
*
* Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -28,6 +28,7 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
// s8 -> s8
{{s8, s8, 2}, {
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_copy_reorder_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
DNNL_NON_X64_ONLY(REG_SR(s8, oi, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp))
Expand All @@ -50,6 +51,7 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
// s8 -> s8
{{s8, s8, 3}, {
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_copy_reorder_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, wio, fmt_order::keep, spec::conv_req_comp))
Expand Down Expand Up @@ -88,6 +90,7 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
nullptr,
}},
{{s8, s8, 4}, {
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, hwio, fmt_order::keep, spec::conv_req_comp))
Expand Down Expand Up @@ -137,6 +140,7 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
nullptr,
}},
{{s8, s8, 5}, {
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, hwigo, fmt_order::keep, spec::conv_req_comp))
Expand Down Expand Up @@ -183,6 +187,7 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
nullptr,
}},
{{s8, s8, 6}, {
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, dhwigo, fmt_order::keep, spec::conv_req_comp))
Expand Down
5 changes: 1 addition & 4 deletions src/cpu/scale_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,10 @@ const float *precompute_scales(const memory_tracking::grantor_t &scratchpad,

const float *scales = nullptr;
if (req_copy_scales(attr, scale_adjust_factor)) {
const int wei_scale_mask = attr_scales.get_mask(DNNL_ARG_WEIGHTS);
assert(wei_scale_mask >= 0);

size_t size = 0;
auto loc_scales
= scratchpad.template get<float>(key_precomputed_scales, &size);
if (wei_scale_mask == 0 || wei_scale_count == 1) {
if (wei_scale_count == 1) {
const size_t count = nstl::min(size / sizeof(float), scales_simd_w);
utils::array_set(loc_scales,
src_scales[0] * wei_scales[0] * scale_adjust_factor, count);
Expand Down
8 changes: 5 additions & 3 deletions src/cpu/x64/jit_avx512_core_x8s8s32x_conv_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1771,9 +1771,11 @@ status_t jit_avx512_core_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp,
void jit_avx512_core_x8s8s32x_fwd_kernel::init_scratchpad(
memory_tracking::registrar_t &scratchpad, const jit_conv_conf_t &jcp,
const primitive_attr_t &attr) {
const int wei_mask = attr.scales_.get_mask(DNNL_ARG_WEIGHTS);
const dim_t scales_count = wei_mask == 0 ? 1 : jcp.oc * jcp.ngroups;
dim_t count = wei_mask == 0 ? (dim_t)16 : scales_count;
dim_t count = 16;
if (!attr.scales_.has_default_values(DNNL_ARG_WEIGHTS)) {
const int wei_mask = attr.scales_.get_mask(DNNL_ARG_WEIGHTS);
if (wei_mask > 0) count = jcp.oc * jcp.ngroups;
}
scratchpad.book<float>(key_conv_adjusted_scales, count);
}

Expand Down
21 changes: 18 additions & 3 deletions src/cpu/x64/jit_uni_reorder_direct_copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,9 +309,13 @@ status_t jit_uni_reorder_direct_copy_t::pd_t::init(
VDISPATCH_REORDER(src_d.similar_to(dst_d, true, false, 0),
VERBOSE_TENSOR_FORMAT_MISMATCH, "src", "dst");

VDISPATCH_REORDER(
utils::everyone_is(0UL, src_d.extra().flags, dst_d.extra().flags),
VERBOSE_UNSUPPORTED_MD_FLAG);
VDISPATCH_REORDER(src_d.extra().flags == dst_d.extra().flags,
VERBOSE_UNSUPPORTED_MD_FLAG, "src or dst");

VDISPATCH_REORDER(IMPLICATION(src_d.extra().flags > 0UL,
src_d.additional_buffer_size()
== dst_d.additional_buffer_size()),
VERBOSE_UNSUPPORTED_MD_FLAG, "src or dst");

VDISPATCH_REORDER(attr()->has_default_values(), VERBOSE_UNSUPPORTED_ATTR);

Expand Down Expand Up @@ -374,6 +378,17 @@ status_t jit_uni_reorder_direct_copy_t::execute(const exec_ctx_t &ctx) const {
out + (start + dst_d.offset0()) * dst_dt_size, end - start);
});

if (src_d.is_additional_buffer()) {
// Verified in pd_t::init();
assert(src_d.extra().flags == dst_d.extra().flags);

const auto additional_size = src_d.additional_buffer_size();
const auto data_size = src_d.size(/* index = */ 0,
/* include_additional_size = */ false);
std::memcpy(out + data_size * dst_dt_size, in + data_size * src_dt_size,
additional_size);
}

return status::success;
}

Expand Down
10 changes: 8 additions & 2 deletions tests/benchdnn/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <algorithm>
#include <cctype>
#include <cerrno>
#include <fstream>
#include <functional>
#include <string>
Expand Down Expand Up @@ -206,6 +207,7 @@ static void *zmalloc_protect(size_t size) {
// Protect one page right after the block of size bytes
int err = mprotect(ptr_protect, page_sz, PROT_NONE);
if (err != 0) {
printf("Error: mprotect returned \'%s\'.\n", strerror(errno));
::free(ptr_start);
return nullptr;
}
Expand Down Expand Up @@ -239,7 +241,10 @@ static void zfree_protect(void *ptr) {

void *zmalloc(size_t size, size_t align) {
#ifdef BENCHDNN_MEMORY_CHECK
if (has_bench_mode_bit(mode_bit_t::exec)) { return zmalloc_protect(size); }
if (has_bench_mode_bit(mode_bit_t::exec)
&& !has_bench_mode_bit(mode_bit_t::perf)) {
return zmalloc_protect(size);
}
#endif

void *ptr;
Expand All @@ -264,7 +269,8 @@ void *zmalloc(size_t size, size_t align) {
void zfree(void *ptr) {
if (!ptr) return;
#ifdef BENCHDNN_MEMORY_CHECK
if (has_bench_mode_bit(mode_bit_t::exec)) {
if (has_bench_mode_bit(mode_bit_t::exec)
&& !has_bench_mode_bit(mode_bit_t::perf)) {
zfree_protect(ptr);
return;
}
Expand Down
38 changes: 35 additions & 3 deletions tests/benchdnn/dnnl_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,14 @@ size_t dnn_mem_t::size() const {
return dnnl_memory_desc_get_size(md_);
}

// Reports whether the underlying memory descriptor uses a sparse encoding.
// Sparse memories are accessed indirectly through metadata buffers and thus
// need dedicated handling (see the `no_ref_memory` modifier logic).
bool dnn_mem_t::is_sparse_md() const {
#ifdef DNNL_EXPERIMENTAL_SPARSE
    const auto encoding = query_md_sparse_encoding(md_);
    return encoding != dnnl_sparse_encoding_undef;
#else
    // Without experimental sparse support no memory object can be sparse.
    return false;
#endif
}

// Size in bytes of a single element of this memory's data type.
size_t dnn_mem_t::sizeof_dt() const {
    const auto data_type = dt();
    return dnnl_data_type_size(data_type);
}
Expand Down Expand Up @@ -478,12 +486,17 @@ void dnn_mem_t::unmap() const {
}
}

void dnn_mem_t::memset(int value, size_t size) const {
void dnn_mem_t::memset(int value, size_t size, int buffer_index) const {
bool is_opencl = is_opencl_engine(engine_);
bool is_sycl = is_sycl_engine(engine_);
auto mem = m_padded_ ? m_padded_ : m_;
void *mem_handle;
#ifdef DNNL_EXPERIMENTAL_SPARSE
DNN_SAFE_V(dnnl_memory_get_data_handle_v2(mem, &mem_handle, buffer_index));
#else
DNN_SAFE_V(dnnl_memory_get_data_handle(mem, &mem_handle));
#endif

if (is_opencl) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
stream_t stream(engine_);
Expand Down Expand Up @@ -900,7 +913,18 @@ int dnn_mem_t::initialize(
SAFE(initialize_memory_create(handle_info), CRIT);

if (handle_info.is_allocate()) {
if (!has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) map();
// Memory objects consisting of several buffers can rely on indirect
// data access through metadata (e.g., sparse memory objects).
// Filling metadata buffers with random values can lead to accessing an
// address location not controlled by the process. Thus, such metadata
// buffers must be always properly filled according to the driver rules.
// Filling buffers requires them to be mapped.
// To save code on updating every case separately, update the logic in
// this common place.
const bool mem_has_indirect_access = is_sparse_md();
if (!has_bench_mode_modifier(mode_modifier_t::no_ref_memory)
|| mem_has_indirect_access)
map();

const int nhandles = query_md_num_handles(md_);
for (int i = 0; i < nhandles; i++) {
Expand All @@ -921,7 +945,7 @@ int dnn_mem_t::initialize(
!= default_cold_cache_input()
.cold_cache_mode_) {
// Fill memory directly with 0x3F3F3F3F (0.747059f) number.
this->memset(dnnl_mem_default_perf_test_value, sz);
this->memset(dnnl_mem_default_perf_test_value, sz, i);
} else {
// Fill memory with a magic number (NAN for fp data types)
// to catch possible uninitialized access.
Expand Down Expand Up @@ -1211,6 +1235,14 @@ dnnl_dim_t md_off_v(
return phys_offset;
}

bool has_sparse_md(const dnn_mem_map_t &dnn_mem_map) {
for (const auto &e : dnn_mem_map) {
const auto &m = e.second;
if (m.is_sparse_md()) return true;
}
return false;
}

dnnl_memory_desc_t clone_md(const_dnnl_memory_desc_t md) {
dnnl_memory_desc_t cloned_md;
auto status = dnnl_memory_desc_clone(&cloned_md, md);
Expand Down
11 changes: 9 additions & 2 deletions tests/benchdnn/dnnl_memory.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2017-2024 Intel Corporation
* Copyright 2017-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -115,6 +115,11 @@ struct dnn_mem_t {
const dnnl_dims_t &inner_blks() const;
const dnnl_dims_t &inner_idxs() const;

// Sparse memories require special handling for `no_ref_memory` modifier
// because of indirect access. Thus, filling should apply to metadata and
// it must be meaningful. It implies unconditional mapping.
bool is_sparse_md() const;

size_t sizeof_dt() const;

template <typename T>
Expand Down Expand Up @@ -150,7 +155,7 @@ struct dnn_mem_t {

void map() const;
void unmap() const;
void memset(int value, size_t size) const;
void memset(int value, size_t size, int buffer_index) const;

static dnn_mem_t create_from_host_ptr(
const dnnl_memory_desc_t &md, dnnl_engine_t engine, void *host_ptr);
Expand Down Expand Up @@ -216,6 +221,8 @@ struct dnn_mem_t {

using dnn_mem_map_t = std::unordered_map<int, dnn_mem_t>;

bool has_sparse_md(const dnn_mem_map_t &dnn_mem_map);

dnnl_memory_desc_t clone_md(const_dnnl_memory_desc_t md);

// Checks that zero padding is preserved.
Expand Down
2 changes: 1 addition & 1 deletion tests/benchdnn/doc/knobs_common.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ only. This mode targets forward and backward by data propagation kinds. When
This targets any propagation kind but mostly bandwidth-limited functionality
to emulate first access to data or branching cases. When `MODE` is set to
`custom`, cold cache is enabled for specified arguments, but it requires source
code adjustments. Refer to [cold cache](cold_cache.md) for more information.
code adjustments. Refer to [cold cache](knob_cold_cache.md) for more information.

### --fix-times-per-prb
`--fix-times-per-prb=N` specifies the `N` number of rounds per problem to run,
Expand Down
11 changes: 11 additions & 0 deletions tests/benchdnn/inputs/reorder/harness_reorder_large
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# test if jit kernels properly handle corner cases:
# * large stride problems
# * huge dimensions (UINT_MAX + 1)
--reset
--skip-impl=ref,simple # run only jit impl, won't iterate
--sdt=f32
--ddt=f32
--stag=abx
--dtag=aBx8b
2x16x19200x19200
1x4294967296x1
13 changes: 3 additions & 10 deletions tests/benchdnn/inputs/reorder/test_reorder_all
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,6 @@
--stag=aBx4b,aBx8b --dtag=aBx16b 2x71x16x16 2x72x16x16 2x73x16x16
--stag=aBx16b --dtag=aBx8b 2x71x16x16 2x72x16x16 2x73x16x16

# test if jit kernels properly handle corner cases:
# * large stride problems
# * huge dimensions (UINT_MAX + 1)
--reset
--skip-impl=ref,simple # ! test jit version only
--sdt=f32 --ddt=f32
--stag=abx --dtag=aBx8b 2x16x19200x19200
--skip-impl=
1x4294967296x1

# f16
--batch=test_reorder_float16

Expand Down Expand Up @@ -102,3 +92,6 @@

# Decompression quantization
--batch=harness_reorder_decompression

# large problems
--batch=harness_reorder_large
1 change: 1 addition & 0 deletions tests/benchdnn/inputs/reorder/test_reorder_gpu
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@

# Catch overflows
--reset
--skip-impl=ref
2147483648_n"int_overflow"
4294967296_n"uint_overflow"
2147483869_n"nd_range_overflow"
23 changes: 20 additions & 3 deletions tests/benchdnn/matmul/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,9 @@ int fill_sparse_data(data_kind_t kind, const prb_t *prb, dnn_mem_t &mem_dt,
mem_dt.set_elem(i, index, indices_idx);
});

// Don't fill data for `no_ref_memory` as it will be filled by benchdnn.
if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) return OK;

// Generate values.
cfg_t cfg(prb, {SRC, WEI, BIA, DST});

Expand Down Expand Up @@ -783,7 +786,16 @@ std::vector<int> supported_exec_args(dir_t dir) {
int init_ref_memory_args(dnn_mem_map_t &ref_mem_map, dnn_mem_map_t &mem_map,
dnnl_primitive_t prim, const prb_t *prb, res_t *res,
dnnl_primitive_t prim_ref) {
if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)) return OK;
// Sparse functionality relies on indirect access to the data. While the
// data itself can be anything for `no_ref_memory` modifier, metadata values
// must be meaningful, otherwise a jump to a random memory location outside
// of allocated bytes will happen.
// If there's a sparse memory, non-sparse memory and non-metadata handles
// will not reach the filling.
const bool map_has_sparse_mem = has_sparse_md(mem_map);
if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)
&& !map_has_sparse_mem)
return OK;

const auto &ref_engine = get_cpu_engine();

Expand All @@ -805,13 +817,18 @@ int init_ref_memory_args(dnn_mem_map_t &ref_mem_map, dnn_mem_map_t &mem_map,

const bool is_sparse_src = exec_arg == DNNL_ARG_SRC
&& src_encoding != dnnl_sparse_encoding_undef;

const bool is_sparse_wei = exec_arg == DNNL_ARG_WEIGHTS
&& wei_encoding != dnnl_sparse_encoding_undef;
const bool is_sparse = is_sparse_src || is_sparse_wei;
const bool is_sparse_wei_packed
= is_sparse_wei && wei_encoding == dnnl_packed;

if ((is_sparse_src || is_sparse_wei) && !is_sparse_wei_packed) {
// See the comment at the beginning of the function.
if (has_bench_mode_modifier(mode_modifier_t::no_ref_memory)
&& !is_sparse)
continue;

if (is_sparse && !is_sparse_wei_packed) {
if (is_sparse_src) {
auto src_fp_d = create_md(prb, SRC);
ref_mem_map.emplace(exec_arg, dnn_mem_t(src_fp_d, ref_engine));
Expand Down
7 changes: 5 additions & 2 deletions tests/benchdnn/reorder/reorder_aux.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2017-2024 Intel Corporation
* Copyright 2017-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -39,7 +39,10 @@ flag_t str2flag(const char *str) {
else if (sub.compare("zp_comp") == 0)
flag = FLAG_ZP_COMP;
else {
assert(!"unknown flag");
BENCHDNN_PRINT(0,
"Error: unsupported flag value \'%s\'. Supported values are "
"\'s8s8_comp\' and \'zp_comp\'.\n",
sub.c_str());
SAFE_V(FAIL);
}

Expand Down
Loading
Loading