From db0deb36d4003d2dcf3bf4d1e1dcf3397ca9aa19 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 18 Mar 2025 12:48:10 -0700 Subject: [PATCH 01/12] benchdnn: matmul: adjust int4 invalid cases --- tests/benchdnn/matmul/matmul.cpp | 42 ++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/tests/benchdnn/matmul/matmul.cpp b/tests/benchdnn/matmul/matmul.cpp index b07c4bee515..dd5e788ac43 100644 --- a/tests/benchdnn/matmul/matmul.cpp +++ b/tests/benchdnn/matmul/matmul.cpp @@ -703,15 +703,41 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) { } } + // Check int4 weights byte alignment if format is specified. if ((prb->wei_dt() == dnnl_s4 || prb->wei_dt() == dnnl_u4) - && (prb->n % 2)) { - BENCHDNN_PRINT(2, - "[INVALID][%s:%d]: Int4 Weights decompression requires OC " - "('%d') to be even.\n", - __FILE__, __LINE__, (int)prb->n); - res->state = SKIPPED; - res->reason = skip_reason::invalid_case; - return; + && (!prb->strides[WEI].empty() + || (prb->wtag != tag::any && prb->wtag != tag::undef))) { + const auto &weights_rt_dims = get_runtime_dims( + prb->weights_dims(), prb->weights_runtime_dim_mask()); + const auto wei_md + = dnn_mem_t::init_md(prb->ndims, weights_rt_dims.data(), + prb->wei_dt(), prb->wtag, prb->strides[STRIDES_WEI]); + + const auto wei_strides = query_md_strides(wei_md); + int n_unit_strides = 0; + for (int d = 0; d < query_md_ndims(wei_md); d++) { + if (wei_strides[d] == 1) { + n_unit_strides++; + if (n_unit_strides > 1) { + BENCHDNN_PRINT(2, + "[INVALID][%s:%d]: Int4 Weights decompression " + "requires byte alignment for the tensor.\n", + __FILE__, __LINE__); + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; + return; + } + } + if (wei_strides[d] > 1 && (wei_strides[d] % 2)) { + BENCHDNN_PRINT(2, + "[INVALID][%s:%d]: Int4 Weights decompression requires " + "byte alignment for the tensor.\n", + __FILE__, __LINE__); + res->state = SKIPPED; + res->reason = skip_reason::invalid_case; + return; + } + } } auto src_rt_mask = prb->src_runtime_dim_mask(); From 4e9595dc760f49dbb42845adfe74cfaf96fa0cee Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 18 Mar 2025 13:35:52 -0700 Subject: [PATCH 02/12] common: matmul: adjust check for int4 tensors w.r.t. strides --- src/common/matmul.cpp | 48 +++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/src/common/matmul.cpp b/src/common/matmul.cpp index a838416503a..12174cc3b24 100644 --- a/src/common/matmul.cpp +++ b/src/common/matmul.cpp @@ -354,21 +354,39 @@ status_t matmul_desc_init(matmul_desc_t *matmul_desc, ? utils::get_dims_mask(dst_desc->dims, op_d.bias_desc.dims, ndims) : 0; - // TODO: requirement is for innermost dim to be multiple of 2 for - // the memory to be byte aligned. 
- - // s4/u4/f4 weights requires n to be multiple of 2 to be byte aligned - VCHECK_MATMUL(IMPLICATION(utils::one_of(weights_desc->data_type, - data_type::s4, data_type::u4, - data_type::f4_e2m1, data_type::f4_e3m0), - weights_desc->dims[n_idx] % 2 == 0), - VERBOSE_BAD_DIM, "weights", n_idx); - // s4/u4/f4 src requires k to be multiple of 2 to be byte aligned - VCHECK_MATMUL(IMPLICATION(utils::one_of(src_desc->data_type, data_type::s4, - data_type::u4, data_type::f4_e2m1, - data_type::f4_e3m0), - src_desc->dims[k_idx_src] % 2 == 0), - VERBOSE_BAD_DIM, "src", n_idx); + using namespace data_type; + if (weights_desc->format_kind == format_kind::blocked + && utils::one_of( + weights_desc->data_type, s4, u4, f4_e2m1, f4_e3m0)) { + const auto &wei_strides = weights_desc->format_desc.blocking.strides; + + int n_unit_strides = 0; + for (int d = 0; d < ndims; d++) { + if (wei_strides[d] == 1) { + n_unit_strides++; + VCHECK_MATMUL( + n_unit_strides <= 1, VERBOSE_BAD_DIM, "weights", d); + } + VCHECK_MATMUL( + IMPLICATION(wei_strides[d] > 1, wei_strides[d] % 2 == 0), + VERBOSE_BAD_DIM, "weights", d); + } + } + if (src_desc->format_kind == format_kind::blocked + && utils::one_of(src_desc->data_type, s4, u4, f4_e2m1, f4_e3m0)) { + const auto &src_strides = src_desc->format_desc.blocking.strides; + + int n_unit_strides = 0; + for (int d = 0; d < ndims; d++) { + if (src_strides[d] == 1) { + n_unit_strides++; + VCHECK_MATMUL(n_unit_strides <= 1, VERBOSE_BAD_DIM, "src", d); + } + VCHECK_MATMUL( + IMPLICATION(src_strides[d] > 1, src_strides[d] % 2 == 0), + VERBOSE_BAD_DIM, "src", d); + } + } // check if other dims match. for (int d = 0; d < ndims - 2; ++d) { From d53b92d5d259637c7c3f1fdb8b0d4a69c3c7f544 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Wed, 5 Mar 2025 17:09:49 -0800 Subject: [PATCH 03/12] cpu: x64: jit_uni_dw_utils: improve verbose --- src/cpu/x64/jit_uni_dw_conv_kernel_utils.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/cpu/x64/jit_uni_dw_conv_kernel_utils.cpp b/src/cpu/x64/jit_uni_dw_conv_kernel_utils.cpp index 1f0975424bb..b26b3e2f859 100644 --- a/src/cpu/x64/jit_uni_dw_conv_kernel_utils.cpp +++ b/src/cpu/x64/jit_uni_dw_conv_kernel_utils.cpp @@ -576,13 +576,19 @@ status_t jit_uni_dw_conv_bwd_weights_kernel::init_conf( = !is_data_layout_nxc && one_of(isa, avx512_core, avx2); if (ok_to_pad_channels) { jcp.ngroups = rnd_up(jcp.ngroups, jcp.ch_block); } - bool args_ok = true - && IMPLICATION(!is_data_layout_nxc, jcp.ngroups % jcp.ch_block == 0) - && jcp.dilate_h == 0 && jcp.dilate_w == 0 && jcp.kw <= 3 - && jcp.stride_w <= jcp.kw // no gaps in kernel - && jcp.oh == (jcp.ihp - jcp.kh) / jcp.stride_h + 1 - && jcp.ow == (jcp.iwp - jcp.kw) / jcp.stride_w + 1; - VDISPATCH_CONV_IC(args_ok, VERBOSE_BAD_PARAM, ""); + VDISPATCH_CONV_IC( + IMPLICATION(!is_data_layout_nxc, jcp.ngroups % jcp.ch_block == 0), + VERBOSE_BAD_PARAM, "number of groups doesn't divide channel block"); + VDISPATCH_CONV_IC(jcp.dilate_h == 0, VERBOSE_BAD_PARAM, "dilate_h"); + VDISPATCH_CONV_IC(jcp.dilate_w == 0, VERBOSE_BAD_PARAM, "dilate_w"); + VDISPATCH_CONV_IC(jcp.kw <= 3, VERBOSE_BAD_PARAM, "kw > 3"); + // No gaps in the kernel. 
+ VDISPATCH_CONV_IC( + jcp.stride_w <= jcp.kw, VERBOSE_BAD_PARAM, "stride_w > kw"); + VDISPATCH_CONV_IC(jcp.oh == (jcp.ihp - jcp.kh) / jcp.stride_h + 1, + VERBOSE_BAD_PARAM, "oh != (ihp - kh) / stride_h + 1"); + VDISPATCH_CONV_IC(jcp.ow == (jcp.iwp - jcp.kw) / jcp.stride_w + 1, + VERBOSE_BAD_PARAM, "ow != (iwp - kw) / stride_w + 1"); jcp.nb_ch = div_up(jcp.ngroups, jcp.ch_block); From 36b50bcb910a4dd93d8a44898ba3cfe4588bb52a Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 18 Mar 2025 13:52:42 -0700 Subject: [PATCH 04/12] fixup: cpu: x64: jit_avx512_core_x8s8s32x_conv_kernel: restore scratchpad size --- .../jit_avx512_core_x8s8s32x_convolution.cpp | 17 ++++++++--------- .../jit_avx512_core_x8s8s32x_deconvolution.cpp | 10 ++++++---- src/cpu/x64/jit_uni_x8s8s32x_deconvolution.cpp | 9 ++++++--- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/cpu/x64/jit_avx512_core_x8s8s32x_convolution.cpp b/src/cpu/x64/jit_avx512_core_x8s8s32x_convolution.cpp index 50d6915d2b1..edf6ea16725 100644 --- a/src/cpu/x64/jit_avx512_core_x8s8s32x_convolution.cpp +++ b/src/cpu/x64/jit_avx512_core_x8s8s32x_convolution.cpp @@ -42,19 +42,18 @@ const float *jit_avx512_core_x8s8s32x_convolution_fwd_t::adjust_oscales( const memory_tracking::grantor_t &scratchpad, const float *src_scales, const float *wei_scales) const { auto loc_scales = scratchpad.template get(key_conv_adjusted_scales); - const float src_scale = src_scales[0]; + const bool has_wei_scales + = !pd()->attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS); const int wei_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); float factor = (pd()->jcp_.signed_input && (!pd()->jcp_.has_vnni)) ? 1.f / pd()->jcp_.wei_adj_scale : 1.f; - switch (wei_mask) { - case 0: - utils::array_set(loc_scales, src_scale * wei_scales[0] * factor, - pd()->jcp_.simd_w); - break; - default: - for (dim_t c = 0; c < pd()->OC(); c++) - loc_scales[c] = src_scale * wei_scales[c] * factor; + if (has_wei_scales && wei_mask > 0) { + for (dim_t c = 0; c < pd()->OC(); c++) + loc_scales[c] = src_scales[0] * wei_scales[c] * factor; + } else { + utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor, + pd()->jcp_.simd_w); } return loc_scales; } diff --git a/src/cpu/x64/jit_avx512_core_x8s8s32x_deconvolution.cpp b/src/cpu/x64/jit_avx512_core_x8s8s32x_deconvolution.cpp index f4acaffd478..9c4ee4e9395 100644 --- a/src/cpu/x64/jit_avx512_core_x8s8s32x_deconvolution.cpp +++ b/src/cpu/x64/jit_avx512_core_x8s8s32x_deconvolution.cpp @@ -1393,16 +1393,18 @@ const float *jit_avx512_core_x8s8s32x_deconvolution_fwd_t::adjust_oscales( const memory_tracking::grantor_t &scratchpad, const float *src_scales, const float *wei_scales) const { auto loc_scales = scratchpad.template get(key_conv_adjusted_scales); + const bool has_wei_scales + = !pd()->attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS); int wei_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); float factor = (pd()->jcp_.signed_input && (!pd()->jcp_.has_vnni)) ? 1.f / pd()->jcp_.wei_adj_scale : 1.0f; - if (wei_mask == 0) { - utils::array_set( - loc_scales, src_scales[0] * wei_scales[0] * factor, 16); - } else { + if (has_wei_scales && wei_mask > 0) { for (dim_t c = 0; c < pd()->OC(); c++) loc_scales[c] = src_scales[0] * wei_scales[c] * factor; + } else { + utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor, + /* WHY: pd()->jcp_.simd_w = 0!!! 
*/ 16); } return loc_scales; } diff --git a/src/cpu/x64/jit_uni_x8s8s32x_deconvolution.cpp b/src/cpu/x64/jit_uni_x8s8s32x_deconvolution.cpp index de940d607d8..ca1eef1a159 100644 --- a/src/cpu/x64/jit_uni_x8s8s32x_deconvolution.cpp +++ b/src/cpu/x64/jit_uni_x8s8s32x_deconvolution.cpp @@ -1451,15 +1451,18 @@ const float *jit_uni_x8s8s32x_deconvolution_fwd_t::adjust_oscales( const memory_tracking::grantor_t &scratchpad, const float *src_scales, const float *wei_scales) const { auto loc_scales = scratchpad.template get(key_conv_adjusted_scales); + const bool has_wei_scales + = !pd()->attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS); int wei_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); float factor = (pd()->jcp_.signed_input && (!pd()->jcp_.has_vnni)) ? 1.f / pd()->jcp_.wei_adj_scale : 1.0f; - if (wei_mask == 0) { - utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor, 8); - } else { + if (has_wei_scales && wei_mask > 0) { for (dim_t c = 0; c < pd()->OC(); c++) loc_scales[c] = src_scales[0] * wei_scales[c] * factor; + } else { + utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor, + /* WHY: pd()->jcp_.simd_w = 0!!! */ 8); } return loc_scales; } From 147044738ef527eed597bc419a379a3986b3fba0 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 18 Mar 2025 15:04:16 -0700 Subject: [PATCH 05/12] benchdnn: reorder: update filling for s32->f32 cases with zero-points --- tests/benchdnn/reorder/cfg.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/benchdnn/reorder/cfg.cpp b/tests/benchdnn/reorder/cfg.cpp index a519a27edf3..9446ad1b425 100644 --- a/tests/benchdnn/reorder/cfg.cpp +++ b/tests/benchdnn/reorder/cfg.cpp @@ -42,10 +42,11 @@ REG(f8_e5m2, -f16_max_exact, f16_max_exact); REG(f8_e4m3, -f16_max_exact, f16_max_exact); REG(f4_e2m1, -f16_max_exact, f16_max_exact); REG(f4_e3m0, -f4_max_exact, f4_max_exact); -// Do not exceed max float value representable in integer. Otherwise, we get -// a correctness issue caused by different computations in reference and the -// library. -REG(s32, INT_MIN, BENCHDNN_S32_TO_F32_SAT_CONST); +// Do not exceed min/max float value representable in integer. Otherwise, we get +// a correctness issue caused by different computations or roudings in the naive +// reference and the library. One of those can be zero-point subtracting which +// leads to underflow or overflow. +REG(s32, -BENCHDNN_S32_TO_F32_SAT_CONST, BENCHDNN_S32_TO_F32_SAT_CONST); REG(s8, INT8_MIN, INT8_MAX); REG(u8, 0, UINT8_MAX); REG(s4, -7, 8); From 84c498f52b957f68392bbeca85de79f8590aaa2b Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Tue, 18 Mar 2025 16:26:33 -0700 Subject: [PATCH 06/12] benchdnn: matmul: adjust filling to incorporate zero-points --- tests/benchdnn/matmul/matmul.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/benchdnn/matmul/matmul.cpp b/tests/benchdnn/matmul/matmul.cpp index dd5e788ac43..e07ae52b349 100644 --- a/tests/benchdnn/matmul/matmul.cpp +++ b/tests/benchdnn/matmul/matmul.cpp @@ -413,6 +413,20 @@ int fill_data(data_kind_t kind, const prb_t *prb, const cfg_t &cfg, density_args.n_acc = prb->k; const auto density = cfg.get_density(density_args); + const auto &e_zp_src = prb->attr.zero_points.get(DNNL_ARG_SRC); + const bool has_src_zp = !e_zp_src.is_def(); + const int src_zp_mask = attr_t::get_default_mask(e_zp_src.policy); + // Apply src_zp for source tensor only. + int src_zp = kind == SRC && has_src_zp && src_zp_mask == 0 ? 
e_zp_src.value + : 0; + + const auto &e_zp_wei = prb->attr.zero_points.get(DNNL_ARG_WEIGHTS); + const bool has_wei_zp = !e_zp_wei.is_def(); + const int wei_zp_mask = attr_t::get_default_mask(e_zp_wei.policy); + // Apply wei_zp for weights tensor only. + int wei_zp = kind == WEI && has_wei_zp && wei_zp_mask == 0 ? e_zp_wei.value + : 0; + /* Do fixed partitioning to have same filling for any number of threads */ const int64_t chunk_size = 64; const int64_t n_chunks = div_up(nelems, chunk_size); @@ -438,6 +452,7 @@ int fill_data(data_kind_t kind, const prb_t *prb, const cfg_t &cfg, float val = 0; while (val <= 0) val = gen(int_seed); + val += src_zp + wei_zp; // Add zp so that it will be subtracted. mem_fp.set_elem( 0, round_to_nearest_representable(cfg.get_dt(kind), val)); idx_start += 1; @@ -453,6 +468,7 @@ int fill_data(data_kind_t kind, const prb_t *prb, const cfg_t &cfg, val *= is_one; } else { val = is_one * gen(int_seed); + val += src_zp + wei_zp; // Add zp so that it will be subtracted. } mem_fp.set_elem( idx, round_to_nearest_representable(cfg.get_dt(kind), val)); From 34570cfe83d17cb6cd53c71bfccc3a694ed49468 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Wed, 19 Mar 2025 12:03:57 -0700 Subject: [PATCH 07/12] benchdnn: reorder: update skip conditions and add messages --- tests/benchdnn/reorder/reorder.cpp | 235 ++++++++++++++++++----------- 1 file changed, 149 insertions(+), 86 deletions(-) diff --git a/tests/benchdnn/reorder/reorder.cpp b/tests/benchdnn/reorder/reorder.cpp index 97cc6825830..4315b384f90 100644 --- a/tests/benchdnn/reorder/reorder.cpp +++ b/tests/benchdnn/reorder/reorder.cpp @@ -208,114 +208,163 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { skip_unimplemented_sum_po(prb->attr, res, dnnl_reorder, sdt); skip_unimplemented_prelu_po(prb->attr, res, dnnl_reorder); - bool scales_ok = true; -#if !defined(DNNL_X64) || DNNL_X64 == 0 - { - // reference reorder supports only a subset of scale policies - const std::vector supported_policy = {policy_t::COMMON, - policy_t::PER_DIM_0, policy_t::PER_DIM_1, policy_t::PER_DIM_01}; - - for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) { - scales_ok = std::any_of(supported_policy.cbegin(), - supported_policy.cend(), [&](const policy_t policy) { - return prb->attr.scales.get(arg).policy == policy; - }); - } - } -#endif - if (!scales_ok) { + const bool s32_src_ok = IMPLICATION(sdt == dnnl_s32, + ddt != dnnl_f8_e5m2 && ddt != dnnl_f8_e4m3 && ddt != dnnl_bf16 + && ddt != dnnl_f16); + const bool s32_dst_ok = IMPLICATION(ddt == dnnl_s32, + sdt != dnnl_f8_e5m2 && sdt != dnnl_f8_e4m3 && sdt != dnnl_bf16 + && sdt != dnnl_f16); + if (!s32_src_ok || !s32_dst_ok) { + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: Mixed (xf8,xf16)<-->s32 support is limited.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::case_not_supported; return; } - if (prb->is_reorder_with_compensation(FLAG_ANY)) { - // Compensation is supported for s8 dst data type. - const bool dt_ok = ddt == dnnl_s8; - // Compensation can be paired with dst scale only. - const bool attr_ok - = prb->attr.zero_points.is_def() && prb->attr.post_ops.is_def(); - // Compensation does not support runtime dims. 
- const bool rt_ok = prb->runtime_dim_mask == 0; - - // Compensation and scales mask should coincide - const auto comp_mask = prb->get_compensation_mask(FLAG_ANY); - bool masks_ok = true; - for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) { - const auto &e = prb->attr.scales.get(arg); - if (!e.is_def()) { - int e_mask = attr_t::get_default_mask(e.policy); - masks_ok = masks_ok && e_mask == comp_mask; + if (is_cpu()) { + bool scales_ok = true; +#if !defined(DNNL_X64) || DNNL_X64 == 0 + { + // reference reorder supports only a subset of scale policies + const std::vector supported_policy + = {policy_t::COMMON, policy_t::PER_DIM_0, + policy_t::PER_DIM_1, policy_t::PER_DIM_01}; + + for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) { + scales_ok = std::any_of(supported_policy.cbegin(), + supported_policy.cend(), [&](const policy_t policy) { + return prb->attr.scales.get(arg).policy == policy; + }); } } - - if (!dt_ok || !attr_ok || !rt_ok || !masks_ok) { +#endif + if (!scales_ok) { + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: Generic CPU doesn't support specified " + "scale mask.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::case_not_supported; return; } -#if !defined(DNNL_X64) || DNNL_X64 == 0 - // Simple reorder doesn't provide decent coverage for compensated cases. - // Shut them down unconditionally by default. - res->state = SKIPPED; - res->reason = skip_reason::case_not_supported; - return; -#endif - } + if (prb->is_reorder_with_compensation(FLAG_ANY)) { + const bool dt_ok = ddt == dnnl_s8; + if (!dt_ok) { + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: Compensation is supported only for s8 " + "dst data type.\n", + __FILE__, __LINE__); + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; + return; + } - // Destination scale is not supported for runtime dimensions since the - // implementation logic inverts dst scales and requires scratchpad for - // `mask > 0` cases which is impossible to estimate with rt dims. - const auto &dst_scales = prb->attr.scales.get(DNNL_ARG_DST); - if (!dst_scales.is_def() && attr_t::get_default_mask(dst_scales.policy) > 0 - && prb->runtime_dim_mask != 0) { - res->state = SKIPPED; - res->reason = skip_reason::case_not_supported; - return; - } + const bool attr_ok = prb->attr.zero_points.is_def() + && prb->attr.post_ops.is_def(); + if (!attr_ok) { + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: Compensation is supported with scale " + "attribute only.\n", + __FILE__, __LINE__); + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; + return; + } + + const bool rt_ok = prb->runtime_dim_mask == 0; + if (!rt_ok) { + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: Compensation is not supported for " + "runtime dimensions.\n", + __FILE__, __LINE__); + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; + return; + } + + const auto comp_mask = prb->get_compensation_mask(FLAG_ANY); + bool masks_ok = true; + for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) { + const auto &e = prb->attr.scales.get(arg); + if (!e.is_def()) { + int e_mask = attr_t::get_default_mask(e.policy); + masks_ok = masks_ok && e_mask == comp_mask; + } + } + if (!masks_ok) { + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: Compensation mask doesn't coincide " + "with scaling mask.\n", + __FILE__, __LINE__); + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; + return; + } - // Compensation is supported through jit reorder only, but jit reorder - // doesn't support different masks for source and destination scales. 
- const auto &src_scales = prb->attr.scales.get(DNNL_ARG_SRC); - if (!src_scales.is_def() && !dst_scales.is_def()) { - if (attr_t::get_default_mask(src_scales.policy) - != attr_t::get_default_mask(dst_scales.policy) - && prb->is_reorder_with_compensation(FLAG_ANY)) { +#if !defined(DNNL_X64) || DNNL_X64 == 0 + // Simple reorder doesn't provide decent coverage for compensated + // cases. Shut them down unconditionally by default. + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: Generic CPU doesn't support compensation " + "cases uniformly.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::case_not_supported; return; +#endif } - } - if (is_cpu()) { - // Int4 reorder support is limited on CPU. - if (sdt == dnnl_s4 || ddt == dnnl_s4 || sdt == dnnl_u4 - || ddt == dnnl_u4) { + const auto &dst_scales = prb->attr.scales.get(DNNL_ARG_DST); + if (!dst_scales.is_def() + && attr_t::get_default_mask(dst_scales.policy) > 0 + && prb->runtime_dim_mask != 0) { + // Destination scale is not supported for runtime dimensions since + // the implementation logic inverts dst scales and requires + // scratchpad for `mask > 0` cases which is impossible to estimate + // with runtime dims. + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: Destination scale is not supported for " + "runtime dimensions.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::case_not_supported; return; } - // CPU reorder doesn't support (xf8,xf16)<-->s32 combinations. - const bool s32_src_ok = IMPLICATION(sdt == dnnl_s32, - ddt != dnnl_f8_e5m2 && ddt != dnnl_f8_e4m3 && ddt != dnnl_bf16 - && ddt != dnnl_f16); - const bool s32_dst_ok = IMPLICATION(ddt == dnnl_s32, - sdt != dnnl_f8_e5m2 && sdt != dnnl_f8_e4m3 && sdt != dnnl_bf16 - && sdt != dnnl_f16); - if (!s32_src_ok || !s32_dst_ok) { + const auto &src_scales = prb->attr.scales.get(DNNL_ARG_SRC); + if (!src_scales.is_def() && !dst_scales.is_def()) { + if (attr_t::get_default_mask(src_scales.policy) + != attr_t::get_default_mask(dst_scales.policy) + && prb->is_reorder_with_compensation(FLAG_ANY)) { + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: Compensation cases when both scales " + "specified but with different masks isn't supported.\n", + __FILE__, __LINE__); + res->state = SKIPPED; + res->reason = skip_reason::case_not_supported; + return; + } + } + + if (sdt == dnnl_s4 || ddt == dnnl_s4 || sdt == dnnl_u4 + || ddt == dnnl_u4) { + BENCHDNN_PRINT(2, "[SKIP][%s:%d]: Int4 support is limited.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::case_not_supported; return; } - // CPU f16 reorders only support f16<->f32 combinations const bool f16_src_ok = IMPLICATION( sdt == dnnl_f16, ddt == dnnl_f16 || ddt == dnnl_f32); const bool f16_dst_ok = IMPLICATION( ddt == dnnl_f16, sdt == dnnl_f16 || sdt == dnnl_f32); if (!f16_src_ok || !f16_dst_ok) { + BENCHDNN_PRINT(2, "[SKIP][%s:%d]: f16 support is limited.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::case_not_supported; return; @@ -329,6 +378,8 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { = IMPLICATION(sdt == dnnl_f8_e5m2 || sdt == dnnl_f8_e4m3, ddt == dnnl_f16 || ddt == dnnl_f32); if (!xf8_src_ok || !xf8_dst_ok) { + BENCHDNN_PRINT(2, "[SKIP][%s:%d]: f8 support is limited.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::case_not_supported; return; @@ -336,20 +387,23 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { } if (is_gpu()) { - // GPU does not support run-time dims. 
- // Reorders w/ compensation are not supported by design: zp_comp is done - // in kernels directly, but s8s8 instructions are available in HW. - if (prb->runtime_dim_mask != 0 - || prb->is_reorder_with_compensation(FLAG_ANY)) { + if (prb->runtime_dim_mask != 0) { + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: GPU doesn't support runtime dimensions.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::case_not_supported; return; } - // GPU doesn't support f8_e5m2/f8_e4m3. - const bool is_xf8 = prb->sdt == dnnl_f8_e5m2 || prb->sdt == dnnl_f8_e4m3 - || prb->ddt == dnnl_f8_e5m2 || prb->ddt == dnnl_f8_e4m3; - if (is_xf8) { + if (prb->is_reorder_with_compensation(FLAG_ANY)) { + // Reorders w/ compensation are not supported by design: zp_comp is + // done in kernels directly, but s8s8 instructions are available in + // HW. + BENCHDNN_PRINT(2, + "[SKIP][%s:%d]: GPU doesn't support cases with " + "compensation.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::case_not_supported; return; @@ -358,31 +412,40 @@ void skip_unimplemented_prb(const prb_t *prb, res_t *res) { } void skip_invalid_prb(const prb_t *prb, res_t *res) { - // No sense in cross engine reorders when one of devices is switched off. #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \ || DNNL_GPU_RUNTIME == DNNL_RUNTIME_NONE auto cross_engine = prb->cross_engine; if (cross_engine == CPU2GPU || cross_engine == GPU2CPU) { + BENCHDNN_PRINT(2, + "[INVALID][%s:%d]: Cross-engine case isn't supported when just " + "one runtime is enabled.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::invalid_case; return; } #endif - // Zero-points can't be used with sum post-op. if (!prb->attr.zero_points.is_def(DNNL_ARG_DST) && prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) != -1) { + BENCHDNN_PRINT(2, + "[INVALID][%s:%d]: Zero-points can't be used with sum " + "post-op.\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::invalid_case; return; } - // only integral data types can have zero points const bool is_src_zp_ok = is_integral_dt(prb->sdt) || prb->attr.zero_points.is_def(DNNL_ARG_SRC); const bool is_dst_zp_ok = is_integral_dt(prb->ddt) || prb->attr.zero_points.is_def(DNNL_ARG_DST); if (!(is_src_zp_ok && is_dst_zp_ok)) { + BENCHDNN_PRINT(2, + "[INVALID][%s:%d]: Non-integral data types don't support " + "zero-points\n", + __FILE__, __LINE__); res->state = SKIPPED; res->reason = skip_reason::invalid_case; return; From f3c6ad137a3303d77b85e77e5abea9ec1809640e Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Wed, 19 Mar 2025 12:12:39 -0700 Subject: [PATCH 08/12] benchdnn: reorder: correct the mask bit check --- tests/benchdnn/reorder/reorder_aux.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/benchdnn/reorder/reorder_aux.cpp b/tests/benchdnn/reorder/reorder_aux.cpp index 9eb01d53f71..ca46cc958ed 100644 --- a/tests/benchdnn/reorder/reorder_aux.cpp +++ b/tests/benchdnn/reorder/reorder_aux.cpp @@ -103,7 +103,8 @@ void prb_t::get_compensation_parameters( dims_t &comp_dims, int &mask, flag_bit_t flag) const { if (is_reorder_with_compensation(flag)) { for (const auto &i_oflag : oflag) { - if (i_oflag.first != flag) continue; + const bool has_flag_bit = (i_oflag.first & flag); + if (!has_flag_bit) continue; mask = i_oflag.second; for (int d = 0; d < ndims; ++d) From e51ded6e9166b29b0c1456c7d5220dab230deb07 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Wed, 19 Mar 2025 12:14:18 -0700 Subject: [PATCH 09/12] fixup: src: introduce 
quant_entry_t and refactor arg_scales_t to rely on it Case with different mask is not supported if only both scales were specified. --- src/cpu/x64/jit_uni_reorder_utils.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cpu/x64/jit_uni_reorder_utils.cpp b/src/cpu/x64/jit_uni_reorder_utils.cpp index 754f5de57c8..1662a174195 100644 --- a/src/cpu/x64/jit_uni_reorder_utils.cpp +++ b/src/cpu/x64/jit_uni_reorder_utils.cpp @@ -286,7 +286,11 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd, = dst_mask == 0 ? scale_type_t::COMMON : scale_type_t::MANY; } - if (src_mask != dst_mask) return status::unimplemented; + VDISPATCH_REORDER_IC( + IMPLICATION(p.src_scale_type != scale_type_t::NONE + && p.dst_scale_type != scale_type_t::NONE, + src_mask == dst_mask), + VERBOSE_UNSUPPORTED_SCALES_CFG); p.scale_adjust = (om_d.extra().flags & memory_extra_flags::scale_adjust) ? om_d.extra().scale_adjust From 52edf0d17e69f166ef615b0ccc00db16f59d329d Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Wed, 19 Mar 2025 12:20:23 -0700 Subject: [PATCH 10/12] cpu: x64: jit_reorder: add verbose messages --- src/cpu/x64/jit_uni_reorder_utils.cpp | 37 +++++++++++++++++---------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/src/cpu/x64/jit_uni_reorder_utils.cpp b/src/cpu/x64/jit_uni_reorder_utils.cpp index 1662a174195..38621550ec1 100644 --- a/src/cpu/x64/jit_uni_reorder_utils.cpp +++ b/src/cpu/x64/jit_uni_reorder_utils.cpp @@ -199,14 +199,22 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd, return po.len() == 0 || (po.len() == 1 && po.entry_[0].is_sum(false)); }; - bool ok = im_d.is_blocking_desc() && om_d.is_blocking_desc() - && !im_d.has_runtime_dims_or_strides() && !im_d.has_zero_dim() - && !om_d.has_runtime_dims_or_strides() && !om_d.has_zero_dim() - && attr->has_default_values(primitive_attr_t::skip_mask_t::scales - | primitive_attr_t::skip_mask_t::zero_points - | primitive_attr_t::skip_mask_t::post_ops) - && check_post_ops(attr); - if (!ok) return unimplemented; + VDISPATCH_REORDER_IC( + im_d.is_blocking_desc(), VERBOSE_UNSUPPORTED_FORMAT_KIND); + VDISPATCH_REORDER_IC( + om_d.is_blocking_desc(), VERBOSE_UNSUPPORTED_FORMAT_KIND); + VDISPATCH_REORDER_IC(!im_d.has_zero_dim(), VERBOSE_EMPTY_TENSOR, "src"); + VDISPATCH_REORDER_IC(!om_d.has_zero_dim(), VERBOSE_EMPTY_TENSOR, "dst"); + VDISPATCH_REORDER_IC(!im_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + VDISPATCH_REORDER_IC(!om_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + + using smask_t = primitive_attr_t::skip_mask_t; + VDISPATCH_REORDER_IC(attr->has_default_values(smask_t::scales + | smask_t::zero_points | smask_t::post_ops), + VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_REORDER_IC(check_post_ops(attr), VERBOSE_UNSUPPORTED_POSTOP); bool is_tail_present = false; dims_t iblocks, oblocks, i_tails, o_tails, i_paddings, o_paddings; @@ -218,7 +226,8 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd, const auto pdim = om_d.padded_dims()[d]; const auto cblock = oblocks[d]; // do not allow excess pdim other than required for rounding-up of dim. 
- if (utils::rnd_up(dim, cblock) != pdim) return unimplemented; + VDISPATCH_REORDER_IC(utils::rnd_up(dim, cblock) == pdim, + VERBOSE_UNSUPPORTED_PAD_FEATURE); } utils::array_set(i_tails, 0, im_d.ndims()); @@ -306,10 +315,12 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd, return IMPLICATION(check, mask == (with_groups ? 0x3 : 0x1)); }; - if (!mask_ok(p.req_s8s8_comp, om_d.extra().compensation_mask) - || !mask_ok(p.req_asymmetric_comp, - om_d.extra().asymm_compensation_mask)) - return status::unimplemented; + VDISPATCH_REORDER_IC( + mask_ok(p.req_s8s8_comp, om_d.extra().compensation_mask), + VERBOSE_UNSUPPORTED_MD_FLAG, "dst"); + VDISPATCH_REORDER_IC(mask_ok(p.req_asymmetric_comp, + om_d.extra().asymm_compensation_mask), + VERBOSE_UNSUPPORTED_MD_FLAG, "dst"); ptrdiff_t ss[max_ndims] = {0}; // scales strides if (p.src_scale_type == scale_type_t::MANY From 87661bc0467776f04b0d72825d7c1b94d67bd298 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Wed, 19 Mar 2025 16:51:52 -0700 Subject: [PATCH 11/12] benchdnn: self: replace temporary "const char *" with "std::string" Temporary "const char *" objects can disappear while getting to the parser internals. Moving strings to parse into a permanent container solves the problem. --- tests/benchdnn/self/common.cpp | 62 +++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/tests/benchdnn/self/common.cpp b/tests/benchdnn/self/common.cpp index eb7080f0426..e95460516d9 100644 --- a/tests/benchdnn/self/common.cpp +++ b/tests/benchdnn/self/common.cpp @@ -118,10 +118,10 @@ static int check_attr() { { base_settings_t s; std::vector &zp = s.zero_points; - SELF_CHECK_EQ(parse_attributes(s, def, - "--attr-zero-points=src:common:0+wei:per_oc+dst:" - "common:-2,src:per_dim_1"), - true); + std::string content_to_parse( + "--attr-zero-points=src:common:0+wei:per_oc+dst:common:-2,src:" + "per_dim_1"); + SELF_CHECK_EQ(parse_attributes(s, def, content_to_parse.c_str()), true); SELF_CHECK_EQ(zp.size(), 2); const std::vector def_g {}; SELF_CHECK_ATTR_ZP( @@ -138,11 +138,10 @@ static int check_attr() { { base_settings_t s; std::vector &sc = s.scales; + std::string content_to_parse( + "--attr-scales=src:common:1.5+wei:per_oc+src:common:0.5"); // `src` scale is overridden with the latter value. 
- SELF_CHECK_EQ(parse_attributes(s, def, - "--attr-scales=src:common:1.5+wei:per_oc+src:" - "common:0.5"), - true); + SELF_CHECK_EQ(parse_attributes(s, def, content_to_parse.c_str()), true); SELF_CHECK_EQ(sc.size(), 1); SELF_CHECK_EQ(sc[0].get(DNNL_ARG_SRC).policy, policy_t::COMMON); SELF_CHECK_EQ(sc[0].get(DNNL_ARG_SRC).scale, 0.5f); @@ -153,9 +152,9 @@ static int check_attr() { { base_settings_t s; std::vector &sc = s.scales; - SELF_CHECK_EQ(parse_attributes(s, def, - "--attr-scales=src:common:2.5+src1:common:1.5"), - true); + std::string content_to_parse( + "--attr-scales=src:common:2.5+src1:common:1.5"); + SELF_CHECK_EQ(parse_attributes(s, def, content_to_parse.c_str()), true); SELF_CHECK_EQ(sc.size(), 1); SELF_CHECK_EQ(sc[0].get(DNNL_ARG_SRC_0).policy, policy_t::COMMON); SELF_CHECK_EQ(sc[0].get(DNNL_ARG_SRC_0).scale, 2.5); @@ -166,9 +165,8 @@ static int check_attr() { { base_settings_t s; std::vector &zp = s.zero_points; - SELF_CHECK_EQ(parse_attributes( - s, def, "--attr-zero-points=wei:per_ocic:s8:2x1"), - true); + std::string content_to_parse("--attr-zero-points=wei:per_ocic:s8:2x1"); + SELF_CHECK_EQ(parse_attributes(s, def, content_to_parse.c_str()), true); SELF_CHECK_EQ(zp.size(), 1); std::vector groups = {2, 1}; SELF_CHECK_ATTR_ZP(zp[0], DNNL_ARG_WEIGHTS, policy_t::PER_OCIC, 0, @@ -178,9 +176,9 @@ static int check_attr() { { base_settings_t s; std::vector &sc = s.scales; - SELF_CHECK_EQ(parse_attributes(s, def, - "--attr-scales=attr_post_op_dw_wei:common:2"), - true); + std::string content_to_parse( + "--attr-scales=attr_post_op_dw_wei:common:2"); + SELF_CHECK_EQ(parse_attributes(s, def, content_to_parse.c_str()), true); SELF_CHECK_EQ(sc.size(), 1); const auto arg = DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS; SELF_CHECK_EQ(sc[0].get(arg).policy, policy_t::COMMON); @@ -191,7 +189,8 @@ static int check_attr() { { base_settings_t s; std::vector &po = s.post_ops; - auto st = parse_attributes(s, def, "--attr-post-ops=dw:k3s1p1"); + std::string content_to_parse("--attr-post-ops=dw:k3s1p1"); + auto st = parse_attributes(s, def, content_to_parse.c_str()); SELF_CHECK_EQ(st, true); SELF_CHECK_EQ(po[0].len(), 1); const auto &e = po[0].entry[0]; @@ -206,8 +205,9 @@ static int check_attr() { { base_settings_t s; std::vector &po = s.post_ops; - auto st = parse_attributes( - s, def, "--attr-post-ops=relu:0.5+dw:k3s2p1:s8+linear:2:1"); + std::string content_to_parse( + "--attr-post-ops=relu:0.5+dw:k3s2p1:s8+linear:2:1"); + auto st = parse_attributes(s, def, content_to_parse.c_str()); SELF_CHECK_EQ(st, true); SELF_CHECK_EQ(po[0].len(), 3); auto &e = po[0].entry[0]; @@ -236,7 +236,8 @@ static int check_attr() { { base_settings_t s; std::vector &fm = s.fpmath_mode; - auto st = parse_attributes(s, def, "--attr-fpmath=strict:true"); + std::string content_to_parse("--attr-fpmath=strict:true"); + auto st = parse_attributes(s, def, content_to_parse.c_str()); SELF_CHECK_EQ(st, true); SELF_CHECK_EQ(fm[0].mode, dnnl_fpmath_mode_strict); SELF_CHECK_EQ(fm[0].apply_to_int, true); @@ -245,7 +246,8 @@ static int check_attr() { { base_settings_t s; std::vector &fm = s.fpmath_mode; - auto st = parse_attributes(s, def, "--attr-fpmath=bf16"); + std::string content_to_parse("--attr-fpmath=bf16"); + auto st = parse_attributes(s, def, content_to_parse.c_str()); SELF_CHECK_EQ(st, true); SELF_CHECK_EQ(fm[0].mode, dnnl_fpmath_mode_bf16); SELF_CHECK_EQ(fm[0].apply_to_int, false); @@ -257,7 +259,8 @@ static int check_attr() { std::vector &fm = s.fpmath_mode; def.fpmath_mode.emplace_back(); 
def.fpmath_mode[0].set(dnnl_fpmath_mode_bf16, true); - auto st = parse_attributes(s, def, "--attr-fpmath="); + std::string content_to_parse("--attr-fpmath="); + auto st = parse_attributes(s, def, content_to_parse.c_str()); SELF_CHECK_EQ(st, true); SELF_CHECK_EQ(fm[0].mode, dnnl_fpmath_mode_bf16); SELF_CHECK_EQ(fm[0].apply_to_int, true); @@ -268,7 +271,8 @@ static int check_attr() { { base_settings_t s; std::vector &d = s.dropout; - auto st = parse_attributes(s, def, "--attr-dropout=0.5:12345:axb"); + std::string content_to_parse("--attr-dropout=0.5:12345:axb"); + auto st = parse_attributes(s, def, content_to_parse.c_str()); SELF_CHECK_EQ(st, true); SELF_CHECK_EQ(d[0].p, 0.5f); SELF_CHECK_EQ(d[0].seed, 12345); @@ -278,7 +282,8 @@ static int check_attr() { { base_settings_t s; std::vector &d = s.dropout; - auto st = parse_attributes(s, def, "--attr-dropout=0.75"); + std::string content_to_parse("--attr-dropout=0.75"); + auto st = parse_attributes(s, def, content_to_parse.c_str()); SELF_CHECK_EQ(st, true); SELF_CHECK_EQ(d[0].p, 0.75f); SELF_CHECK_EQ(d[0].seed, 0); @@ -288,7 +293,8 @@ static int check_attr() { { base_settings_t s; std::vector &d = s.dropout; - auto st = parse_attributes(s, def, "--attr-dropout="); + std::string content_to_parse("--attr-dropout="); + auto st = parse_attributes(s, def, content_to_parse.c_str()); SELF_CHECK_EQ(st, true); SELF_CHECK_EQ(d[0].p, 0.f); SELF_CHECK_EQ(d[0].seed, 0); @@ -298,8 +304,8 @@ static int check_attr() { { base_settings_t s; std::vector &rm = s.rounding_mode; - auto st = parse_attributes( - s, def, "--attr-rounding-mode=dst:stochastic"); + std::string content_to_parse("--attr-rounding-mode=dst:stochastic"); + auto st = parse_attributes(s, def, content_to_parse.c_str()); SELF_CHECK_EQ(st, true); SELF_CHECK_EQ(rm[0].get(DNNL_ARG_DST), dnnl_rounding_mode_stochastic); SELF_CHECK_EQ(rm[0].get(DNNL_ARG_SRC), dnnl_rounding_mode_environment); From f933c1ed2028da8843c2ba048beec969b4171617 Mon Sep 17 00:00:00 2001 From: Dmitrii Zarukin Date: Wed, 19 Mar 2025 19:11:11 -0700 Subject: [PATCH 12/12] cpu: x64: fixed memory leak in jit_uni_ncsp convolution impl --- src/cpu/x64/jit_uni_ncsp_convolution.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpu/x64/jit_uni_ncsp_convolution.cpp b/src/cpu/x64/jit_uni_ncsp_convolution.cpp index 8eb722a891c..9d827ba76ee 100644 --- a/src/cpu/x64/jit_uni_ncsp_convolution.cpp +++ b/src/cpu/x64/jit_uni_ncsp_convolution.cpp @@ -100,7 +100,7 @@ status_t reduction_helper_t::reshape_weights( status_t reduction_helper_t::reshape_for_transpose( memory_desc_t &o_md, memory_desc_t &i_md) { const int ndims = i_md.ndims; - int *perm = new int[ndims]; + std::vector perm(ndims); for (int dim = 0; dim < ndims; dim++) { if (dim == ndims - 2) perm[dim] = dim + 1; @@ -109,7 +109,7 @@ status_t reduction_helper_t::reshape_for_transpose( else perm[dim] = dim; } - return memory_desc_permute_axes(o_md, i_md, perm); + return memory_desc_permute_axes(o_md, i_md, perm.data()); } bool reduction_helper_t::is_gemm() {
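
A minimal standalone sketch (illustrative only, not oneDNN code or API) of the byte-alignment rule that patches [PATCH 01/12] and [PATCH 02/12] enforce for sub-byte (int4/f4) tensors: at most one dimension may carry a unit stride, and every stride greater than one must be even, so that each contiguous run of 4-bit elements starts on a byte boundary. The function name and test values below are assumptions made up for illustration.

#include <cstdint>
#include <cstdio>

// Returns true when a blocked sub-byte (4-bit) tensor with the given element
// strides can be addressed on byte boundaries: at most one packed (unit-stride)
// dimension, and all other strides even.
static bool subbyte_strides_are_byte_aligned(const int64_t *strides, int ndims) {
    int n_unit_strides = 0;
    for (int d = 0; d < ndims; d++) {
        if (strides[d] == 1 && ++n_unit_strides > 1) return false; // two packed dims
        if (strides[d] > 1 && (strides[d] % 2)) return false; // odd outer stride
    }
    return true;
}

int main() {
    const int64_t ok[] = {6, 1};  // 3x6 tensor, row stride 6 elements = 3 bytes
    const int64_t bad[] = {5, 1}; // row stride 5 elements = 2.5 bytes, rejected
    std::printf("%d %d\n", (int)subbyte_strides_are_byte_aligned(ok, 2),
            (int)subbyte_strides_are_byte_aligned(bad, 2));
    return 0;
}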