Another bucket of fixes (and future bugs...) #2917

Merged: 12 commits, Mar 24, 2025
48 changes: 33 additions & 15 deletions src/common/matmul.cpp
@@ -354,21 +354,39 @@ status_t matmul_desc_init(matmul_desc_t *matmul_desc,
? utils::get_dims_mask(dst_desc->dims, op_d.bias_desc.dims, ndims)
: 0;

// TODO: requirement is for innermost dim to be multiple of 2 for
// the memory to be byte aligned.

// s4/u4/f4 weights requires n to be multiple of 2 to be byte aligned
VCHECK_MATMUL(IMPLICATION(utils::one_of(weights_desc->data_type,
data_type::s4, data_type::u4,
data_type::f4_e2m1, data_type::f4_e3m0),
weights_desc->dims[n_idx] % 2 == 0),
VERBOSE_BAD_DIM, "weights", n_idx);
// s4/u4/f4 src requires k to be multiple of 2 to be byte aligned
VCHECK_MATMUL(IMPLICATION(utils::one_of(src_desc->data_type, data_type::s4,
data_type::u4, data_type::f4_e2m1,
data_type::f4_e3m0),
src_desc->dims[k_idx_src] % 2 == 0),
VERBOSE_BAD_DIM, "src", n_idx);
using namespace data_type;
if (weights_desc->format_kind == format_kind::blocked
&& utils::one_of(
weights_desc->data_type, s4, u4, f4_e2m1, f4_e3m0)) {
const auto &wei_strides = weights_desc->format_desc.blocking.strides;

int n_unit_strides = 0;
for (int d = 0; d < ndims; d++) {
if (wei_strides[d] == 1) {
n_unit_strides++;
VCHECK_MATMUL(
n_unit_strides <= 1, VERBOSE_BAD_DIM, "weights", d);
}
VCHECK_MATMUL(
IMPLICATION(wei_strides[d] > 1, wei_strides[d] % 2 == 0),
VERBOSE_BAD_DIM, "weights", d);
}
}
if (src_desc->format_kind == format_kind::blocked
&& utils::one_of(src_desc->data_type, s4, u4, f4_e2m1, f4_e3m0)) {
const auto &src_strides = src_desc->format_desc.blocking.strides;

int n_unit_strides = 0;
for (int d = 0; d < ndims; d++) {
if (src_strides[d] == 1) {
n_unit_strides++;
VCHECK_MATMUL(n_unit_strides <= 1, VERBOSE_BAD_DIM, "src", d);
}
VCHECK_MATMUL(
IMPLICATION(src_strides[d] > 1, src_strides[d] % 2 == 0),
VERBOSE_BAD_DIM, "src", d);
}
}

// check if other dims match.
for (int d = 0; d < ndims - 2; ++d) {
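The new validation generalizes the even-n/even-k checks it deletes: s4/u4/f4 pack two elements per byte, so a blocked tensor stays byte-addressable only if at most one dimension has unit stride (the packed one) and every stride greater than 1 is even. A minimal standalone sketch of the same rule, with illustrative names rather than the library's API:

#include <cstdint>

// True when a 4-bit blocked tensor is byte-aligned: at most one dimension
// may have unit stride, and all strides > 1 must be even so every other
// dimension steps a whole number of bytes.
bool int4_strides_byte_aligned(const int64_t *strides, int ndims) {
    int n_unit_strides = 0;
    for (int d = 0; d < ndims; d++) {
        if (strides[d] == 1 && ++n_unit_strides > 1) return false;
        if (strides[d] > 1 && strides[d] % 2 != 0) return false;
    }
    return true;
}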
17 changes: 8 additions & 9 deletions src/cpu/x64/jit_avx512_core_x8s8s32x_convolution.cpp
@@ -42,19 +42,18 @@ const float *jit_avx512_core_x8s8s32x_convolution_fwd_t::adjust_oscales(
const memory_tracking::grantor_t &scratchpad, const float *src_scales,
const float *wei_scales) const {
auto loc_scales = scratchpad.template get<float>(key_conv_adjusted_scales);
const float src_scale = src_scales[0];
const bool has_wei_scales
= !pd()->attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS);
const int wei_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS);
float factor = (pd()->jcp_.signed_input && (!pd()->jcp_.has_vnni))
? 1.f / pd()->jcp_.wei_adj_scale
: 1.f;
switch (wei_mask) {
case 0:
utils::array_set(loc_scales, src_scale * wei_scales[0] * factor,
pd()->jcp_.simd_w);
break;
default:
for (dim_t c = 0; c < pd()->OC(); c++)
loc_scales[c] = src_scale * wei_scales[c] * factor;
if (has_wei_scales && wei_mask > 0) {
for (dim_t c = 0; c < pd()->OC(); c++)
loc_scales[c] = src_scales[0] * wei_scales[c] * factor;
} else {
utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor,
pd()->jcp_.simd_w);
}
return loc_scales;
}
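The rewrite folds the old switch (wei_mask) into a single branch: per-channel weight scales produce one adjusted value per output channel, while the default case broadcasts a common scale across a SIMD-width buffer. A hedged sketch of the selection, with simplified inputs standing in for pd() and jcp_:

#include <vector>

// Combine source and weight scales into the buffer the JIT kernel reads.
// 'factor' models the wei_adj_scale compensation applied for signed input
// on pre-VNNI hardware; names here are illustrative.
std::vector<float> make_adjusted_scales(float src_scale,
        const std::vector<float> &wei_scales, bool per_channel_wei_scales,
        float factor, int simd_w) {
    if (per_channel_wei_scales) {
        std::vector<float> loc(wei_scales.size());
        for (size_t c = 0; c < wei_scales.size(); c++)
            loc[c] = src_scale * wei_scales[c] * factor;
        return loc;
    }
    // Common scale: one value replicated simd_w times.
    return std::vector<float>(simd_w, src_scale * wei_scales[0] * factor);
}

The same reshuffle repeats in the two deconvolution files below, where the broadcast width is hardcoded (16, then 8) because jcp_.simd_w is zero at that point, as their in-line "WHY" comments flag.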
10 changes: 6 additions & 4 deletions src/cpu/x64/jit_avx512_core_x8s8s32x_deconvolution.cpp
@@ -1393,16 +1393,18 @@ const float *jit_avx512_core_x8s8s32x_deconvolution_fwd_t::adjust_oscales(
const memory_tracking::grantor_t &scratchpad, const float *src_scales,
const float *wei_scales) const {
auto loc_scales = scratchpad.template get<float>(key_conv_adjusted_scales);
const bool has_wei_scales
= !pd()->attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS);
int wei_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS);
float factor = (pd()->jcp_.signed_input && (!pd()->jcp_.has_vnni))
? 1.f / pd()->jcp_.wei_adj_scale
: 1.0f;
if (wei_mask == 0) {
utils::array_set(
loc_scales, src_scales[0] * wei_scales[0] * factor, 16);
} else {
if (has_wei_scales && wei_mask > 0) {
for (dim_t c = 0; c < pd()->OC(); c++)
loc_scales[c] = src_scales[0] * wei_scales[c] * factor;
} else {
utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor,
/* WHY: pd()->jcp_.simd_w = 0!!! */ 16);
}
return loc_scales;
}
20 changes: 13 additions & 7 deletions src/cpu/x64/jit_uni_dw_conv_kernel_utils.cpp
@@ -576,13 +576,19 @@ status_t jit_uni_dw_conv_bwd_weights_kernel<isa, kernel_dt>::init_conf(
= !is_data_layout_nxc && one_of(isa, avx512_core, avx2);
if (ok_to_pad_channels) { jcp.ngroups = rnd_up(jcp.ngroups, jcp.ch_block); }

bool args_ok = true
&& IMPLICATION(!is_data_layout_nxc, jcp.ngroups % jcp.ch_block == 0)
&& jcp.dilate_h == 0 && jcp.dilate_w == 0 && jcp.kw <= 3
&& jcp.stride_w <= jcp.kw // no gaps in kernel
&& jcp.oh == (jcp.ihp - jcp.kh) / jcp.stride_h + 1
&& jcp.ow == (jcp.iwp - jcp.kw) / jcp.stride_w + 1;
VDISPATCH_CONV_IC(args_ok, VERBOSE_BAD_PARAM, "");
VDISPATCH_CONV_IC(
IMPLICATION(!is_data_layout_nxc, jcp.ngroups % jcp.ch_block == 0),
VERBOSE_BAD_PARAM, "number of groups doesn't divide channel block");
VDISPATCH_CONV_IC(jcp.dilate_h == 0, VERBOSE_BAD_PARAM, "dilate_h");
VDISPATCH_CONV_IC(jcp.dilate_w == 0, VERBOSE_BAD_PARAM, "dilate_w");
VDISPATCH_CONV_IC(jcp.kw <= 3, VERBOSE_BAD_PARAM, "kw > 3");
// No gaps in the kernel.
VDISPATCH_CONV_IC(
jcp.stride_w <= jcp.kw, VERBOSE_BAD_PARAM, "stride_w > kw");
VDISPATCH_CONV_IC(jcp.oh == (jcp.ihp - jcp.kh) / jcp.stride_h + 1,
VERBOSE_BAD_PARAM, "oh != (ihp - kh) / stride_h + 1");
VDISPATCH_CONV_IC(jcp.ow == (jcp.iwp - jcp.kw) / jcp.stride_w + 1,
VERBOSE_BAD_PARAM, "ow != (iwp - kw) / stride_w + 1");

jcp.nb_ch = div_up(jcp.ngroups, jcp.ch_block);

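Splitting the single args_ok conjunction into per-condition VDISPATCH_CONV_IC checks changes diagnostics rather than behavior: verbose output can now name the exact constraint that rejected the configuration. A sketch of the pattern under a simplified, hypothetical macro:

#include <cstdio>

// Each constraint reports its own reason instead of one opaque "bad param"
// shared by all of them (stand-in macro, not oneDNN's).
#define DISPATCH_CHECK(cond, msg) \
    do { \
        if (!(cond)) { \
            std::fprintf(stderr, "skipped: %s\n", msg); \
            return false; \
        } \
    } while (0)

bool conf_ok(int dilate_h, int dilate_w, int kw, int stride_w) {
    DISPATCH_CHECK(dilate_h == 0, "dilate_h");
    DISPATCH_CHECK(dilate_w == 0, "dilate_w");
    DISPATCH_CHECK(kw <= 3, "kw > 3");
    DISPATCH_CHECK(stride_w <= kw, "stride_w > kw"); // no gaps in the kernel
    return true;
}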
4 changes: 2 additions & 2 deletions src/cpu/x64/jit_uni_ncsp_convolution.cpp
@@ -100,7 +100,7 @@ status_t reduction_helper_t::reshape_weights(
status_t reduction_helper_t::reshape_for_transpose(
memory_desc_t &o_md, memory_desc_t &i_md) {
const int ndims = i_md.ndims;
int *perm = new int[ndims];
std::vector<int> perm(ndims);
for (int dim = 0; dim < ndims; dim++) {
if (dim == ndims - 2)
perm[dim] = dim + 1;
@@ -109,7 +109,7 @@ status_t reduction_helper_t::reshape_for_transpose(
else
perm[dim] = dim;
}
return memory_desc_permute_axes(o_md, i_md, perm);
return memory_desc_permute_axes(o_md, i_md, perm.data());
}

bool reduction_helper_t::is_gemm() {
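Besides style, the std::vector change fixes a leak: the raw new int[ndims] was never delete[]-ed, so the buffer leaked on every call. The permutation itself only swaps the last two axes; a self-contained sketch:

#include <vector>

// Build the axis permutation that swaps the last two dimensions, i.e. the
// transpose used by the reduction path. For ndims == 4 it yields {0, 1, 3, 2}.
std::vector<int> transpose_last_two_axes(int ndims) {
    std::vector<int> perm(ndims);
    for (int dim = 0; dim < ndims; dim++) {
        if (dim == ndims - 2)
            perm[dim] = dim + 1;
        else if (dim == ndims - 1)
            perm[dim] = dim - 1;
        else
            perm[dim] = dim;
    }
    return perm; // pass perm.data() to C-style APIs that expect int*
}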
43 changes: 29 additions & 14 deletions src/cpu/x64/jit_uni_reorder_utils.cpp
@@ -199,14 +199,22 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
return po.len() == 0 || (po.len() == 1 && po.entry_[0].is_sum(false));
};

bool ok = im_d.is_blocking_desc() && om_d.is_blocking_desc()
&& !im_d.has_runtime_dims_or_strides() && !im_d.has_zero_dim()
&& !om_d.has_runtime_dims_or_strides() && !om_d.has_zero_dim()
&& attr->has_default_values(primitive_attr_t::skip_mask_t::scales
| primitive_attr_t::skip_mask_t::zero_points
| primitive_attr_t::skip_mask_t::post_ops)
&& check_post_ops(attr);
if (!ok) return unimplemented;
VDISPATCH_REORDER_IC(
im_d.is_blocking_desc(), VERBOSE_UNSUPPORTED_FORMAT_KIND);
VDISPATCH_REORDER_IC(
om_d.is_blocking_desc(), VERBOSE_UNSUPPORTED_FORMAT_KIND);
VDISPATCH_REORDER_IC(!im_d.has_zero_dim(), VERBOSE_EMPTY_TENSOR, "src");
VDISPATCH_REORDER_IC(!om_d.has_zero_dim(), VERBOSE_EMPTY_TENSOR, "dst");
VDISPATCH_REORDER_IC(!im_d.has_runtime_dims_or_strides(),
VERBOSE_RUNTIMEDIM_UNSUPPORTED);
VDISPATCH_REORDER_IC(!om_d.has_runtime_dims_or_strides(),
VERBOSE_RUNTIMEDIM_UNSUPPORTED);

using smask_t = primitive_attr_t::skip_mask_t;
VDISPATCH_REORDER_IC(attr->has_default_values(smask_t::scales
| smask_t::zero_points | smask_t::post_ops),
VERBOSE_UNSUPPORTED_ATTR);
VDISPATCH_REORDER_IC(check_post_ops(attr), VERBOSE_UNSUPPORTED_POSTOP);

bool is_tail_present = false;
dims_t iblocks, oblocks, i_tails, o_tails, i_paddings, o_paddings;
@@ -218,7 +226,8 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
const auto pdim = om_d.padded_dims()[d];
const auto cblock = oblocks[d];
// do not allow excess pdim other than required for rounding-up of dim.
if (utils::rnd_up(dim, cblock) != pdim) return unimplemented;
VDISPATCH_REORDER_IC(utils::rnd_up(dim, cblock) == pdim,
VERBOSE_UNSUPPORTED_PAD_FEATURE);
}

utils::array_set(i_tails, 0, im_d.ndims());
@@ -286,7 +295,11 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
= dst_mask == 0 ? scale_type_t::COMMON : scale_type_t::MANY;
}

if (src_mask != dst_mask) return status::unimplemented;
VDISPATCH_REORDER_IC(
IMPLICATION(p.src_scale_type != scale_type_t::NONE
&& p.dst_scale_type != scale_type_t::NONE,
src_mask == dst_mask),
VERBOSE_UNSUPPORTED_SCALES_CFG);

p.scale_adjust = (om_d.extra().flags & memory_extra_flags::scale_adjust)
? om_d.extra().scale_adjust
@@ -302,10 +315,12 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
return IMPLICATION(check, mask == (with_groups ? 0x3 : 0x1));
};

if (!mask_ok(p.req_s8s8_comp, om_d.extra().compensation_mask)
|| !mask_ok(p.req_asymmetric_comp,
om_d.extra().asymm_compensation_mask))
return status::unimplemented;
VDISPATCH_REORDER_IC(
mask_ok(p.req_s8s8_comp, om_d.extra().compensation_mask),
VERBOSE_UNSUPPORTED_MD_FLAG, "dst");
VDISPATCH_REORDER_IC(mask_ok(p.req_asymmetric_comp,
om_d.extra().asymm_compensation_mask),
VERBOSE_UNSUPPORTED_MD_FLAG, "dst");

ptrdiff_t ss[max_ndims] = {0}; // scales strides
if (p.src_scale_type == scale_type_t::MANY
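The old code rejected any src_mask != dst_mask, even when one side carried no scales at all; the new check demands matching masks only when both source and destination scales are present. IMPLICATION(cause, effect) in oneDNN is (!(cause) || (effect)); a compact sketch of the relaxed rule:

// Masks must agree only when both sides actually have scales.
constexpr bool implication(bool cause, bool effect) {
    return !cause || effect;
}

bool scale_masks_ok(bool has_src_scales, bool has_dst_scales, int src_mask,
        int dst_mask) {
    return implication(
            has_src_scales && has_dst_scales, src_mask == dst_mask);
}
// E.g. src-only scales with src_mask == 2, dst_mask == 0 now pass, where
// the old src_mask != dst_mask test rejected them.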
9 changes: 6 additions & 3 deletions src/cpu/x64/jit_uni_x8s8s32x_deconvolution.cpp
@@ -1451,15 +1451,18 @@ const float *jit_uni_x8s8s32x_deconvolution_fwd_t<isa>::adjust_oscales(
const memory_tracking::grantor_t &scratchpad, const float *src_scales,
const float *wei_scales) const {
auto loc_scales = scratchpad.template get<float>(key_conv_adjusted_scales);
const bool has_wei_scales
= !pd()->attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS);
int wei_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS);
float factor = (pd()->jcp_.signed_input && (!pd()->jcp_.has_vnni))
? 1.f / pd()->jcp_.wei_adj_scale
: 1.0f;
if (wei_mask == 0) {
utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor, 8);
} else {
if (has_wei_scales && wei_mask > 0) {
for (dim_t c = 0; c < pd()->OC(); c++)
loc_scales[c] = src_scales[0] * wei_scales[c] * factor;
} else {
utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor,
/* WHY: pd()->jcp_.simd_w = 0!!! */ 8);
}
return loc_scales;
}
58 changes: 50 additions & 8 deletions tests/benchdnn/matmul/matmul.cpp
@@ -413,6 +413,20 @@ int fill_data(data_kind_t kind, const prb_t *prb, const cfg_t &cfg,
density_args.n_acc = prb->k;
const auto density = cfg.get_density(density_args);

const auto &e_zp_src = prb->attr.zero_points.get(DNNL_ARG_SRC);
const bool has_src_zp = !e_zp_src.is_def();
const int src_zp_mask = attr_t::get_default_mask(e_zp_src.policy);
// Apply src_zp for source tensor only.
int src_zp = kind == SRC && has_src_zp && src_zp_mask == 0 ? e_zp_src.value
: 0;

const auto &e_zp_wei = prb->attr.zero_points.get(DNNL_ARG_WEIGHTS);
const bool has_wei_zp = !e_zp_wei.is_def();
const int wei_zp_mask = attr_t::get_default_mask(e_zp_wei.policy);
// Apply wei_zp for weights tensor only.
int wei_zp = kind == WEI && has_wei_zp && wei_zp_mask == 0 ? e_zp_wei.value
: 0;

/* Do fixed partitioning to have same filling for any number of threads */
const int64_t chunk_size = 64;
const int64_t n_chunks = div_up(nelems, chunk_size);
@@ -438,6 +452,7 @@ int fill_data(data_kind_t kind, const prb_t *prb, const cfg_t &cfg,
float val = 0;
while (val <= 0)
val = gen(int_seed);
val += src_zp + wei_zp; // Add zp so that it will be subtracted.
mem_fp.set_elem(
0, round_to_nearest_representable(cfg.get_dt(kind), val));
idx_start += 1;
@@ -453,6 +468,7 @@ int fill_data(data_kind_t kind, const prb_t *prb, const cfg_t &cfg,
val *= is_one;
} else {
val = is_one * gen(int_seed);
val += src_zp + wei_zp; // Add zp so that it will be subtracted.
}
mem_fp.set_elem(
idx, round_to_nearest_representable(cfg.get_dt(kind), val));
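The added src_zp/wei_zp terms mirror the library's dequantization, which computes (src - src_zp) * (wei - wei_zp): pre-adding a common (mask == 0) zero point to a generated value means the later subtraction recovers exactly the operand the test intended to accumulate. A trivial illustration of the round trip:

#include <cassert>

int main() {
    const int zp = 3;       // common zero point (mask == 0)
    const int intended = 5; // value the filling wants in the accumulation
    const int filled = intended + zp; // what fill_data now stores
    const int dequant = filled - zp;  // what the library computes
    assert(dequant == intended);      // zp cancels, range is preserved
    return 0;
}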
@@ -703,15 +719,41 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) {
}
}

// Check int4 weights byte alignment if format is specified.
if ((prb->wei_dt() == dnnl_s4 || prb->wei_dt() == dnnl_u4)
&& (prb->n % 2)) {
BENCHDNN_PRINT(2,
"[INVALID][%s:%d]: Int4 Weights decompression requires OC "
"('%d') to be even.\n",
__FILE__, __LINE__, (int)prb->n);
res->state = SKIPPED;
res->reason = skip_reason::invalid_case;
return;
&& (!prb->strides[WEI].empty()
|| (prb->wtag != tag::any && prb->wtag != tag::undef))) {
const auto &weights_rt_dims = get_runtime_dims(
prb->weights_dims(), prb->weights_runtime_dim_mask());
const auto wei_md
= dnn_mem_t::init_md(prb->ndims, weights_rt_dims.data(),
prb->wei_dt(), prb->wtag, prb->strides[STRIDES_WEI]);

const auto wei_strides = query_md_strides(wei_md);
int n_unit_strides = 0;
for (int d = 0; d < query_md_ndims(wei_md); d++) {
if (wei_strides[d] == 1) {
n_unit_strides++;
if (n_unit_strides > 1) {
BENCHDNN_PRINT(2,
"[INVALID][%s:%d]: Int4 Weights decompression "
"requires byte alignment for the tensor.\n",
__FILE__, __LINE__);
res->state = SKIPPED;
res->reason = skip_reason::invalid_case;
return;
}
}
if (wei_strides[d] > 1 && (wei_strides[d] % 2)) {
BENCHDNN_PRINT(2,
"[INVALID][%s:%d]: Int4 Weights decompression requires "
"byte alignment for the tensor.\n",
__FILE__, __LINE__);
res->state = SKIPPED;
res->reason = skip_reason::invalid_case;
return;
}
}
}

auto src_rt_mask = prb->src_runtime_dim_mask();
9 changes: 5 additions & 4 deletions tests/benchdnn/reorder/cfg.cpp
@@ -42,10 +42,11 @@ REG(f8_e5m2, -f16_max_exact, f16_max_exact);
REG(f8_e4m3, -f16_max_exact, f16_max_exact);
REG(f4_e2m1, -f16_max_exact, f16_max_exact);
REG(f4_e3m0, -f4_max_exact, f4_max_exact);
// Do not exceed max float value representable in integer. Otherwise, we get
// a correctness issue caused by different computations in reference and the
// library.
REG(s32, INT_MIN, BENCHDNN_S32_TO_F32_SAT_CONST);
// Do not exceed the min/max integer values exactly representable in float.
// Otherwise, we get correctness issues caused by different computations or
// roundings in the naive reference and the library. One such case is
// zero-point subtraction, which can underflow or overflow.
REG(s32, -BENCHDNN_S32_TO_F32_SAT_CONST, BENCHDNN_S32_TO_F32_SAT_CONST);
REG(s8, INT8_MIN, INT8_MAX);
REG(u8, 0, UINT8_MAX);
REG(s4, -7, 8);
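The widened lower bound matters because float has a 24-bit significand: not every int32_t is representable, and (float)INT32_MAX already rounds up to 2^31, which is out of range on the way back. Clamping the fill range symmetrically to BENCHDNN_S32_TO_F32_SAT_CONST (presumably a float-exact bound) keeps the f32 reference and the integer library path comparable. A small demonstration of the representability gap:

#include <cstdint>
#include <cstdio>

int main() {
    // INT32_MAX (2147483647) is not a float; it rounds to 2147483648.0f.
    std::printf("%.1f\n", static_cast<float>(INT32_MAX));
    // 2^24 is the last point where consecutive ints are all representable:
    const int32_t n = 1 << 24; // 16777216
    std::printf("%.1f %.1f\n", static_cast<float>(n),
            static_cast<float>(n + 1)); // both print 16777216.0
    return 0;
}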