Another bucket of fixes (and future bugs...) #2917

Merged: 12 commits, Mar 24, 2025
48 changes: 33 additions & 15 deletions src/common/matmul.cpp
@@ -354,21 +354,39 @@ status_t matmul_desc_init(matmul_desc_t *matmul_desc,
? utils::get_dims_mask(dst_desc->dims, op_d.bias_desc.dims, ndims)
: 0;

// TODO: requirement is for innermost dim to be multiple of 2 for
// the memory to be byte aligned.

// s4/u4/f4 weights requires n to be multiple of 2 to be byte aligned
VCHECK_MATMUL(IMPLICATION(utils::one_of(weights_desc->data_type,
data_type::s4, data_type::u4,
data_type::f4_e2m1, data_type::f4_e3m0),
weights_desc->dims[n_idx] % 2 == 0),
VERBOSE_BAD_DIM, "weights", n_idx);
// s4/u4/f4 src requires k to be multiple of 2 to be byte aligned
VCHECK_MATMUL(IMPLICATION(utils::one_of(src_desc->data_type, data_type::s4,
data_type::u4, data_type::f4_e2m1,
data_type::f4_e3m0),
src_desc->dims[k_idx_src] % 2 == 0),
VERBOSE_BAD_DIM, "src", n_idx);
using namespace data_type;
if (weights_desc->format_kind == format_kind::blocked
&& utils::one_of(
weights_desc->data_type, s4, u4, f4_e2m1, f4_e3m0)) {
const auto &wei_strides = weights_desc->format_desc.blocking.strides;

int n_unit_strides = 0;
for (int d = 0; d < ndims; d++) {
if (wei_strides[d] == 1) {
n_unit_strides++;
VCHECK_MATMUL(
n_unit_strides <= 1, VERBOSE_BAD_DIM, "weights", d);
}
VCHECK_MATMUL(
IMPLICATION(wei_strides[d] > 1, wei_strides[d] % 2 == 0),
VERBOSE_BAD_DIM, "weights", d);
}
}
if (src_desc->format_kind == format_kind::blocked
&& utils::one_of(src_desc->data_type, s4, u4, f4_e2m1, f4_e3m0)) {
const auto &src_strides = src_desc->format_desc.blocking.strides;

int n_unit_strides = 0;
for (int d = 0; d < ndims; d++) {
if (src_strides[d] == 1) {
n_unit_strides++;
VCHECK_MATMUL(n_unit_strides <= 1, VERBOSE_BAD_DIM, "src", d);
}
VCHECK_MATMUL(
IMPLICATION(src_strides[d] > 1, src_strides[d] % 2 == 0),
VERBOSE_BAD_DIM, "src", d);
}
}

// check if other dims match.
for (int d = 0; d < ndims - 2; ++d) {
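The new validation generalizes the even-n/even-k checks it deletes: s4/u4/f4 pack two elements per byte, so a blocked tensor stays byte-addressable only if at most one dimension has unit stride (the packed one) and every stride greater than 1 is even. A minimal standalone sketch of the same rule, with illustrative names rather than the library's API:

#include <cstdint>

// True when a 4-bit blocked tensor is byte-aligned: at most one dimension
// may have unit stride, and all strides > 1 must be even so every other
// dimension steps a whole number of bytes.
bool int4_strides_byte_aligned(const int64_t *strides, int ndims) {
    int n_unit_strides = 0;
    for (int d = 0; d < ndims; d++) {
        if (strides[d] == 1 && ++n_unit_strides > 1) return false;
        if (strides[d] > 1 && strides[d] % 2 != 0) return false;
    }
    return true;
}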
17 changes: 8 additions & 9 deletions src/cpu/x64/jit_avx512_core_x8s8s32x_convolution.cpp
@@ -42,19 +42,18 @@ const float *jit_avx512_core_x8s8s32x_convolution_fwd_t::adjust_oscales(
const memory_tracking::grantor_t &scratchpad, const float *src_scales,
const float *wei_scales) const {
auto loc_scales = scratchpad.template get<float>(key_conv_adjusted_scales);
const float src_scale = src_scales[0];
const bool has_wei_scales
= !pd()->attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS);
const int wei_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS);
float factor = (pd()->jcp_.signed_input && (!pd()->jcp_.has_vnni))
? 1.f / pd()->jcp_.wei_adj_scale
: 1.f;
switch (wei_mask) {
case 0:
utils::array_set(loc_scales, src_scale * wei_scales[0] * factor,
pd()->jcp_.simd_w);
break;
default:
for (dim_t c = 0; c < pd()->OC(); c++)
loc_scales[c] = src_scale * wei_scales[c] * factor;
if (has_wei_scales && wei_mask > 0) {
for (dim_t c = 0; c < pd()->OC(); c++)
loc_scales[c] = src_scales[0] * wei_scales[c] * factor;
} else {
utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor,
pd()->jcp_.simd_w);
}
return loc_scales;
}
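The rewrite folds the old switch (wei_mask) into a single branch: per-channel weight scales produce one adjusted value per output channel, while the default case broadcasts a common scale across a SIMD-width buffer. A hedged sketch of the selection, with simplified inputs standing in for pd() and jcp_:

#include <vector>

// Combine source and weight scales into the buffer the JIT kernel reads.
// 'factor' models the wei_adj_scale compensation applied for signed input
// on pre-VNNI hardware; names here are illustrative.
std::vector<float> make_adjusted_scales(float src_scale,
        const std::vector<float> &wei_scales, bool per_channel_wei_scales,
        float factor, int simd_w) {
    if (per_channel_wei_scales) {
        std::vector<float> loc(wei_scales.size());
        for (size_t c = 0; c < wei_scales.size(); c++)
            loc[c] = src_scale * wei_scales[c] * factor;
        return loc;
    }
    // Common scale: one value replicated simd_w times.
    return std::vector<float>(simd_w, src_scale * wei_scales[0] * factor);
}

The same reshuffle repeats in the two deconvolution files below, where the broadcast width is hardcoded (16, then 8) because jcp_.simd_w is zero at that point, as their in-line "WHY" comments flag.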
10 changes: 6 additions & 4 deletions src/cpu/x64/jit_avx512_core_x8s8s32x_deconvolution.cpp
@@ -1393,16 +1393,18 @@ const float *jit_avx512_core_x8s8s32x_deconvolution_fwd_t::adjust_oscales(
const memory_tracking::grantor_t &scratchpad, const float *src_scales,
const float *wei_scales) const {
auto loc_scales = scratchpad.template get<float>(key_conv_adjusted_scales);
const bool has_wei_scales
= !pd()->attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS);
int wei_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS);
float factor = (pd()->jcp_.signed_input && (!pd()->jcp_.has_vnni))
? 1.f / pd()->jcp_.wei_adj_scale
: 1.0f;
if (wei_mask == 0) {
utils::array_set(
loc_scales, src_scales[0] * wei_scales[0] * factor, 16);
} else {
if (has_wei_scales && wei_mask > 0) {
for (dim_t c = 0; c < pd()->OC(); c++)
loc_scales[c] = src_scales[0] * wei_scales[c] * factor;
} else {
utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor,
/* WHY: pd()->jcp_.simd_w = 0!!! */ 16);
}
return loc_scales;
}
20 changes: 13 additions & 7 deletions src/cpu/x64/jit_uni_dw_conv_kernel_utils.cpp
@@ -576,13 +576,19 @@ status_t jit_uni_dw_conv_bwd_weights_kernel<isa, kernel_dt>::init_conf(
= !is_data_layout_nxc && one_of(isa, avx512_core, avx2);
if (ok_to_pad_channels) { jcp.ngroups = rnd_up(jcp.ngroups, jcp.ch_block); }

bool args_ok = true
&& IMPLICATION(!is_data_layout_nxc, jcp.ngroups % jcp.ch_block == 0)
&& jcp.dilate_h == 0 && jcp.dilate_w == 0 && jcp.kw <= 3
&& jcp.stride_w <= jcp.kw // no gaps in kernel
&& jcp.oh == (jcp.ihp - jcp.kh) / jcp.stride_h + 1
&& jcp.ow == (jcp.iwp - jcp.kw) / jcp.stride_w + 1;
VDISPATCH_CONV_IC(args_ok, VERBOSE_BAD_PARAM, "");
VDISPATCH_CONV_IC(
IMPLICATION(!is_data_layout_nxc, jcp.ngroups % jcp.ch_block == 0),
VERBOSE_BAD_PARAM, "number of groups doesn't divide channel block");
VDISPATCH_CONV_IC(jcp.dilate_h == 0, VERBOSE_BAD_PARAM, "dilate_h");
VDISPATCH_CONV_IC(jcp.dilate_w == 0, VERBOSE_BAD_PARAM, "dilate_w");
VDISPATCH_CONV_IC(jcp.kw <= 3, VERBOSE_BAD_PARAM, "kw > 3");
// No gaps in the kernel.
VDISPATCH_CONV_IC(
jcp.stride_w <= jcp.kw, VERBOSE_BAD_PARAM, "stride_w > kw");
VDISPATCH_CONV_IC(jcp.oh == (jcp.ihp - jcp.kh) / jcp.stride_h + 1,
VERBOSE_BAD_PARAM, "oh != (ihp - kh) / stride_h + 1");
VDISPATCH_CONV_IC(jcp.ow == (jcp.iwp - jcp.kw) / jcp.stride_w + 1,
VERBOSE_BAD_PARAM, "ow != (iwp - kw) / stride_w + 1");

jcp.nb_ch = div_up(jcp.ngroups, jcp.ch_block);

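Splitting the single args_ok conjunction into per-condition VDISPATCH_CONV_IC checks changes diagnostics rather than behavior: verbose output can now name the exact constraint that rejected the configuration. A sketch of the pattern under a simplified, hypothetical macro:

#include <cstdio>

// Each constraint reports its own reason instead of one opaque "bad param"
// shared by all of them (stand-in macro, not oneDNN's).
#define DISPATCH_CHECK(cond, msg) \
    do { \
        if (!(cond)) { \
            std::fprintf(stderr, "skipped: %s\n", msg); \
            return false; \
        } \
    } while (0)

bool conf_ok(int dilate_h, int dilate_w, int kw, int stride_w) {
    DISPATCH_CHECK(dilate_h == 0, "dilate_h");
    DISPATCH_CHECK(dilate_w == 0, "dilate_w");
    DISPATCH_CHECK(kw <= 3, "kw > 3");
    DISPATCH_CHECK(stride_w <= kw, "stride_w > kw"); // no gaps in the kernel
    return true;
}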
4 changes: 2 additions & 2 deletions src/cpu/x64/jit_uni_ncsp_convolution.cpp
@@ -100,7 +100,7 @@ status_t reduction_helper_t::reshape_weights(
status_t reduction_helper_t::reshape_for_transpose(
memory_desc_t &o_md, memory_desc_t &i_md) {
const int ndims = i_md.ndims;
int *perm = new int[ndims];
std::vector<int> perm(ndims);
for (int dim = 0; dim < ndims; dim++) {
if (dim == ndims - 2)
perm[dim] = dim + 1;
@@ -109,7 +109,7 @@ status_t reduction_helper_t::reshape_for_transpose(
else
perm[dim] = dim;
}
return memory_desc_permute_axes(o_md, i_md, perm);
return memory_desc_permute_axes(o_md, i_md, perm.data());
}

bool reduction_helper_t::is_gemm() {
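Besides style, the std::vector change fixes a leak: the raw new int[ndims] was never delete[]-ed, so the buffer leaked on every call. The permutation itself only swaps the last two axes; a self-contained sketch:

#include <vector>

// Build the axis permutation that swaps the last two dimensions, i.e. the
// transpose used by the reduction path. For ndims == 4 it yields {0, 1, 3, 2}.
std::vector<int> transpose_last_two_axes(int ndims) {
    std::vector<int> perm(ndims);
    for (int dim = 0; dim < ndims; dim++) {
        if (dim == ndims - 2)
            perm[dim] = dim + 1;
        else if (dim == ndims - 1)
            perm[dim] = dim - 1;
        else
            perm[dim] = dim;
    }
    return perm; // pass perm.data() to C-style APIs that expect int*
}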
43 changes: 29 additions & 14 deletions src/cpu/x64/jit_uni_reorder_utils.cpp
@@ -199,14 +199,22 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
return po.len() == 0 || (po.len() == 1 && po.entry_[0].is_sum(false));
};

bool ok = im_d.is_blocking_desc() && om_d.is_blocking_desc()
&& !im_d.has_runtime_dims_or_strides() && !im_d.has_zero_dim()
&& !om_d.has_runtime_dims_or_strides() && !om_d.has_zero_dim()
&& attr->has_default_values(primitive_attr_t::skip_mask_t::scales
| primitive_attr_t::skip_mask_t::zero_points
| primitive_attr_t::skip_mask_t::post_ops)
&& check_post_ops(attr);
if (!ok) return unimplemented;
VDISPATCH_REORDER_IC(
im_d.is_blocking_desc(), VERBOSE_UNSUPPORTED_FORMAT_KIND);
VDISPATCH_REORDER_IC(
om_d.is_blocking_desc(), VERBOSE_UNSUPPORTED_FORMAT_KIND);
VDISPATCH_REORDER_IC(!im_d.has_zero_dim(), VERBOSE_EMPTY_TENSOR, "src");
VDISPATCH_REORDER_IC(!om_d.has_zero_dim(), VERBOSE_EMPTY_TENSOR, "dst");
VDISPATCH_REORDER_IC(!im_d.has_runtime_dims_or_strides(),
VERBOSE_RUNTIMEDIM_UNSUPPORTED);
VDISPATCH_REORDER_IC(!om_d.has_runtime_dims_or_strides(),
VERBOSE_RUNTIMEDIM_UNSUPPORTED);

using smask_t = primitive_attr_t::skip_mask_t;
VDISPATCH_REORDER_IC(attr->has_default_values(smask_t::scales
| smask_t::zero_points | smask_t::post_ops),
VERBOSE_UNSUPPORTED_ATTR);
VDISPATCH_REORDER_IC(check_post_ops(attr), VERBOSE_UNSUPPORTED_POSTOP);

bool is_tail_present = false;
dims_t iblocks, oblocks, i_tails, o_tails, i_paddings, o_paddings;
@@ -218,7 +226,8 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
const auto pdim = om_d.padded_dims()[d];
const auto cblock = oblocks[d];
// do not allow excess pdim other than required for rounding-up of dim.
if (utils::rnd_up(dim, cblock) != pdim) return unimplemented;
VDISPATCH_REORDER_IC(utils::rnd_up(dim, cblock) == pdim,
VERBOSE_UNSUPPORTED_PAD_FEATURE);
}

utils::array_set(i_tails, 0, im_d.ndims());
@@ -286,7 +295,11 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
= dst_mask == 0 ? scale_type_t::COMMON : scale_type_t::MANY;
}

if (src_mask != dst_mask) return status::unimplemented;
VDISPATCH_REORDER_IC(
IMPLICATION(p.src_scale_type != scale_type_t::NONE
&& p.dst_scale_type != scale_type_t::NONE,
src_mask == dst_mask),
VERBOSE_UNSUPPORTED_SCALES_CFG);

p.scale_adjust = (om_d.extra().flags & memory_extra_flags::scale_adjust)
? om_d.extra().scale_adjust
@@ -302,10 +315,12 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd,
return IMPLICATION(check, mask == (with_groups ? 0x3 : 0x1));
};

if (!mask_ok(p.req_s8s8_comp, om_d.extra().compensation_mask)
|| !mask_ok(p.req_asymmetric_comp,
om_d.extra().asymm_compensation_mask))
return status::unimplemented;
VDISPATCH_REORDER_IC(
mask_ok(p.req_s8s8_comp, om_d.extra().compensation_mask),
VERBOSE_UNSUPPORTED_MD_FLAG, "dst");
VDISPATCH_REORDER_IC(mask_ok(p.req_asymmetric_comp,
om_d.extra().asymm_compensation_mask),
VERBOSE_UNSUPPORTED_MD_FLAG, "dst");

ptrdiff_t ss[max_ndims] = {0}; // scales strides
if (p.src_scale_type == scale_type_t::MANY
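The old code rejected any src_mask != dst_mask, even when one side carried no scales at all; the new check demands matching masks only when both source and destination scales are present. IMPLICATION(cause, effect) in oneDNN is (!(cause) || (effect)); a compact sketch of the relaxed rule:

// Masks must agree only when both sides actually have scales.
constexpr bool implication(bool cause, bool effect) {
    return !cause || effect;
}

bool scale_masks_ok(bool has_src_scales, bool has_dst_scales, int src_mask,
        int dst_mask) {
    return implication(
            has_src_scales && has_dst_scales, src_mask == dst_mask);
}
// E.g. src-only scales with src_mask == 2, dst_mask == 0 now pass, where
// the old src_mask != dst_mask test rejected them.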
9 changes: 6 additions & 3 deletions src/cpu/x64/jit_uni_x8s8s32x_deconvolution.cpp
@@ -1451,15 +1451,18 @@ const float *jit_uni_x8s8s32x_deconvolution_fwd_t<isa>::adjust_oscales(
const memory_tracking::grantor_t &scratchpad, const float *src_scales,
const float *wei_scales) const {
auto loc_scales = scratchpad.template get<float>(key_conv_adjusted_scales);
const bool has_wei_scales
= !pd()->attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS);
int wei_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS);
float factor = (pd()->jcp_.signed_input && (!pd()->jcp_.has_vnni))
? 1.f / pd()->jcp_.wei_adj_scale
: 1.0f;
if (wei_mask == 0) {
utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor, 8);
} else {
if (has_wei_scales && wei_mask > 0) {
for (dim_t c = 0; c < pd()->OC(); c++)
loc_scales[c] = src_scales[0] * wei_scales[c] * factor;
} else {
utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * factor,
/* WHY: pd()->jcp_.simd_w = 0!!! */ 8);
}
return loc_scales;
}
58 changes: 50 additions & 8 deletions tests/benchdnn/matmul/matmul.cpp
@@ -413,6 +413,20 @@ int fill_data(data_kind_t kind, const prb_t *prb, const cfg_t &cfg,
density_args.n_acc = prb->k;
const auto density = cfg.get_density(density_args);

const auto &e_zp_src = prb->attr.zero_points.get(DNNL_ARG_SRC);
const bool has_src_zp = !e_zp_src.is_def();
const int src_zp_mask = attr_t::get_default_mask(e_zp_src.policy);
// Apply src_zp for source tensor only.
int src_zp = kind == SRC && has_src_zp && src_zp_mask == 0 ? e_zp_src.value
: 0;

const auto &e_zp_wei = prb->attr.zero_points.get(DNNL_ARG_WEIGHTS);
const bool has_wei_zp = !e_zp_wei.is_def();
const int wei_zp_mask = attr_t::get_default_mask(e_zp_wei.policy);
// Apply wei_zp for weights tensor only.
int wei_zp = kind == WEI && has_wei_zp && wei_zp_mask == 0 ? e_zp_wei.value
: 0;

/* Do fixed partitioning to have same filling for any number of threads */
const int64_t chunk_size = 64;
const int64_t n_chunks = div_up(nelems, chunk_size);
@@ -438,6 +452,7 @@ int fill_data(data_kind_t kind, const prb_t *prb, const cfg_t &cfg,
float val = 0;
while (val <= 0)
val = gen(int_seed);
val += src_zp + wei_zp; // Add zp so that it will be subtracted.
mem_fp.set_elem(
0, round_to_nearest_representable(cfg.get_dt(kind), val));
idx_start += 1;
@@ -453,6 +468,7 @@ int fill_data(data_kind_t kind, const prb_t *prb, const cfg_t &cfg,
val *= is_one;
} else {
val = is_one * gen(int_seed);
val += src_zp + wei_zp; // Add zp so that it will be subtracted.
}
mem_fp.set_elem(
idx, round_to_nearest_representable(cfg.get_dt(kind), val));
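The added src_zp/wei_zp terms mirror the library's dequantization, which computes (src - src_zp) * (wei - wei_zp): pre-adding a common (mask == 0) zero point to a generated value means the later subtraction recovers exactly the operand the test intended to accumulate. A trivial illustration of the round trip:

#include <cassert>

int main() {
    const int zp = 3;       // common zero point (mask == 0)
    const int intended = 5; // value the filling wants in the accumulation
    const int filled = intended + zp; // what fill_data now stores
    const int dequant = filled - zp;  // what the library computes
    assert(dequant == intended);      // zp cancels, range is preserved
    return 0;
}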
@@ -703,15 +719,41 @@ void skip_invalid_prb(const prb_t *prb, res_t *res) {
}
}

// Check int4 weights byte alignment if format is specified.
if ((prb->wei_dt() == dnnl_s4 || prb->wei_dt() == dnnl_u4)
&& (prb->n % 2)) {
BENCHDNN_PRINT(2,
"[INVALID][%s:%d]: Int4 Weights decompression requires OC "
"('%d') to be even.\n",
__FILE__, __LINE__, (int)prb->n);
res->state = SKIPPED;
res->reason = skip_reason::invalid_case;
return;
&& (!prb->strides[WEI].empty()
|| (prb->wtag != tag::any && prb->wtag != tag::undef))) {
const auto &weights_rt_dims = get_runtime_dims(
prb->weights_dims(), prb->weights_runtime_dim_mask());
const auto wei_md
= dnn_mem_t::init_md(prb->ndims, weights_rt_dims.data(),
prb->wei_dt(), prb->wtag, prb->strides[STRIDES_WEI]);

const auto wei_strides = query_md_strides(wei_md);
int n_unit_strides = 0;
for (int d = 0; d < query_md_ndims(wei_md); d++) {
if (wei_strides[d] == 1) {
n_unit_strides++;
if (n_unit_strides > 1) {
BENCHDNN_PRINT(2,
"[INVALID][%s:%d]: Int4 Weights decompression "
"requires byte alignment for the tensor.\n",
__FILE__, __LINE__);
res->state = SKIPPED;
res->reason = skip_reason::invalid_case;
return;
}
}
if (wei_strides[d] > 1 && (wei_strides[d] % 2)) {
BENCHDNN_PRINT(2,
"[INVALID][%s:%d]: Int4 Weights decompression requires "
"byte alignment for the tensor.\n",
__FILE__, __LINE__);
res->state = SKIPPED;
res->reason = skip_reason::invalid_case;
return;
}
}
}

auto src_rt_mask = prb->src_runtime_dim_mask();
9 changes: 5 additions & 4 deletions tests/benchdnn/reorder/cfg.cpp
@@ -42,10 +42,11 @@ REG(f8_e5m2, -f16_max_exact, f16_max_exact);
REG(f8_e4m3, -f16_max_exact, f16_max_exact);
REG(f4_e2m1, -f16_max_exact, f16_max_exact);
REG(f4_e3m0, -f4_max_exact, f4_max_exact);
// Do not exceed max float value representable in integer. Otherwise, we get
// a correctness issue caused by different computations in reference and the
// library.
REG(s32, INT_MIN, BENCHDNN_S32_TO_F32_SAT_CONST);
// Do not exceed the min/max integer values exactly representable in float.
// Otherwise, we get correctness issues caused by different computations or
// roundings in the naive reference and the library. One such case is
// zero-point subtraction, which can underflow or overflow.
REG(s32, -BENCHDNN_S32_TO_F32_SAT_CONST, BENCHDNN_S32_TO_F32_SAT_CONST);
REG(s8, INT8_MIN, INT8_MAX);
REG(u8, 0, UINT8_MAX);
REG(s4, -7, 8);
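The widened lower bound matters because float has a 24-bit significand: not every int32_t is representable, and (float)INT32_MAX already rounds up to 2^31, which is out of range on the way back. Clamping the fill range symmetrically to BENCHDNN_S32_TO_F32_SAT_CONST (presumably a float-exact bound) keeps the f32 reference and the integer library path comparable. A small demonstration of the representability gap:

#include <cstdint>
#include <cstdio>

int main() {
    // INT32_MAX (2147483647) is not a float; it rounds to 2147483648.0f.
    std::printf("%.1f\n", static_cast<float>(INT32_MAX));
    // 2^24 is the last point where consecutive ints are all representable:
    const int32_t n = 1 << 24; // 16777216
    std::printf("%.1f %.1f\n", static_cast<float>(n),
            static_cast<float>(n + 1)); // both print 16777216.0
    return 0;
}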