Skip to content

Commit 1d5136a

Browse files
luweizhou2016azhai219
authored and committed Dec 3, 2024
[FORK][FEATURE] cpu: Unify oc_block for inner product with heuristic
logic.
1 parent dc9ea95 commit 1d5136a

File tree

1 file changed

+16
-0
lines changed

1 file changed

+16
-0
lines changed
 

‎src/cpu/x64/jit_brgemm_inner_product_utils.cpp

+16
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,23 @@ int jit_brgemm_ip_conf_t::get_adjusted_oc_block() const {
357357
const bool is_f32_compute = !jbgp.is_bf32
358358
&& everyone_is(f32, jbgp.src_dt, jbgp.wei_dt, jbgp.dst_dt);
359359
const bool is_avx512 = is_superset(jbgp.isa, avx512_core);
360+
const bool is_avx2 = is_superset(jbgp.isa, avx2);
360361
const bool is_f32_compute_avx512 = is_f32_compute && is_avx512;
362+
const bool is_f32_compute_avx2 = !is_avx512 && is_avx2 && is_f32_compute;
363+
364+
// These heuristics are required to avoid using different weight layouts for different data shapes.
365+
// Applicability is limited to big weights only (like LLM use cases) since minimal memory consumption and
366+
// time for weights reorder are key optimization points there.
367+
const size_t wei_size = static_cast<size_t>(jbgp.ic * jbgp.oc) * types::data_type_size(jbgp.wei_dt);
368+
// Use an oc block of 32 if the weight size is >= 8MB on amx bf16 to optimize memory consumption.
369+
if (jbgp.is_amx && jbgp.wei_dt == bf16 && !jbgp.is_bf32 && wei_size >= 8 * (1 << 20))
370+
return 32;
371+
// Use an oc block of 64 if the weight size is >= 16MB on avx512 f32 to optimize memory consumption.
372+
if (is_f32_compute_avx512 && wei_size >= 16 * (1 << 20))
373+
return 64;
374+
// Use an oc block of 24 if the weight size is >= 16MB on avx2 f32 to optimize memory consumption.
375+
if (is_f32_compute_avx2 && wei_size >= 16 * (1 << 20))
376+
return 24;
361377

362378
// we can't change block size on forward and weights update (external)
363379
// if layout is set by user, for backward data it can be chosen different

0 commit comments

Comments
 (0)