cpu: aarch64: conv: Do not fall through to direct conv for BF16

fadara01 · vpirogov · commit 390d34c4ef01 · 2024-06-24T08:37:42.000-07:00
Indirect conv is faster than direct conv when the source, weights
and destination are all of type BF16, so 1x1 kernels are no longer
rejected by indirect conv in that case.
diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -316,8 +316,10 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
         const primitive_attr_t &attr) {
     if (weights_md.ndims != 4) return status::unimplemented;
 
-    // Indirect is slower for small convolution kernels
-    if (weights_md.dims[2] == 1 && weights_md.dims[3] == 1)
+    // Indirect is slower for small convolution kernels, except when src, weight and dst are BF16
+    if (weights_md.dims[2] == 1 && weights_md.dims[3] == 1
+            && !everyone_is(data_type::bf16, src_md.data_type,
+                    weights_md.data_type, dst_md.data_type))
         return status::unimplemented;
 
     CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));