
Commit 2cfff2b

theComputeKid authored and vpirogov committed
cpu: aarch64: Enable ACL stateless API for indirect conv
- Bump ACL requirements to 24.07 and document.
- Call stateless ACL APIs from oneDNN for indirect convolution.
- Update gitignore to handle the .cache folder for clangd code navigation.

Signed-off-by: Hamza Butt <hamza.butt@arm.com>
1 parent 02b794d commit 2cfff2b

8 files changed (+232 −150 lines)
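The central change is the move from ACL's stateful runtime functions (configured per primitive and guarded by a resource mapper) to the stateless operator API: the operator is configured once against tensor descriptors, reports its auxiliary memory through workspace(), and every run() receives all inputs, outputs, and temporaries through an ITensorPack. Below is a minimal sketch of that pattern, not taken from the commit itself; ConvOp stands in for the stateless ACL operator that oneDNN aliases as Op in acl_indirect_gemm_convolution.hpp (not shown in this diff), and configure() is assumed to have been called on it already.

#include <cstdint>
#include <vector>

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

// Sketch only: ConvOp is a placeholder for the stateless ACL conv operator.
template <typename ConvOp>
void run_stateless_conv(ConvOp &conv, const arm_compute::TensorInfo &src_info,
        const arm_compute::TensorInfo &wei_info,
        const arm_compute::TensorInfo &dst_info, void *src_buf, void *wei_buf,
        void *dst_buf) {
    // Wrap caller-owned buffers; import_memory() only acquires the pointer,
    // it does not allocate or copy.
    arm_compute::Tensor src, wei, dst;
    src.allocator()->init(src_info);
    wei.allocator()->init(wei_info);
    dst.allocator()->init(dst_info);
    src.allocator()->import_memory(src_buf);
    wei.allocator()->import_memory(wei_buf);
    dst.allocator()->import_memory(dst_buf);

    arm_compute::ITensorPack pack
            = {{arm_compute::TensorType::ACL_SRC_0, &src},
                    {arm_compute::TensorType::ACL_SRC_1, &wei},
                    {arm_compute::TensorType::ACL_DST, &dst}};

    // The operator reports its auxiliary memory instead of allocating it
    // internally. Plain vectors stand in for the oneDNN scratchpad here;
    // the real code books aligned scratchpad buffers per reported slot.
    const auto aux_mem = conv.workspace();
    std::vector<std::vector<uint8_t>> buffers(aux_mem.size());
    std::vector<arm_compute::Tensor> tmp(aux_mem.size());
    for (size_t i = 0; i < aux_mem.size(); ++i) {
        if (aux_mem[i].size == 0) continue;
        buffers[i].resize(aux_mem[i].size);
        const arm_compute::TensorInfo info(
                arm_compute::TensorShape(aux_mem[i].size), 1,
                arm_compute::DataType::U8);
        tmp[i].allocator()->init(info);
        tmp[i].allocator()->import_memory(buffers[i].data());
        pack.add_tensor(aux_mem[i].slot, &tmp[i]);
    }

    conv.prepare(pack); // one-off work such as weight pre-transposition
    conv.run(pack); // every run gets all I/O and temporaries from the pack
}

Because the operator holds no pointers to user memory, it can be shared across threads without the mutex and resource mapper that the old stateful path needed; that is what the deleted locking code in acl_indirect_gemm_convolution.cpp below reflects.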

.gitignore (+2 −1)

@@ -1,5 +1,6 @@
 #===============================================================================
 # Copyright 2019-2021 Intel Corporation
+# Copyright 2024 Arm Limited and affiliates.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,4 +30,4 @@ compile_commands.json
 .git-blame-ignore-revs
 **/.DS_Store
 __pycache__
-
+.cache

README.md (+1 −1)

@@ -171,7 +171,7 @@ On a CPU based on Arm AArch64 architecture, oneDNN CPU engine can be built with
 machine learning applications and provides AArch64 optimized implementations
 of core functions. This functionality currently requires that ACL is downloaded
 and built separately. See [Build from Source] section of the Developer Guide for
-details. oneDNN only supports Compute Library versions 24.04 or later.
+details. oneDNN only supports Compute Library versions 24.07 or later.
 
 [Arm Compute Library (ACL)]: https://github.com/arm-software/ComputeLibrary
 

cmake/ACL.cmake (+1 −1)

@@ -31,7 +31,7 @@ endif()
 
 find_package(ACL REQUIRED)
 
-set(ACL_MINIMUM_VERSION "24.04")
+set(ACL_MINIMUM_VERSION "24.07")
 
 if(ACL_FOUND)
     file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed)

src/common/memory_tracking.hpp (+2)

@@ -199,6 +199,7 @@ enum {
     key_conv_gemm_zp_src_comp,
     key_conv_int_dat_in_acc_dt,
     key_conv_padded_bias,
+    key_conv_permuted_weights,
     key_conv_rtus_space,
     key_conv_store_wsp,
     key_conv_tails,
@@ -225,6 +226,7 @@ enum {
     key_gemm_blocked_a,
     key_gemm_blocked_b,
     key_gemm_accumulator,
+    key_gemm_pretranspose,
     key_generic_acc,
     key_gnorm_cvt,
     key_gnorm_reduction,

src/cpu/aarch64/acl_convolution_utils.cpp (−37)

@@ -310,43 +310,6 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
     return status::success;
 }
 
-status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
-        memory_desc_t &weights_md, memory_desc_t &dst_md,
-        memory_desc_t &bias_md, const convolution_desc_t &cd,
-        const primitive_attr_t &attr) {
-    if (weights_md.ndims != 4) return status::unimplemented;
-
-    // Indirect is slower for small convolution kernels, except when src, weight and dst are BF16
-    if (weights_md.dims[2] == 1 && weights_md.dims[3] == 1
-            && !everyone_is(data_type::bf16, src_md.data_type,
-                    weights_md.data_type, dst_md.data_type))
-        return status::unimplemented;
-
-    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
-
-    // If we do not need to pad input channels for fast math mode then it would
-    // be faster to run convolution with im2row instead of using indirect kernel
-    int block_by = arm_compute::block_by(acp.weights_info.weight_format());
-    int ic = src_md.dims[1];
-    if (acp.fast_math && ic % block_by == 0) return status::unimplemented;
-
-    // clang-format off
-    // NOTE: indirect convolution method supports only nhwc layout.
-    ACL_CHECK_VALID(arm_compute::NEGEMMConv2d::validate(
-        &acp.src_tensor_info,
-        &acp.wei_tensor_info,
-        acp.with_bias ? &acp.bia_tensor_info : nullptr,
-        &acp.dst_tensor_info,
-        arm_compute::Conv2dInfo(acp.padstride_info,
-                                acp.dilation_info,
-                                acp.act_info,
-                                acp.fast_math,
-                                1, acp.weights_info)));
-    // clang-format on
-
-    return status::success;
-}
-
 status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &weights_md, memory_desc_t &dst_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,

src/cpu/aarch64/acl_convolution_utils.hpp (+109 −1)

@@ -34,6 +34,7 @@ struct acl_obj_t {
     arm_compute::Tensor wei_tensor;
     arm_compute::Tensor bia_tensor;
     arm_compute::Tensor dst_tensor;
+    arm_compute::experimental::MemoryRequirements aux_mem_req;
 };
 
 struct acl_conv_conf_t {
@@ -65,7 +66,7 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,
         const primitive_attr_t &attr);
 
-status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
+status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &weights_md, memory_desc_t &dst_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,
         const primitive_attr_t &attr);
@@ -81,6 +82,113 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
         const primitive_attr_t &attr);
 } // namespace acl_convolution_utils
 
+// Keys are anonymous with local linkage. So deduce the type automagically.
+using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);
+
+template <typename op_t, typename post_ops_t>
+status_t init_scratchpad(op_t &conv, memory_tracking::registrar_t &scratchpad,
+        const std::map<int, conv_key_t> &conv_keys, engine_t *engine,
+        post_ops_t &post_ops, dnnl::impl::post_ops_t &attr_post_ops,
+        arm_compute::ActivationLayerInfo &act_info, bool &use_dst_acc_for_sum,
+        const dnnl::impl::memory_desc_t &dst_md) {
+
+    // Book temp mem.
+    const auto aux_mem_req = conv.workspace();
+    for (const auto &key : conv_keys) {
+        const auto id = key.first;
+        if (aux_mem_req[id].size > 0) {
+            scratchpad.book(key.second, aux_mem_req[id].size, 1,
+                    aux_mem_req[id].alignment, aux_mem_req[id].alignment);
+        }
+    }
+
+    CHECK(post_ops.init(engine, attr_post_ops, dst_md, act_info));
+    use_dst_acc_for_sum = post_ops.has_sum();
+
+    if (use_dst_acc_for_sum) {
+        const memory_desc_wrapper dst_d(&dst_md);
+        scratchpad.book(memory_tracking::names::key_generic_acc, dst_d.nelems(),
+                dst_d.data_type_size());
+    }
+
+    return status::success;
+}
+
+template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
+        typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
+        typename bia_data_t = src_data_t>
+status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
+        conv_obj_t *acl_conv_obj, const conv_pd_t *pd,
+        const std::map<int, conv_key_t> &conv_keys) {
+
+    auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
+    auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
+
+    // import_memory() and free() methods do not allocate/free any additional
+    // memory, only acquire/release pointers.
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor wei_tensor;
+    arm_compute::Tensor bia_tensor = nullptr;
+    arm_compute::Tensor dst_tensor;
+
+    auto const acp = pd->acp_;
+
+    src_tensor.allocator()->init(acp.src_tensor_info);
+    wei_tensor.allocator()->init(acp.wei_tensor_info);
+    dst_tensor.allocator()->init(acp.dst_tensor_info);
+
+    src_tensor.allocator()->import_memory(const_cast<src_data_t *>(src_base));
+    wei_tensor.allocator()->import_memory(const_cast<wei_data_t *>(wei_base));
+
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = acp.use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
+    dst_tensor.allocator()->import_memory(dst_base);
+
+    if (acp.with_bias) {
+        auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
+        bia_tensor.allocator()->init(acp.bia_tensor_info);
+        bia_tensor.allocator()->import_memory(
+                const_cast<bia_data_t *>(bia_base));
+    }
+
+    arm_compute::ITensorPack pack
+            = {{arm_compute::TensorType::ACL_SRC_0, &src_tensor},
+                    {arm_compute::TensorType::ACL_SRC_1, &wei_tensor},
+                    {arm_compute::TensorType::ACL_SRC_2, &bia_tensor},
+                    {arm_compute::TensorType::ACL_DST, &dst_tensor}};
+
+    // Get temp workspaces.
+    const auto aux_mem = acl_conv_obj->aux_mem_req;
+
+    // Hold onto tmp tensors while we need pack.
+    std::vector<arm_compute::Tensor> tmp_tensors(aux_mem.size());
+    for (const auto &key : conv_keys) {
+        const auto id = key.first;
+        if (aux_mem[id].size > 0) {
+            const auto info = arm_compute::TensorInfo(
+                    arm_compute::TensorShape(aux_mem[id].size), 1,
+                    arm_compute::DataType::U8);
+            auto buffer = scratchpad.get<void>(key.second);
+            tmp_tensors[id].allocator()->init(info, aux_mem[id].alignment);
+            tmp_tensors[id].allocator()->import_memory(buffer);
+            pack.add_tensor(aux_mem[id].slot, &tmp_tensors[id]);
+        }
+    }
+
+    acl_conv_obj->conv.prepare(pack);
+    acl_conv_obj->conv.run(pack);
+
+    void *dst = dst_tensor.buffer();
+    pd->post_ops.execute(ctx, dst);
+
+    return status::success;
+}
+
 template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
         typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
         typename bia_data_t = src_data_t>

src/cpu/aarch64/acl_indirect_gemm_convolution.cpp (+102 −14)

@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2022 Arm Ltd. and affiliates
+* Copyright 2021-2022, 2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,27 +14,115 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
+#include "acl_indirect_gemm_convolution.hpp"
+#include "acl_convolution_utils.hpp"
+#include "common/memory_tracking.hpp"
+#include "common/utils.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace aarch64 {
 
+namespace {
+// Keys are anonymous. So deduce the type automagically.
+using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);
+
+// Map: [slot , key]
+const std::map<int, conv_key_t> indirect_conv_keys
+        = {{0, conv_key_t::key_gemm_tmp_buffer},
+                {2, conv_key_t::key_gemm_pretranspose},
+                {3, conv_key_t::key_conv_permuted_weights}};
+} // namespace
+
+status_t acl_indirect_gemm_convolution_fwd_t::init(engine_t *engine) {
+    auto acp_ = pd()->acp_;
+    acl_obj_->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info,
+            arm_compute::Conv2dInfo(acp_.padstride_info, acp_.dilation_info,
+                    acp_.act_info, acp_.fast_math, 1, acp_.weights_info));
+    acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
+    return status::success;
+}
+
 status_t acl_indirect_gemm_convolution_fwd_t::execute_forward(
         const exec_ctx_t &ctx) const {
-    // Lock here is needed because resource_mapper does not support
-    // concurrent multithreaded access.
-    std::lock_guard<std::mutex> _lock {this->mtx};
-    // Retrieve primitive resource and configured Compute Library objects
-    auto *acl_resource
-            = ctx.get_resource_mapper()->get<acl_indirect_gemm_resource_t>(
-                    this);
-    acl_obj_t<arm_compute::NEGEMMConv2d> &acl_indirect_gemm_obj
-            = acl_resource->get_acl_obj();
-
-    return execute_forward_conv_acl<acl_obj_t<arm_compute::NEGEMMConv2d>, pd_t,
-            data_t>(ctx, acl_indirect_gemm_obj, pd());
+    return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, data_t>(
+            ctx, acl_obj_.get(), pd(), indirect_conv_keys);
+}
+
+status_t acl_indirect_gemm_convolution_fwd_t::create_resource(
+        engine_t *engine, resource_mapper_t &mapper) const {
+
+    CHECK(pd()->post_ops.create_resource(engine, mapper));
+    return status::success;
+}
+
+status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init_conf() {
+    if (weights_md_.ndims != 4) return status::unimplemented;
+
+    // Indirect is slower for small convolution kernels, except when src, weight and dst are BF16
+    if (weights_md_.dims[2] == 1 && weights_md_.dims[3] == 1
+            && !dnnl::impl::utils::everyone_is(data_type::bf16,
+                    src_md_.data_type, weights_md_.data_type,
+                    dst_md_.data_type))
+        return status::unimplemented;
+
+    CHECK(acl_convolution_utils::acl_init_conf(
+            acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));
+
+    // If we do not need to pad input channels for fast math mode then it would
+    // be faster to run convolution with im2row instead of using indirect kernel
+    int block_by = arm_compute::block_by(acp_.weights_info.weight_format());
+    int ic = src_md_.dims[1];
+    if (acp_.fast_math && ic % block_by == 0) return status::unimplemented;
+
+    // clang-format off
+    // NOTE: indirect convolution method supports only nhwc layout.
+    ACL_CHECK_VALID(Op::validate(
+        &acp_.src_tensor_info,
+        &acp_.wei_tensor_info,
+        acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+        &acp_.dst_tensor_info,
+        arm_compute::Conv2dInfo(acp_.padstride_info,
+                                acp_.dilation_info,
+                                acp_.act_info,
+                                acp_.fast_math,
+                                1, acp_.weights_info)));
+    // clang-format on
+
+    return status::success;
+}
+
+status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init(engine_t *engine) {
+    using namespace data_type;
+    using smask_t = primitive_attr_t::skip_mask_t;
+
+    const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
+            && attr()->has_default_values(smask_t::post_ops, f16);
+    const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
+            && attr()->has_default_values(
+                    smask_t::post_ops | smask_t::fpmath_mode, f32);
+    bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct)
+            && utils::one_of(true, is_fp16_ok, is_fp32_ok)
+            && !has_zero_dim_memory();
+    if (!ok) return status::unimplemented;
+
+    CHECK(init_conf());
+
+    // Book memory.
+    Op conv;
+    conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info,
+            arm_compute::Conv2dInfo(acp_.padstride_info, acp_.dilation_info,
+                    acp_.act_info, acp_.fast_math, 1, acp_.weights_info));
+
+    auto scratchpad = scratchpad_registry().registrar();
+    return init_scratchpad(conv, scratchpad, indirect_conv_keys, engine,
+            post_ops, attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum,
+            dst_md_);
 }
 
 } // namespace aarch64
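For context, a hedged usage sketch that is not part of the commit: applications never call these classes directly. They create a convolution through the public oneDNN API, and on an AArch64 build with ACL the dispatcher may select the indirect GEMM implementation above for nhwc f32/f16 shapes such as the one below. The shapes are illustrative and the chosen implementation is not guaranteed.

#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    // 1x64x56x56 input, 64 3x3 filters, stride 1, padding 1 (same spatial dims).
    const memory::dim N = 1, IC = 64, IH = 56, IW = 56, OC = 64, KH = 3, KW = 3;
    memory::desc src_md({N, IC, IH, IW}, memory::data_type::f32,
            memory::format_tag::nhwc);
    memory::desc wei_md({OC, IC, KH, KW}, memory::data_type::f32,
            memory::format_tag::any);
    memory::desc dst_md({N, OC, IH, IW}, memory::data_type::f32,
            memory::format_tag::nhwc);

    // The indirect conv path requires nhwc; which implementation is picked
    // still depends on the build and the shape.
    auto pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            src_md, wei_md, dst_md, {1, 1}, {1, 1}, {1, 1});

    memory src(pd.src_desc(), eng), wei(pd.weights_desc(), eng),
            dst(pd.dst_desc(), eng);
    convolution_forward(pd).execute(strm,
            {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, wei},
                    {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}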
