cpu: aarch64: make softmax ops use stateless ACL interface #2136

Merged
merged 3 commits on Oct 21, 2024
Changes from 2 commits
2 changes: 1 addition & 1 deletion README.md
@@ -173,7 +173,7 @@ On a CPU based on Arm AArch64 architecture, oneDNN CPU engine can be built with
machine learning applications and provides AArch64 optimized implementations
of core functions. This functionality currently requires that ACL is downloaded
and built separately. See [Build from Source] section of the Developer Guide for
details. oneDNN only supports Compute Library versions 24.08.1 or later.
details. oneDNN only supports Compute Library versions 24.09 or later.

[Arm Compute Library (ACL)]: https://github.com/arm-software/ComputeLibrary

2 changes: 1 addition & 1 deletion cmake/ACL.cmake
@@ -31,7 +31,7 @@ endif()

find_package(ACL REQUIRED)

set(ACL_MINIMUM_VERSION "24.08.1")
set(ACL_MINIMUM_VERSION "24.09")

if(ACL_FOUND)
file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed)
148 changes: 134 additions & 14 deletions src/cpu/aarch64/acl_softmax.cpp
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2021-2022 Arm Ltd. and affiliates
* Copyright 2021-2022, 2024 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,27 +21,147 @@ namespace impl {
namespace cpu {
namespace aarch64 {

status_t acl_softmax_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
const acl_softmax_fwd_t::pd_t *acl_softmax_fwd_t::pd() const {
return static_cast<const pd_t *>(primitive_t::pd().get());
}

status_t acl_softmax_fwd_t::pd_t::init(engine_t *engine) {

bool ok = is_fwd()
&& set_default_formats() == status::success
// ACL only supports matching src/dst (this must come after
// set_default_formats() to handle format_kind::any)
&& *src_md() == *dst_md()
&& utils::one_of(
src_md()->data_type, data_type::f32, data_type::f16)
&& attr()->has_default_values();
if (!ok) return status::unimplemented;

// Get memory desc to find sizes and dims
const memory_desc_wrapper src_d(src_md());
const data_type_t data_type = src_d.data_type();

// ACL only supports plain tensors, can be permuted but not blocked
if (!src_d.is_plain()) return status::unimplemented;

// Guards against a 0-sized dimension
if (src_d.has_zero_dim()) return status::unimplemented;

// No scaling
asp_.beta = 1;

asp_.is_logsoftmax = is_logsoftmax();

// The strides give us the in memory inner size
dim_t inner_size_ = src_d.blocking_desc().strides[axis()];

dim_t axis_size_ = axis_size();

// The outer size is any left-over dimensions not inner or on the axis
dim_t outer_size_ = src_d.nelems() / (inner_size_ * axis_size_);

// In this context, NHWC tells ACL that the logical and physical
// dimensions are the same
arm_compute::DataLayout acl_layout = arm_compute::DataLayout::NHWC;

const arm_compute::DataType acl_data_t
= acl_utils::get_acl_data_t(data_type);

const int threads = dnnl_get_max_threads();
if (inner_size_ == 1) {
// A rough empirical heuristic created by fitting a polynomial
// of the tensor sizes and thread count to the run time of the
// ref and ACL softmax. This variable is greater than zero when
// ref is faster, and less than zero when ACL is faster. We can
// interpret the constant term as the constant overhead
// associated with calling the external library and the negative
// coefficient on total_size as ACL being faster at processing
// each element
double acl_ref_performance_diff = 1 + 0.005 * outer_size_
- 0.0027 * axis_size_
* std::ceil(double(outer_size_) / threads);
if (threads > 1 || outer_size_ > 1) {
// Using threads within ACL adds another constant overhead
acl_ref_performance_diff += 17;
}
if (acl_ref_performance_diff > 0) return status::unimplemented;

// If the inner size is 1, we can get rid of the dimension.
        // This stops ACL doing an unnecessary permute
arm_compute::TensorShape acl_tensor_shape
= arm_compute::TensorShape(axis_size_, outer_size_);
asp_.axis = 0;

// Lock here is needed because resource_mapper does not support
// concurrent multithreaded access.
std::lock_guard<std::mutex> _lock {this->mtx};
asp_.src_info = arm_compute::TensorInfo(
acl_tensor_shape, 1, acl_data_t, acl_layout);
asp_.dst_info = arm_compute::TensorInfo(
acl_tensor_shape, 1, acl_data_t, acl_layout);
} else {
// A rough empirical heuristic, see comment above
// The only difference here is that ACL does a reorder, and so
// is considerably better
double acl_ref_performance_diff = 1 + 0.005 * outer_size_
- 0.01 * inner_size_ * axis_size_
* std::ceil(double(outer_size_) / threads);
if (threads > 1 || outer_size_ > 1) {
// Using threads within ACL adds another constant overhead
acl_ref_performance_diff += 17;
}

if (acl_ref_performance_diff > 0) return status::unimplemented;

// Irrespective of the input dimensions, we construct a tensor
// with dimensions such that softmax can be applied over the
// middle axis (1), with the correct stride and vector length.
arm_compute::TensorShape acl_tensor_shape = arm_compute::TensorShape(
inner_size_, axis_size_, outer_size_);
asp_.axis = 1;

asp_.src_info = arm_compute::TensorInfo(
acl_tensor_shape, 1, acl_data_t, acl_layout);
asp_.dst_info = arm_compute::TensorInfo(
acl_tensor_shape, 1, acl_data_t, acl_layout);
}

// Validate manually to check for return status
ACL_CHECK_VALID(arm_compute::experimental::op::CpuSoftmax::validate(
&asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis));

return status::success;
}
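The heuristic above is easier to follow with concrete numbers. Below is a minimal standalone sketch (not oneDNN code) of the inner_size == 1 branch: the coefficients come from the patch, while the example shapes and thread count are hypothetical.

```cpp
#include <cmath>
#include <cstdio>

// Positive result: reference softmax is expected to be faster; negative: ACL.
static double acl_ref_performance_diff(
        double outer_size, double axis_size, int threads) {
    double diff = 1 + 0.005 * outer_size
            - 0.0027 * axis_size * std::ceil(outer_size / threads);
    // Using threads within ACL adds another constant overhead.
    if (threads > 1 || outer_size > 1) diff += 17;
    return diff;
}

int main() {
    // Long softmax axis: ACL's per-element advantage outweighs the overhead.
    std::printf("outer=128 axis=1000: %.2f -> %s\n",
            acl_ref_performance_diff(128, 1000, 16),
            acl_ref_performance_diff(128, 1000, 16) <= 0 ? "ACL" : "ref");
    // Short axis: the fixed call and threading overhead dominates.
    std::printf("outer=128 axis=10:   %.2f -> %s\n",
            acl_ref_performance_diff(128, 10, 16),
            acl_ref_performance_diff(128, 10, 16) <= 0 ? "ACL" : "ref");
    return 0;
}
```

For a 128x1000 problem on 16 threads the difference comes out around -3 (ACL wins), while 128x10 stays around +18 (reference wins), which is the trade-off the comment describes.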

status_t acl_softmax_fwd_t::init(engine_t *engine) {
auto asp = pd()->asp_;

auto op = std::make_unique<arm_compute::experimental::op::CpuSoftmax>();

softmax_op_ = std::move(op);
    // Configure the softmax operation; memory allocation happens here.
softmax_op_->configure(&asp.src_info, &asp.dst_info, asp.beta, asp.axis,
asp.is_logsoftmax);

return status::success;
}

status_t acl_softmax_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
auto src = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
auto dst = CTX_OUT_MEM(void *, DNNL_ARG_DST);

// Retrieve primitive resource and configured Compute Library objects
auto *acl_resource
= ctx.get_resource_mapper()->get<acl_softmax_resource_t>(this);
acl_softmax_obj_t &acl_obj = acl_resource->get_acl_obj();
auto asp = pd()->asp_;

arm_compute::Tensor src_tensor;
arm_compute::Tensor dst_tensor;

acl_obj.src_tensor.allocator()->import_memory(const_cast<void *>(src));
acl_obj.dst_tensor.allocator()->import_memory(dst);
src_tensor.allocator()->init(asp.src_info);
src_tensor.allocator()->import_memory(const_cast<void *>(src));
dst_tensor.allocator()->init(asp.dst_info);
dst_tensor.allocator()->import_memory(dst);

acl_obj.softmax->run();
arm_compute::ITensorPack run_pack {
{arm_compute::TensorType::ACL_SRC_0, &src_tensor},
{arm_compute::TensorType::ACL_DST, &dst_tensor}};

acl_obj.src_tensor.allocator()->free();
acl_obj.dst_tensor.allocator()->free();
softmax_op_->run(run_pack);

return status::success;
}
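For readers new to the stateless interface, here is a hedged standalone sketch of the pattern execute_forward() now follows: the operator is configured once, and each execution wraps the caller's buffers in a fresh ITensorPack, so no mutable state is shared between concurrent runs, which is why the resource_mapper lookup is no longer needed here. The header paths and helper name are assumptions; the import_memory and run calls mirror the diff above.

```cpp
#include <arm_compute/core/ITensorPack.h>
#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/runtime/Tensor.h>
// Assumed header location for the experimental stateless operator (ACL >= 24.09).
#include <arm_compute/runtime/experimental/operators/CpuSoftmax.h>

namespace acl = arm_compute;

// Hypothetical helper: run an already-configured stateless softmax over
// user-owned src/dst buffers. Safe to call from several threads at once,
// since all per-call state lives in the local Tensor/ITensorPack objects.
void run_stateless_softmax(acl::experimental::op::CpuSoftmax &op,
        const acl::TensorInfo &src_info, const acl::TensorInfo &dst_info,
        const void *src_buf, void *dst_buf) {
    acl::Tensor src, dst;
    src.allocator()->init(src_info);
    src.allocator()->import_memory(const_cast<void *>(src_buf)); // no copy
    dst.allocator()->init(dst_info);
    dst.allocator()->import_memory(dst_buf);

    acl::ITensorPack pack {{acl::TensorType::ACL_SRC_0, &src},
            {acl::TensorType::ACL_DST, &dst}};
    op.run(pack); // the operator itself holds no per-execution state
}
```

This mirrors the new execute_forward() body: configure() was already called once in acl_softmax_fwd_t::init(), so the hot path is just two import_memory calls and a run().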