azhai219 · Dec 24, 2024
diff --git a/.github/automation/build_acl.sh b/.github/automation/build_acl.sh
+1 -1
diff --git a/README.md b/README.md
+1 -1
diff --git a/cmake/ACL.cmake b/cmake/ACL.cmake
+1 -1
diff --git a/src/common/memory_tracking.hpp b/src/common/memory_tracking.hpp
-15
diff --git a/src/cpu/acl/acl_batch_normalization.hpp b/src/cpu/acl/acl_batch_normalization.hpp
+2
diff --git a/src/cpu/acl/acl_binary.cpp b/src/cpu/acl/acl_binary.cpp
+15 -189
@@ -28,7 +28,7 @@ source ${SCRIPT_DIR}/common_aarch64.sh
 
 ACL_CONFIG=${ACL_CONFIG:-"Release"}
 ACL_ROOT_DIR=${ACL_ROOT_DIR:-"${PWD}/ComputeLibrary"}
-ACL_VERSION=${ACL_VERSION:-v24.11.1}
+ACL_VERSION=${ACL_VERSION:-v24.09}
 ACL_ARCH=${ACL_ARCH:-"armv8.2-a"}
 ACL_REPO="https://github.com/ARM-software/ComputeLibrary.git"
 
 
@@ -173,7 +173,7 @@ On a CPU based on Arm AArch64 architecture, oneDNN CPU engine can be built with
 machine learning applications and provides AArch64 optimized implementations
 of core functions. This functionality currently requires that ACL is downloaded
 and built separately. See [Build from Source] section of the Developer Guide for
-details. oneDNN only supports Compute Library versions 24.11.1 or later.
+details. oneDNN only supports Compute Library versions 24.09 or later.
 
 [Arm Compute Library (ACL)]: https://github.com/arm-software/ComputeLibrary
 
 
@@ -31,7 +31,7 @@ endif()
 
 find_package(ACL REQUIRED)
 
-set(ACL_MINIMUM_VERSION "24.11.1")
+set(ACL_MINIMUM_VERSION "24.09")
 
 if(ACL_FOUND)
     file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed)
 
@@ -200,9 +200,6 @@ enum {
     key_conv_gemm_zp_src_comp,
     key_conv_int_dat_in_acc_dt,
     key_conv_padded_bias,
-    key_conv_permuted_inputs,
-    key_conv_permuted_outputs,
-    key_conv_permuted_weights,
     key_conv_rtus_space,
     key_conv_store_wsp,
     key_conv_tails,
@@ -225,20 +222,10 @@ enum {
     key_eltwise_src,
     key_fusion_forward_scratchpad,
     key_fusion_inout_buffer,
-    key_gemm_asm_tmp_buffer,
     key_gemm_tmp_buffer,
     key_gemm_blocked_a,
     key_gemm_blocked_b,
     key_gemm_accumulator,
-    key_gemm_interleaved_lhs,
-    key_gemm_mm_result_s32,
-    key_gemm_mm_signed_a,
-    key_gemm_mm_signed_output,
-    key_gemm_output,
-    key_gemm_pretranspose,
-    key_gemm_pretranspose_b,
-    key_gemm_pretransposed_rhs,
-    key_gemm_transposed_1xwrhs,
     key_generic_acc,
     key_gnorm_cvt,
     key_gnorm_reduction,
@@ -311,11 +298,9 @@ enum {
     key_softmax_interim_store,
     key_sum_reduction,
     key_sum_srcs_cvt,
-    key_wino_transformed_weights,
     key_wino_U,
     key_wino_V,
     key_wino_M,
-    key_wino_workspace,
     key_decompression_scales,
     key_decompression_zero_points,
     key_src_quantized,
 
@@ -258,6 +258,8 @@ struct acl_batch_normalization_fwd_t : public primitive_t {
         CHECK(r->configure(pd()->abp, pd()));
         mapper.add(this, std::move(r));
 
+        CHECK(pd()->post_ops.create_resource(engine, mapper));
+
         return status::success;
     }
 
 
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022, 2024 Arm Ltd. and affiliates
+* Copyright 2022 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,198 +16,32 @@
 
 #include "acl_binary.hpp"
 
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/experimental/operators/CpuAdd.h"
-#include "arm_compute/runtime/experimental/operators/CpuElementwise.h"
-#include "arm_compute/runtime/experimental/operators/CpuMul.h"
-#include "arm_compute/runtime/experimental/operators/CpuSub.h"
-
 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace acl {
 
-status_t acl_binary_t::pd_t::init(engine_t *engine) {
-    using namespace acl_utils;
-
-    // Only support f16/f32/s32 for now
-    data_type_t ddt = dst_md(0)->data_type;
-    if (!utils::one_of(ddt, data_type::f16, data_type::f32, data_type::s32))
-        return status::unimplemented;
-
-    // Only support src and dst all matching for now
-    if (ddt != src_md(0)->data_type || src_md(1)->data_type != ddt)
-        return status::unimplemented;
-
-    // Sets the memory format of dst from any to src_md(0) blocking desc
-    CHECK(set_default_params());
-
-    if (!attr()->has_default_values()) return status::unimplemented;
-
-    asp_.alg = desc()->alg_kind;
-
-    // All the algorithms we support
-    if (!utils::one_of(asp_.alg, alg_kind::binary_add, alg_kind::binary_sub,
-                alg_kind::binary_mul, alg_kind::binary_div,
-                alg_kind::binary_max, alg_kind::binary_min))
-        return status::unimplemented;
-
-    // s32 div in ACL does not round as oneDNN expects
-    if (ddt == data_type::s32 && asp_.alg == alg_kind::binary_div)
-        return status::unimplemented;
-
-    // ACL pointwise arithmetic operators assume that the innermost
-    // dimensions are dense for src0, src1 and dst. Reordering the
-    // logical dimensions by stride does this (if reordered_dims >= 1 )
-    // and also makes memory accesses contiguous in ACL (without any
-    // data reordering).
-    memory_desc_t src_d0_permed, src_d1_permed, dst_d_permed;
-    int reordered_dims = reorder_dimensions_by_stride(
-            {&src_d0_permed, &src_d1_permed, &dst_d_permed},
-            {src_md(0), src_md(1), dst_md()});
-    if (reordered_dims < 1) return status::unimplemented;
-
-    // Create ACL tensor infos with permuted descs
-    CHECK(tensor_info(asp_.src0_info, src_d0_permed));
-    CHECK(tensor_info(asp_.src1_info, src_d1_permed));
-    CHECK(tensor_info(asp_.dst_info, dst_d_permed));
-
-    // In this case ACL tries to treat src0 and src1 as a 1D array, but
-    // fails because the strides aren't equal. TODO: remove when fixed
-    // in ACL
-    if (asp_.alg == alg_kind::binary_add
-            && asp_.src0_info.tensor_shape() == asp_.src1_info.tensor_shape()
-            && asp_.src0_info.strides_in_bytes()
-                    != asp_.src1_info.strides_in_bytes()) {
-        return status::unimplemented;
-    }
-
-    // This forces ACL not to parallelise with small workloads, this is
-    // a temporary fix and should be removed in future versions (TODO)
-    memory_desc_wrapper dst_d(dst_md());
-    if (dst_d.nelems() < 40000) {
-        size_t acl_y_axis_i = 1;
-        CHECK(insert_singleton_dimension(asp_.src0_info, acl_y_axis_i));
-        CHECK(insert_singleton_dimension(asp_.src1_info, acl_y_axis_i));
-        CHECK(insert_singleton_dimension(asp_.dst_info, acl_y_axis_i));
-    }
-
-    // Call operator specific validate function to check support
-    ACL_CHECK_VALID(validate(asp_));
-
-    return status::success;
-}
-
-arm_compute::Status acl_binary_t::pd_t::validate(const acl_binary_conf_t &asp) {
-    switch (asp.alg) {
-        case alg_kind::binary_add:
-            return arm_compute::experimental::op::CpuAdd::validate(
-                    &asp.src0_info, &asp.src1_info, &asp.dst_info,
-                    arm_compute::ConvertPolicy::SATURATE);
-        case alg_kind::binary_sub:
-            return arm_compute::experimental::op::CpuSub::validate(
-                    &asp.src0_info, &asp.src1_info, &asp.dst_info,
-                    arm_compute::ConvertPolicy::SATURATE);
-        case alg_kind::binary_div:
-            return arm_compute::experimental::op::CpuElementwiseDivision::
-                    validate(&asp.src0_info, &asp.src1_info, &asp.dst_info);
-        case alg_kind::binary_mul:
-            return arm_compute::experimental::op::CpuMul::validate(
-                    &asp.src0_info, &asp.src1_info, &asp.dst_info, 1.0f,
-                    arm_compute::ConvertPolicy::SATURATE,
-                    arm_compute::RoundingPolicy::TO_ZERO);
-        case alg_kind::binary_min:
-            return arm_compute::experimental::op::CpuElementwiseMin::validate(
-                    &asp.src0_info, &asp.src1_info, &asp.dst_info);
-        case alg_kind::binary_max:
-            return arm_compute::experimental::op::CpuElementwiseMax::validate(
-                    &asp.src0_info, &asp.src1_info, &asp.dst_info);
-        default:
-            return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
-                    "unsupported alg_kind");
-    }
-}
-
-status_t acl_binary_t::init(engine_t *engine) {
-    auto asp = pd()->asp_;
-
-    switch (asp.alg) {
-        case alg_kind::binary_add: {
-            auto add_op
-                    = std::make_unique<arm_compute::experimental::op::CpuAdd>();
-            add_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info,
-                    arm_compute::ConvertPolicy::SATURATE);
-            binary_op_ = std::move(add_op);
-            break;
-        }
-        case alg_kind::binary_sub: {
-            auto sub_op
-                    = std::make_unique<arm_compute::experimental::op::CpuSub>();
-            sub_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info,
-                    arm_compute::ConvertPolicy::SATURATE);
-            binary_op_ = std::move(sub_op);
-            break;
-        }
-        case alg_kind::binary_div: {
-            auto div_op = std::make_unique<
-                    arm_compute::experimental::op::CpuElementwiseDivision>();
-            div_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info);
-            binary_op_ = std::move(div_op);
-            break;
-        }
-        case alg_kind::binary_mul: {
-            auto mul_op
-                    = std::make_unique<arm_compute::experimental::op::CpuMul>();
-            mul_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info,
-                    1.0f, arm_compute::ConvertPolicy::SATURATE,
-                    arm_compute::RoundingPolicy::TO_ZERO);
-            binary_op_ = std::move(mul_op);
-            break;
-        }
-        case alg_kind::binary_min: {
-            auto min_op = std::make_unique<
-                    arm_compute::experimental::op::CpuElementwiseMin>();
-            min_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info);
-            binary_op_ = std::move(min_op);
-            break;
-        }
-        case alg_kind::binary_max: {
-            auto max_op = std::make_unique<
-                    arm_compute::experimental::op::CpuElementwiseMax>();
-            max_op->configure(&asp.src0_info, &asp.src1_info, &asp.dst_info);
-            binary_op_ = std::move(max_op);
-            break;
-        }
-        default: return status::runtime_error;
-    }
-
-    return status::success;
-}
-
 status_t acl_binary_t::execute_forward(const exec_ctx_t &ctx, const void *src0,
         const void *src1, void *dst) const {
 
-    auto asp = pd()->asp_;
+    // Lock here is needed because resource_mapper does not support
+    // concurrent multithreaded access.
+    std::lock_guard<std::mutex> _lock {this->mtx};
 
-    arm_compute::Tensor src0_tensor;
-    arm_compute::Tensor src1_tensor;
-    arm_compute::Tensor dst_tensor;
+    // Retrieve primitive resource and configured Compute Library objects
+    acl_binary_obj_t &acl_obj = ctx.get_resource_mapper()
+                                        ->get<acl_binary_resource_t>(this)
+                                        ->get_acl_obj();
 
-    src0_tensor.allocator()->init(asp.src0_info);
-    src0_tensor.allocator()->import_memory(const_cast<void *>(src0));
-    src1_tensor.allocator()->init(asp.src1_info);
-    src1_tensor.allocator()->import_memory(const_cast<void *>(src1));
-    dst_tensor.allocator()->init(asp.dst_info);
-    dst_tensor.allocator()->import_memory(dst);
+    acl_obj.src0_tensor.allocator()->import_memory(const_cast<void *>(src0));
+    acl_obj.src1_tensor.allocator()->import_memory(const_cast<void *>(src1));
+    acl_obj.dst_tensor.allocator()->import_memory(dst);
 
-    arm_compute::ITensorPack run_pack {
-            {arm_compute::TensorType::ACL_SRC_0, &src0_tensor},
-            {arm_compute::TensorType::ACL_SRC_1, &src1_tensor},
-            {arm_compute::TensorType::ACL_DST, &dst_tensor}};
+    acl_obj.binary_op->run();
 
-    binary_op_->run(run_pack);
+    acl_obj.src0_tensor.allocator()->free();
+    acl_obj.src1_tensor.allocator()->free();
+    acl_obj.dst_tensor.allocator()->free();
 
     return status::success;
 }
@@ -221,14 +55,6 @@ status_t acl_binary_t::execute_forward(const exec_ctx_t &ctx) const {
     return execute_forward(ctx, src0, src1, dst);
 }
 
-status_t acl_binary_t::execute(const exec_ctx_t &ctx) const {
-    return execute_forward(ctx);
-}
-
-const acl_binary_t::pd_t *acl_binary_t::pd() const {
-    return static_cast<const pd_t *>(primitive_t::pd().get());
-}
-
 } // namespace acl
 } // namespace cpu
 } // namespace impl
Original file line number	Diff line number	Diff line change
`@@ -258,6 +258,8 @@ struct acl_batch_normalization_fwd_t : public primitive_t {`
`258`	`258`	`CHECK(r->configure(pd()->abp, pd()));`
`259`	`259`	`mapper.add(this, std::move(r));`
`260`	`260`
	`261`	`+ CHECK(pd()->post_ops.create_resource(engine, mapper));`
	`262`	`+`
`261`	`263`	`return status::success;`
`262`	`264`	`}`
`263`	`265`