From 9b2974c86a2dee12d61615cad92b3f770f4109a0 Mon Sep 17 00:00:00 2001
From: manjam01
Date: Sun, 2 Mar 2025 20:05:50 +0000
Subject: [PATCH] cpu: aarch64: Enable stateless ACL LayerNorm

---
 src/cpu/aarch64/acl_layer_normalization.cpp | 185 +++++++++++++++--
 src/cpu/aarch64/acl_layer_normalization.hpp | 217 ++------------------
 2 files changed, 180 insertions(+), 222 deletions(-)

diff --git a/src/cpu/aarch64/acl_layer_normalization.cpp b/src/cpu/aarch64/acl_layer_normalization.cpp
index 05bcb1766f1..f4c8b57f18c 100644
--- a/src/cpu/aarch64/acl_layer_normalization.cpp
+++ b/src/cpu/aarch64/acl_layer_normalization.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Arm Ltd. and affiliates
+* Copyright 2023, 2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,29 +21,178 @@
 namespace impl {
 namespace cpu {
 namespace aarch64 {
 
-status_t acl_layer_normalization_fwd_t::execute_forward(
-        const exec_ctx_t &ctx) const {
+acl_layer_normalization_fwd_t::acl_layer_normalization_fwd_t(const pd_t *apd)
+    : primitive_t(apd)
+    , acl_obj_(std::make_unique<
+              arm_compute::experimental::op::CpuMeanStdDevNormalization>()) {}
+
+status_t acl_layer_normalization_fwd_t::pd_t::init(engine_t *engine) {
+
+    // dir and flags
+    ACL_CHECK_SUPPORT(!is_fwd(), "ACL lnorm supports forward propagation only");
+    ACL_CHECK_SUPPORT(is_training(), "ACL supports inference only for lnorm");
+    ACL_CHECK_SUPPORT(
+            use_global_stats(), "ACL does not support global stats with lnorm");
+    ACL_CHECK_SUPPORT(use_scale() || use_shift(),
+            "ACL does not support lnorm scale and shift");
+
+    // attr-scales
+    ACL_CHECK_SUPPORT(!attr()->has_default_values(),
+            "ACL does not support scales attribute");
+
+    // tag and stat_tag
+    ACL_CHECK_SUPPORT(src_md()->ndims < 2 || src_md()->ndims > 5,
+            "src tensor must have between 2 and 5 (inclusive) "
+            "dimensions");
+
+    // msdNorm only supports lnorm for src in a channels last format.
+    // So if channels aren't last (ie. if they aren't dense),
+    // then reorder into a channels last format
+    std::string ref_implementation_guess = "simple:any";
+    if (src_md()->format_desc.blocking.strides[ndims() - 1] != 1) {
+        CHECK(memory_desc_init_by_tag(
+                src_md_, get_channels_last_format(src_md_.ndims)));
+        ref_implementation_guess = "ref:any";
+    }
+    if (dst_md_ != src_md_)
+        // Make sure dst and src share a format
+        CHECK(memory_desc_init_by_md_and_dt(
+                dst_md_, src_md_, src_md()->data_type));
+    if (!set_default_stat_md_format(src_md_)) return status::unimplemented;
+
+    const memory_desc_wrapper src_d(src_md_);
+    const memory_desc_wrapper dst_d(dst_md_);
 
-    // Lock here is needed because resource_mapper does not support
-    // concurrent access.
-    std::lock_guard<std::mutex> _lock {this->mtx};
+    ACL_CHECK_SUPPORT(src_d.has_zero_dim() || dst_d.has_zero_dim(),
+            "data tensor(s) must not have a zero dimension");
+
+    // data type
+    ACL_CHECK_SUPPORT(
+            src_d.data_type() != data_type::f32, "ACL Lnorm only supports F32");
+    ACL_CHECK_SUPPORT(dst_d.data_type() != src_d.data_type(),
+            "src and dst must share data types");
+
+    // Problem shape
+    int C = norm_axis(); // Channel dim size
+    int X = src_d.nelems() / C; // Non-channel dims size
+
+    ACL_CHECK_SUPPORT(!use_acl_heuristic(X, C, dnnl_get_max_threads(),
+                              is_training(), ref_implementation_guess),
+            "ACL is unoptimal in this case");
+
+    anp_data_info = arm_compute::TensorInfo(
+            arm_compute::TensorShape(C, X), 1, arm_compute::DataType::F32);
+
+    ACL_CHECK_VALID(
+            arm_compute::experimental::op::CpuMeanStdDevNormalization::validate(
+                    &anp_data_info, &anp_data_info,
+                    desc()->layer_norm_epsilon));
+
+    return status::success;
+}
 
-    // Retrieve primitive resource and configured Compute Library objects
-    auto *acl_resource
-            = ctx.get_resource_mapper()
-                      ->get<acl_layer_normalization_resource_t>(this);
-    acl_msdnorm_obj_t &acl_obj = acl_resource->get_acl_obj();
+format_tag_t acl_layer_normalization_fwd_t::pd_t::get_channels_last_format(
+        size_t ndim) const {
+    assert(ndim > 1 && ndim < 6);
+    switch (ndim) {
+        case 2: return format_tag::nc;
+        case 3: return format_tag::tnc;
+        case 4: return format_tag::ldnc;
+        case 5: return format_tag::abcde;
+        default: return format_tag::undef;
+    }
+}
+
+bool acl_layer_normalization_fwd_t::pd_t::use_acl_heuristic(int X, int C,
+        int threads, bool ref_has_stats,
+        const std::string &ref_implementation_guess) const {
+    // Above a certain C, ACL is always faster, and below a certain C,
+    // ACL is always slower. for C in between these two, whether ACL is
+    // faster can be approximated with the workload (X*C) per thread.
+    // The values here were derived empirically and all depend on
+    // threads, whether ref can use provided stats, and which reference
+    // implementation ACL is competing with.
+
+    int acl_competitive_C = C;
+    int acl_better_C = C;
+    int acl_better_XC_per_thread = X * C;
+
+    if (ref_implementation_guess == "simple:any") {
+        acl_competitive_C = 64;
+        if (ref_has_stats) {
+            acl_better_C = 4096;
+            acl_better_XC_per_thread = threads == 1 ? 4096 : 8192;
+        } else {
+            acl_better_C = threads <= 2 ? 1024 : 4096;
+            acl_better_XC_per_thread = threads == 1 ? 1024 : 4096;
+        }
+    } else if (ref_implementation_guess == "ref:any") {
+        acl_competitive_C = 0;
+        if (ref_has_stats) {
+            if (threads == 1) {
+                acl_better_C = 64;
+            } else if (threads == 2) {
+                acl_better_C = 256;
+            } else {
+                acl_better_C = 1024;
+            }
+
+            if (threads == 1) {
+                acl_better_XC_per_thread = 256;
+            } else if (threads <= 16) {
+                acl_better_XC_per_thread = 512;
+            } else {
+                acl_better_XC_per_thread = 1024;
+            }
+        } else {
+            if (threads == 1) {
+                acl_better_C = 64;
+                acl_better_XC_per_thread = 128;
+            } else if (threads <= 32) {
+                acl_better_C = 256;
+                acl_better_XC_per_thread = 256;
+            } else {
+                acl_better_C = 1024;
+                acl_better_XC_per_thread = 512;
+            }
+        }
+    }
+
+    return C > acl_competitive_C
+            && (C > acl_better_C || X * C > acl_better_XC_per_thread * threads);
+}
+
+const acl_layer_normalization_fwd_t::pd_t *
+acl_layer_normalization_fwd_t::pd() const {
+    return (const pd_t *)primitive_t::pd().get();
+}
+
+status_t acl_layer_normalization_fwd_t::init(engine_t *engine) {
+    auto *anp_data_info
+            = const_cast<arm_compute::TensorInfo *>(&pd()->anp_data_info);
+    acl_obj_->configure(
+            anp_data_info, anp_data_info, pd()->desc()->layer_norm_epsilon);
+    return status::success;
+}
+
+status_t acl_layer_normalization_fwd_t::execute_forward(
+        const exec_ctx_t &ctx) const {
 
-    auto src = CTX_IN_MEM(const float *, DNNL_ARG_SRC);
-    acl_obj.src_tensor.allocator()->import_memory(const_cast<float *>(src));
+    const auto *src = CTX_IN_MEM(const float *, DNNL_ARG_SRC);
+    auto *dst = CTX_OUT_MEM(float *, DNNL_ARG_DST);
 
-    auto dst = CTX_OUT_MEM(float *, DNNL_ARG_DST);
-    acl_obj.dst_tensor.allocator()->import_memory(dst);
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor dst_tensor;
 
-    acl_obj.msdNorm.run();
+    src_tensor.allocator()->init(pd()->anp_data_info);
+    src_tensor.allocator()->import_memory(const_cast<float *>(src));
+    dst_tensor.allocator()->init(pd()->anp_data_info);
+    dst_tensor.allocator()->import_memory(dst);
 
-    acl_obj.src_tensor.allocator()->free();
-    acl_obj.dst_tensor.allocator()->free();
+    arm_compute::ITensorPack act_pack;
+    act_pack.add_tensor(arm_compute::TensorType::ACL_SRC, &src_tensor);
+    act_pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst_tensor);
+    acl_obj_->run(act_pack);
 
     return status::success;
 }
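
For illustration only, and not part of the diff above: the following standalone sketch shows the stateless pattern that init() and execute_forward() now follow. The operator is configured once from TensorInfo descriptors, and each run imports caller-owned buffers into local Tensor wrappers passed through an ITensorPack, so no primitive state is mutated and the old resource_mapper plus mutex are no longer needed. The problem shape (X = 16, C = 128), the epsilon value, and the extra include paths are illustrative assumptions; it presumes an Arm Compute Library build that ships the experimental operator header used by this patch.

// Illustrative sketch, not part of the patch. Assumes the experimental ACL
// operator header above is available; shape and epsilon are arbitrary.
#include <vector>

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuMeanStdDevNormalization.h"

int main() {
    const int C = 128; // channel (normalized) dimension
    const int X = 16; // product of the non-channel dimensions
    const float epsilon = 1e-5f;

    // One 2D descriptor shared by src and dst, as in pd_t::init() above.
    arm_compute::TensorInfo info(
            arm_compute::TensorShape(C, X), 1, arm_compute::DataType::F32);

    arm_compute::experimental::op::CpuMeanStdDevNormalization norm;
    auto status = arm_compute::experimental::op::CpuMeanStdDevNormalization::
            validate(&info, &info, epsilon);
    if (status.error_code() != arm_compute::ErrorCode::OK) return 1;
    norm.configure(&info, &info, epsilon);

    // User-owned memory is imported into lightweight Tensor wrappers per call,
    // so the configured operator stays immutable across executions.
    std::vector<float> src_buf(size_t(X) * C, 1.f), dst_buf(size_t(X) * C);
    arm_compute::Tensor src_tensor, dst_tensor;
    src_tensor.allocator()->init(info);
    src_tensor.allocator()->import_memory(src_buf.data());
    dst_tensor.allocator()->init(info);
    dst_tensor.allocator()->import_memory(dst_buf.data());

    // Bind the per-call buffers and run; this mirrors execute_forward() above.
    arm_compute::ITensorPack pack;
    pack.add_tensor(arm_compute::TensorType::ACL_SRC, &src_tensor);
    pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst_tensor);
    norm.run(pack);

    return 0;
}
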
diff --git a/src/cpu/aarch64/acl_layer_normalization.hpp b/src/cpu/aarch64/acl_layer_normalization.hpp
index 52ef5f99fa8..c3dfaf3b214 100644
--- a/src/cpu/aarch64/acl_layer_normalization.hpp
+++ b/src/cpu/aarch64/acl_layer_normalization.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023-2024 Arm Ltd. and affiliates
+* Copyright 2023-2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 #ifndef CPU_AARCH64_ACL_LAYER_NORMALIZATION_HPP
 #define CPU_AARCH64_ACL_LAYER_NORMALIZATION_HPP
 
+#include "arm_compute/runtime/experimental/operators/CpuMeanStdDevNormalization.h"
+
 #include "cpu/aarch64/acl_utils.hpp"
 #include "cpu/cpu_layer_normalization_pd.hpp"
 
@@ -24,46 +26,6 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace aarch64 {
-
-struct acl_msdnorm_obj_t {
-    arm_compute::NEMeanStdDevNormalizationLayer msdNorm;
-    arm_compute::Tensor src_tensor;
-    arm_compute::Tensor dst_tensor;
-};
-
-struct acl_msdnorm_conf_t {
-    arm_compute::TensorInfo data_info; // src and dst tensors
-};
-
-struct acl_layer_normalization_resource_t : public resource_t {
-    acl_layer_normalization_resource_t()
-        : acl_obj(utils::make_unique<acl_msdnorm_obj_t>()) {}
-
-    status_t configure(
-            const acl_msdnorm_conf_t &anp, const layer_normalization_pd_t *pd) {
-        if (!acl_obj) return status::out_of_memory;
-
-        acl_obj->src_tensor.allocator()->init(anp.data_info);
-        acl_obj->dst_tensor.allocator()->init(anp.data_info);
-
-        // clang-format off
-        acl_obj->msdNorm.configure(
-            &acl_obj->src_tensor,
-            &acl_obj->dst_tensor,
-            pd->desc()->layer_norm_epsilon);
-        // clang-format on
-
-        return status::success;
-    }
-
-    acl_msdnorm_obj_t &get_acl_obj() const { return *acl_obj; }
-
-    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_layer_normalization_resource_t);
-
-private:
-    std::unique_ptr<acl_msdnorm_obj_t> acl_obj;
-}; // acl_layer_normalization_resource_t
-
 struct acl_layer_normalization_fwd_t : public primitive_t {
     struct pd_t : public cpu_layer_normalization_fwd_pd_t {
         using cpu_layer_normalization_fwd_pd_t::
@@ -71,180 +33,27 @@ struct acl_layer_normalization_fwd_t : public primitive_t {
 
         DECLARE_COMMON_PD_T("acl", acl_layer_normalization_fwd_t);
 
-        status_t init(engine_t *engine) {
-
-            // dir and flags
-            ACL_CHECK_SUPPORT(
-                    !is_fwd(), "ACL lnorm supports forward propagation only");
-            ACL_CHECK_SUPPORT(
-                    is_training(), "ACL supports inference only for lnorm");
-            ACL_CHECK_SUPPORT(use_global_stats(),
-                    "ACL does not support global stats with lnorm");
-            ACL_CHECK_SUPPORT(use_scale() || use_shift(),
-                    "ACL does not support lnorm scale and shift");
-
-            // attr-scales
-            ACL_CHECK_SUPPORT(!attr()->has_default_values(),
-                    "ACL does not support scales attribute");
-
-            // tag and stat_tag
-            ACL_CHECK_SUPPORT(src_md()->ndims < 2 || src_md()->ndims > 5,
-                    "src tensor must have between 2 and 5 (inclusive) "
-                    "dimensions");
-
-            // msdNorm only supports lnorm for src in a channels last format.
-            // So if channels aren't last (ie. if they aren't dense),
-            // then reorder into a channels last format
-            std::string ref_implementation_guess = "simple:any";
-            if (src_md()->format_desc.blocking.strides[ndims() - 1] != 1) {
-                CHECK(memory_desc_init_by_tag(
-                        src_md_, get_channels_last_format(src_md_.ndims)));
-                ref_implementation_guess = "ref:any";
-            }
-            if (dst_md_ != src_md_)
-                // Make sure dst and src share a format
-                CHECK(memory_desc_init_by_md_and_dt(
-                        dst_md_, src_md_, src_md()->data_type));
-            if (!set_default_stat_md_format(src_md_))
-                return status::unimplemented;
-
-            const memory_desc_wrapper src_d(src_md_);
-            const memory_desc_wrapper dst_d(dst_md_);
-
-            ACL_CHECK_SUPPORT(src_d.has_zero_dim() || dst_d.has_zero_dim(),
-                    "data tensor(s) must not have a zero dimension");
-
-            // data type
-            ACL_CHECK_SUPPORT(src_d.data_type() != data_type::f32,
-                    "ACL Lnorm only supports F32");
-            ACL_CHECK_SUPPORT(dst_d.data_type() != src_d.data_type(),
-                    "src and dst must share data types");
-
-            // Problem shape
-            int C = norm_axis(); // Channel dim size
-            int X = src_d.nelems() / C; // Non-channel dims size
-
-            ACL_CHECK_SUPPORT(!use_acl_heuristic(X, C, dnnl_get_max_threads(),
-                                      is_training(), ref_implementation_guess),
-                    "ACL is unoptimal in this case");
-
-            anp.data_info
-                    = arm_compute::TensorInfo(arm_compute::TensorShape(C, X), 1,
-                            arm_compute::DataType::F32);
-
-            ACL_CHECK_VALID(
-                    arm_compute::NEMeanStdDevNormalizationLayer::validate(
-                            &anp.data_info, &anp.data_info,
-                            desc()->layer_norm_epsilon));
-
-            return status::success;
-        }
-
-        format_tag_t get_channels_last_format(size_t ndim) {
-            assert(ndim > 1 && ndim < 6);
-            switch (ndim) {
-                case 2: return format_tag::nc;
-                case 3: return format_tag::tnc;
-                case 4: return format_tag::ldnc;
-                case 5: return format_tag::abcde;
-                default: return format_tag::undef;
-            }
-        }
-
+        status_t init(engine_t *engine);
+        format_tag_t get_channels_last_format(size_t ndim) const;
         bool use_acl_heuristic(int X, int C, int threads, bool ref_has_stats,
-                std::string ref_implementation_guess) {
-            // Above a certain C, acl is always better, and below a certain C,
-            // acl is always worse. for C in between these two, whether acl is
-            // better can be approximated with the workload (X*C) per thread.
-            // The values here were derived empirically and all depend on
-            // threads, whether ref can use provided stats, and which reference
-            // implementation acl is competing with.
-
-            int acl_competitive_C = C;
-            int acl_better_C = C;
-            int acl_better_XC_per_thread = X * C;
-
-            if (ref_implementation_guess == "simple:any") {
-                acl_competitive_C = 64;
-                if (ref_has_stats) {
-                    acl_better_C = 4096;
-                    acl_better_XC_per_thread = threads == 1 ? 4096 : 8192;
-                } else {
-                    acl_better_C = threads <= 2 ? 1024 : 4096;
-                    acl_better_XC_per_thread = threads == 1 ? 1024 : 4096;
-                }
-            } else if (ref_implementation_guess == "ref:any") {
-                acl_competitive_C = 0;
-                if (ref_has_stats) {
-                    if (threads == 1) {
-                        acl_better_C = 64;
-                    } else if (threads == 2) {
-                        acl_better_C = 256;
-                    } else {
-                        acl_better_C = 1024;
-                    }
-
-                    if (threads == 1) {
-                        acl_better_XC_per_thread = 256;
-                    } else if (threads <= 16) {
-                        acl_better_XC_per_thread = 512;
-                    } else {
-                        acl_better_XC_per_thread = 1024;
-                    }
-                } else {
-                    if (threads == 1) {
-                        acl_better_C = 64;
-                    } else if (threads <= 32) {
-                        acl_better_C = 256;
-                    } else {
-                        acl_better_C = 1024;
-                    }
-
-                    if (threads == 1) {
-                        acl_better_XC_per_thread = 128;
-                    } else if (threads <= 32) {
-                        acl_better_XC_per_thread = 256;
-                    } else {
-                        acl_better_XC_per_thread = 512;
-                    }
-                }
-            }
-
-            return C > acl_competitive_C
-                    && (C > acl_better_C
-                            || X * C > acl_better_XC_per_thread * threads);
-        }
-
-        acl_msdnorm_conf_t anp = utils::zero<decltype(anp)>();
+                const std::string &ref_implementation_guess) const;
+        arm_compute::TensorInfo anp_data_info;
     }; // pd_t
 
-    acl_layer_normalization_fwd_t(const pd_t *apd) : primitive_t(apd) {}
-
-    status_t create_resource(
-            engine_t *engine, resource_mapper_t &mapper) const override {
-        if (mapper.has_resource(this)) return status::success;
-
-        auto r = utils::make_unique<acl_layer_normalization_resource_t>();
-        if (!r) return status::out_of_memory;
-
-        // Configure the resource based on information from primitive descriptor
-        CHECK(r->configure(pd()->anp, pd()));
-        mapper.add(this, std::move(r));
-
-        return status::success;
-    }
+    acl_layer_normalization_fwd_t(const pd_t *apd);
 
     status_t execute(const exec_ctx_t &ctx) const override {
         return execute_forward(ctx);
     }
+    status_t init(engine_t *engine) override;
 
 private:
-    // To guard the const execute_forward, the mutex must be 'mutable'
-    mutable std::mutex mtx;
     status_t execute_forward(const exec_ctx_t &ctx) const;
-    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-}; // acl_layer_normalization_fwd_t
+    const pd_t *pd() const;
+    std::unique_ptr<arm_compute::experimental::op::CpuMeanStdDevNormalization>
+            acl_obj_;
+};
 
 } // namespace aarch64
 } // namespace cpu
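
For reference, and also not part of the patch, the sketch below drives this path through the public oneDNN C++ API: forward-inference layer normalization, f32, no scale/shift, no user-provided stats, and a channels-last (tnc) layout, which is what pd_t::init() accepts. The shape is chosen so that C = 8192 exceeds the 4096 threshold in use_acl_heuristic() for the "simple:any", no-stats case, so the heuristic should favour ACL at any thread count. Whether this implementation is actually dispatched still depends on building oneDNN with ACL enabled (DNNL_AARCH64_USE_ACL=ON) and on the other implementations registered for the platform; the dimensions and epsilon here are illustrative.

// Illustrative sketch using the public oneDNN C++ API; not part of the patch.
#include <vector>

#include "oneapi/dnnl/dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    // T x N x C problem; C is the normalized (channel) dimension, X = T * N.
    const memory::dim T = 4, N = 8, C = 8192;
    memory::desc md({T, N, C}, memory::data_type::f32, memory::format_tag::tnc);

    // Plain forward-inference lnorm: no scale/shift flags, default attributes,
    // which matches the checks in pd_t::init() above.
    layer_normalization_forward::primitive_desc pd(eng,
            prop_kind::forward_inference, md, md, 1e-5f,
            normalization_flags::none);
    layer_normalization_forward lnorm(pd);

    std::vector<float> src_data(T * N * C, 1.0f), dst_data(T * N * C);
    memory src_mem(md, eng, src_data.data());
    memory dst_mem(md, eng, dst_data.data());

    lnorm.execute(strm, {{DNNL_ARG_SRC, src_mem}, {DNNL_ARG_DST, dst_mem}});
    strm.wait();
    return 0;
}
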