From 9b2974c86a2dee12d61615cad92b3f770f4109a0 Mon Sep 17 00:00:00 2001
From: manjam01
Date: Sun, 2 Mar 2025 20:05:50 +0000
Subject: [PATCH] cpu: aarch64: Enable stateless ACL LayerNorm

---
 src/cpu/aarch64/acl_layer_normalization.cpp | 185 +++++++++++++++--
 src/cpu/aarch64/acl_layer_normalization.hpp | 217 ++------------------
 2 files changed, 180 insertions(+), 222 deletions(-)

diff --git a/src/cpu/aarch64/acl_layer_normalization.cpp b/src/cpu/aarch64/acl_layer_normalization.cpp
index 05bcb1766f1..f4c8b57f18c 100644
--- a/src/cpu/aarch64/acl_layer_normalization.cpp
+++ b/src/cpu/aarch64/acl_layer_normalization.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Arm Ltd. and affiliates
+* Copyright 2023, 2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,29 +21,178 @@
 namespace impl {
 namespace cpu {
 namespace aarch64 {
 
-status_t acl_layer_normalization_fwd_t::execute_forward(
-        const exec_ctx_t &ctx) const {
+acl_layer_normalization_fwd_t::acl_layer_normalization_fwd_t(const pd_t *apd)
+    : primitive_t(apd)
+    , acl_obj_(std::make_unique<
+              arm_compute::experimental::op::CpuMeanStdDevNormalization>()) {}
+
+status_t acl_layer_normalization_fwd_t::pd_t::init(engine_t *engine) {
+
+    // dir and flags
+    ACL_CHECK_SUPPORT(!is_fwd(), "ACL lnorm supports forward propagation only");
+    ACL_CHECK_SUPPORT(is_training(), "ACL supports inference only for lnorm");
+    ACL_CHECK_SUPPORT(
+            use_global_stats(), "ACL does not support global stats with lnorm");
+    ACL_CHECK_SUPPORT(use_scale() || use_shift(),
+            "ACL does not support lnorm scale and shift");
+
+    // attr-scales
+    ACL_CHECK_SUPPORT(!attr()->has_default_values(),
+            "ACL does not support scales attribute");
+
+    // tag and stat_tag
+    ACL_CHECK_SUPPORT(src_md()->ndims < 2 || src_md()->ndims > 5,
+            "src tensor must have between 2 and 5 (inclusive) "
+            "dimensions");
+
+    // msdNorm only supports lnorm for src in a channels last format.
+    // So if channels aren't last (ie. if they aren't dense),
+    // then reorder into a channels last format
+    std::string ref_implementation_guess = "simple:any";
+    if (src_md()->format_desc.blocking.strides[ndims() - 1] != 1) {
+        CHECK(memory_desc_init_by_tag(
+                src_md_, get_channels_last_format(src_md_.ndims)));
+        ref_implementation_guess = "ref:any";
+    }
+    if (dst_md_ != src_md_)
+        // Make sure dst and src share a format
+        CHECK(memory_desc_init_by_md_and_dt(
+                dst_md_, src_md_, src_md()->data_type));
+    if (!set_default_stat_md_format(src_md_)) return status::unimplemented;
+
+    const memory_desc_wrapper src_d(src_md_);
+    const memory_desc_wrapper dst_d(dst_md_);
 
-    // Lock here is needed because resource_mapper does not support
-    // concurrent access.
-    std::lock_guard<std::mutex> _lock {this->mtx};
+    ACL_CHECK_SUPPORT(src_d.has_zero_dim() || dst_d.has_zero_dim(),
+            "data tensor(s) must not have a zero dimension");
+
+    // data type
+    ACL_CHECK_SUPPORT(
+            src_d.data_type() != data_type::f32, "ACL Lnorm only supports F32");
+    ACL_CHECK_SUPPORT(dst_d.data_type() != src_d.data_type(),
+            "src and dst must share data types");
+
+    // Problem shape
+    int C = norm_axis(); // Channel dim size
+    int X = src_d.nelems() / C; // Non-channel dims size
+
+    ACL_CHECK_SUPPORT(!use_acl_heuristic(X, C, dnnl_get_max_threads(),
+                              is_training(), ref_implementation_guess),
+            "ACL is unoptimal in this case");
+
+    anp_data_info = arm_compute::TensorInfo(
+            arm_compute::TensorShape(C, X), 1, arm_compute::DataType::F32);
+
+    ACL_CHECK_VALID(
+            arm_compute::experimental::op::CpuMeanStdDevNormalization::validate(
+                    &anp_data_info, &anp_data_info,
+                    desc()->layer_norm_epsilon));
+
+    return status::success;
+}
 
-    // Retrieve primitive resource and configured Compute Library objects
-    auto *acl_resource
-            = ctx.get_resource_mapper()
-                      ->get<acl_layer_normalization_resource_t>(this);
-    acl_msdnorm_obj_t &acl_obj = acl_resource->get_acl_obj();
+format_tag_t acl_layer_normalization_fwd_t::pd_t::get_channels_last_format(
+        size_t ndim) const {
+    assert(ndim > 1 && ndim < 6);
+    switch (ndim) {
+        case 2: return format_tag::nc;
+        case 3: return format_tag::tnc;
+        case 4: return format_tag::ldnc;
+        case 5: return format_tag::abcde;
+        default: return format_tag::undef;
+    }
+}
+
+bool acl_layer_normalization_fwd_t::pd_t::use_acl_heuristic(int X, int C,
+        int threads, bool ref_has_stats,
+        const std::string &ref_implementation_guess) const {
+    // Above a certain C, ACL is always faster, and below a certain C,
+    // ACL is always slower. for C in between these two, whether ACL is
+    // faster can be approximated with the workload (X*C) per thread.
+    // The values here were derived empirically and all depend on
+    // threads, whether ref can use provided stats, and which reference
+    // implementation ACL is competing with.
+
+    int acl_competitive_C = C;
+    int acl_better_C = C;
+    int acl_better_XC_per_thread = X * C;
+
+    if (ref_implementation_guess == "simple:any") {
+        acl_competitive_C = 64;
+        if (ref_has_stats) {
+            acl_better_C = 4096;
+            acl_better_XC_per_thread = threads == 1 ? 4096 : 8192;
+        } else {
+            acl_better_C = threads <= 2 ? 1024 : 4096;
+            acl_better_XC_per_thread = threads == 1 ? 1024 : 4096;
+        }
+    } else if (ref_implementation_guess == "ref:any") {
+        acl_competitive_C = 0;
+        if (ref_has_stats) {
+            if (threads == 1) {
+                acl_better_C = 64;
+            } else if (threads == 2) {
+                acl_better_C = 256;
+            } else {
+                acl_better_C = 1024;
+            }
+
+            if (threads == 1) {
+                acl_better_XC_per_thread = 256;
+            } else if (threads <= 16) {
+                acl_better_XC_per_thread = 512;
+            } else {
+                acl_better_XC_per_thread = 1024;
+            }
+        } else {
+            if (threads == 1) {
+                acl_better_C = 64;
+                acl_better_XC_per_thread = 128;
+            } else if (threads <= 32) {
+                acl_better_C = 256;
+                acl_better_XC_per_thread = 256;
+            } else {
+                acl_better_C = 1024;
+                acl_better_XC_per_thread = 512;
+            }
+        }
+    }
+
+    return C > acl_competitive_C
+            && (C > acl_better_C || X * C > acl_better_XC_per_thread * threads);
+}
+
+const acl_layer_normalization_fwd_t::pd_t *
+acl_layer_normalization_fwd_t::pd() const {
+    return (const pd_t *)primitive_t::pd().get();
+}
+
+status_t acl_layer_normalization_fwd_t::init(engine_t *engine) {
+    auto *anp_data_info
+            = const_cast<arm_compute::TensorInfo *>(&pd()->anp_data_info);
+    acl_obj_->configure(
+            anp_data_info, anp_data_info, pd()->desc()->layer_norm_epsilon);
+    return status::success;
+}
+
+status_t acl_layer_normalization_fwd_t::execute_forward(
+        const exec_ctx_t &ctx) const {
 
-    auto src = CTX_IN_MEM(const float *, DNNL_ARG_SRC);
-    acl_obj.src_tensor.allocator()->import_memory(const_cast<float *>(src));
+    const auto *src = CTX_IN_MEM(const float *, DNNL_ARG_SRC);
+    auto *dst = CTX_OUT_MEM(float *, DNNL_ARG_DST);
 
-    auto dst = CTX_OUT_MEM(float *, DNNL_ARG_DST);
-    acl_obj.dst_tensor.allocator()->import_memory(dst);
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor dst_tensor;
 
-    acl_obj.msdNorm.run();
+    src_tensor.allocator()->init(pd()->anp_data_info);
+    src_tensor.allocator()->import_memory(const_cast<float *>(src));
+    dst_tensor.allocator()->init(pd()->anp_data_info);
+    dst_tensor.allocator()->import_memory(dst);
 
-    acl_obj.src_tensor.allocator()->free();
-    acl_obj.dst_tensor.allocator()->free();
+    arm_compute::ITensorPack act_pack;
+    act_pack.add_tensor(arm_compute::TensorType::ACL_SRC, &src_tensor);
+    act_pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst_tensor);
+    acl_obj_->run(act_pack);
 
     return status::success;
 }
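
For illustration only, and not part of the diff above: the following standalone sketch shows the stateless pattern that init() and execute_forward() now follow. The operator is configured once from TensorInfo descriptors, and each run imports caller-owned buffers into local Tensor wrappers passed through an ITensorPack, so no primitive state is mutated and the old resource_mapper plus mutex are no longer needed. The problem shape (X = 16, C = 128), the epsilon value, and the extra include paths are illustrative assumptions; it presumes an Arm Compute Library build that ships the experimental operator header used by this patch.

// Illustrative sketch, not part of the patch. Assumes the experimental ACL
// operator header above is available; shape and epsilon are arbitrary.
#include <vector>

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuMeanStdDevNormalization.h"

int main() {
    const int C = 128; // channel (normalized) dimension
    const int X = 16; // product of the non-channel dimensions
    const float epsilon = 1e-5f;

    // One 2D descriptor shared by src and dst, as in pd_t::init() above.
    arm_compute::TensorInfo info(
            arm_compute::TensorShape(C, X), 1, arm_compute::DataType::F32);

    arm_compute::experimental::op::CpuMeanStdDevNormalization norm;
    auto status = arm_compute::experimental::op::CpuMeanStdDevNormalization::
            validate(&info, &info, epsilon);
    if (status.error_code() != arm_compute::ErrorCode::OK) return 1;
    norm.configure(&info, &info, epsilon);

    // User-owned memory is imported into lightweight Tensor wrappers per call,
    // so the configured operator stays immutable across executions.
    std::vector<float> src_buf(size_t(X) * C, 1.f), dst_buf(size_t(X) * C);
    arm_compute::Tensor src_tensor, dst_tensor;
    src_tensor.allocator()->init(info);
    src_tensor.allocator()->import_memory(src_buf.data());
    dst_tensor.allocator()->init(info);
    dst_tensor.allocator()->import_memory(dst_buf.data());

    // Bind the per-call buffers and run; this mirrors execute_forward() above.
    arm_compute::ITensorPack pack;
    pack.add_tensor(arm_compute::TensorType::ACL_SRC, &src_tensor);
    pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst_tensor);
    norm.run(pack);

    return 0;
}
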
diff --git a/src/cpu/aarch64/acl_layer_normalization.hpp b/src/cpu/aarch64/acl_layer_normalization.hpp
index 52ef5f99fa8..c3dfaf3b214 100644
--- a/src/cpu/aarch64/acl_layer_normalization.hpp
+++ b/src/cpu/aarch64/acl_layer_normalization.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023-2024 Arm Ltd. and affiliates
+* Copyright 2023-2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 #ifndef CPU_AARCH64_ACL_LAYER_NORMALIZATION_HPP
 #define CPU_AARCH64_ACL_LAYER_NORMALIZATION_HPP
 
+#include "arm_compute/runtime/experimental/operators/CpuMeanStdDevNormalization.h"
+
 #include "cpu/aarch64/acl_utils.hpp"
 #include "cpu/cpu_layer_normalization_pd.hpp"
 
@@ -24,46 +26,6 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace aarch64 {
-
-struct acl_msdnorm_obj_t {
-    arm_compute::NEMeanStdDevNormalizationLayer msdNorm;
-    arm_compute::Tensor src_tensor;
-    arm_compute::Tensor dst_tensor;
-};
-
-struct acl_msdnorm_conf_t {
-    arm_compute::TensorInfo data_info; // src and dst tensors
-};
-
-struct acl_layer_normalization_resource_t : public resource_t {
-    acl_layer_normalization_resource_t()
-        : acl_obj(utils::make_unique<acl_msdnorm_obj_t>()) {}
-
-    status_t configure(
-            const acl_msdnorm_conf_t &anp, const layer_normalization_pd_t *pd) {
-        if (!acl_obj) return status::out_of_memory;
-
-        acl_obj->src_tensor.allocator()->init(anp.data_info);
-        acl_obj->dst_tensor.allocator()->init(anp.data_info);
-
-        // clang-format off
-        acl_obj->msdNorm.configure(
-            &acl_obj->src_tensor,
-            &acl_obj->dst_tensor,
-            pd->desc()->layer_norm_epsilon);
-        // clang-format on
-
-        return status::success;
-    }
-
-    acl_msdnorm_obj_t &get_acl_obj() const { return *acl_obj; }
-
-    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_layer_normalization_resource_t);
-
-private:
-    std::unique_ptr<acl_msdnorm_obj_t> acl_obj;
-}; // acl_layer_normalization_resource_t
-
 struct acl_layer_normalization_fwd_t : public primitive_t {
     struct pd_t : public cpu_layer_normalization_fwd_pd_t {
         using cpu_layer_normalization_fwd_pd_t::
@@ -71,180 +33,27 @@ struct acl_layer_normalization_fwd_t : public primitive_t {
 
         DECLARE_COMMON_PD_T("acl", acl_layer_normalization_fwd_t);
 
-        status_t init(engine_t *engine) {
-
-            // dir and flags
-            ACL_CHECK_SUPPORT(
-                    !is_fwd(), "ACL lnorm supports forward propagation only");
-            ACL_CHECK_SUPPORT(
-                    is_training(), "ACL supports inference only for lnorm");
-            ACL_CHECK_SUPPORT(use_global_stats(),
-                    "ACL does not support global stats with lnorm");
-            ACL_CHECK_SUPPORT(use_scale() || use_shift(),
-                    "ACL does not support lnorm scale and shift");
-
-            // attr-scales
-            ACL_CHECK_SUPPORT(!attr()->has_default_values(),
-                    "ACL does not support scales attribute");
-
-            // tag and stat_tag
-            ACL_CHECK_SUPPORT(src_md()->ndims < 2 || src_md()->ndims > 5,
-                    "src tensor must have between 2 and 5 (inclusive) "
-                    "dimensions");
-
-            // msdNorm only supports lnorm for src in a channels last format.
-            // So if channels aren't last (ie. if they aren't dense),
-            // then reorder into a channels last format
-            std::string ref_implementation_guess = "simple:any";
-            if (src_md()->format_desc.blocking.strides[ndims() - 1] != 1) {
-                CHECK(memory_desc_init_by_tag(
-                        src_md_, get_channels_last_format(src_md_.ndims)));
-                ref_implementation_guess = "ref:any";
-            }
-            if (dst_md_ != src_md_)
-                // Make sure dst and src share a format
-                CHECK(memory_desc_init_by_md_and_dt(
-                        dst_md_, src_md_, src_md()->data_type));
-            if (!set_default_stat_md_format(src_md_))
-                return status::unimplemented;
-
-            const memory_desc_wrapper src_d(src_md_);
-            const memory_desc_wrapper dst_d(dst_md_);
-
-            ACL_CHECK_SUPPORT(src_d.has_zero_dim() || dst_d.has_zero_dim(),
-                    "data tensor(s) must not have a zero dimension");
-
-            // data type
-            ACL_CHECK_SUPPORT(src_d.data_type() != data_type::f32,
-                    "ACL Lnorm only supports F32");
-            ACL_CHECK_SUPPORT(dst_d.data_type() != src_d.data_type(),
-                    "src and dst must share data types");
-
-            // Problem shape
-            int C = norm_axis(); // Channel dim size
-            int X = src_d.nelems() / C; // Non-channel dims size
-
-            ACL_CHECK_SUPPORT(!use_acl_heuristic(X, C, dnnl_get_max_threads(),
-                                      is_training(), ref_implementation_guess),
-                    "ACL is unoptimal in this case");
-
-            anp.data_info
-                    = arm_compute::TensorInfo(arm_compute::TensorShape(C, X), 1,
-                            arm_compute::DataType::F32);
-
-            ACL_CHECK_VALID(
-                    arm_compute::NEMeanStdDevNormalizationLayer::validate(
-                            &anp.data_info, &anp.data_info,
-                            desc()->layer_norm_epsilon));
-
-            return status::success;
-        }
-
-        format_tag_t get_channels_last_format(size_t ndim) {
-            assert(ndim > 1 && ndim < 6);
-            switch (ndim) {
-                case 2: return format_tag::nc;
-                case 3: return format_tag::tnc;
-                case 4: return format_tag::ldnc;
-                case 5: return format_tag::abcde;
-                default: return format_tag::undef;
-            }
-        }
-
+        status_t init(engine_t *engine);
+        format_tag_t get_channels_last_format(size_t ndim) const;
         bool use_acl_heuristic(int X, int C, int threads, bool ref_has_stats,
-                std::string ref_implementation_guess) {
-            // Above a certain C, acl is always better, and below a certain C,
-            // acl is always worse. for C in between these two, whether acl is
-            // better can be approximated with the workload (X*C) per thread.
-            // The values here were derived empirically and all depend on
-            // threads, whether ref can use provided stats, and which reference
-            // implementation acl is competing with.
-
-            int acl_competitive_C = C;
-            int acl_better_C = C;
-            int acl_better_XC_per_thread = X * C;
-
-            if (ref_implementation_guess == "simple:any") {
-                acl_competitive_C = 64;
-                if (ref_has_stats) {
-                    acl_better_C = 4096;
-                    acl_better_XC_per_thread = threads == 1 ? 4096 : 8192;
-                } else {
-                    acl_better_C = threads <= 2 ? 1024 : 4096;
-                    acl_better_XC_per_thread = threads == 1 ? 1024 : 4096;
-                }
-            } else if (ref_implementation_guess == "ref:any") {
-                acl_competitive_C = 0;
-                if (ref_has_stats) {
-                    if (threads == 1) {
-                        acl_better_C = 64;
-                    } else if (threads == 2) {
-                        acl_better_C = 256;
-                    } else {
-                        acl_better_C = 1024;
-                    }
-
-                    if (threads == 1) {
-                        acl_better_XC_per_thread = 256;
-                    } else if (threads <= 16) {
-                        acl_better_XC_per_thread = 512;
-                    } else {
-                        acl_better_XC_per_thread = 1024;
-                    }
-                } else {
-                    if (threads == 1) {
-                        acl_better_C = 64;
-                    } else if (threads <= 32) {
-                        acl_better_C = 256;
-                    } else {
-                        acl_better_C = 1024;
-                    }
-
-                    if (threads == 1) {
-                        acl_better_XC_per_thread = 128;
-                    } else if (threads <= 32) {
-                        acl_better_XC_per_thread = 256;
-                    } else {
-                        acl_better_XC_per_thread = 512;
-                    }
-                }
-            }
-
-            return C > acl_competitive_C
-                    && (C > acl_better_C
-                            || X * C > acl_better_XC_per_thread * threads);
-        }
-
-        acl_msdnorm_conf_t anp = utils::zero<decltype(anp)>();
+                const std::string &ref_implementation_guess) const;
+        arm_compute::TensorInfo anp_data_info;
     }; // pd_t
 
-    acl_layer_normalization_fwd_t(const pd_t *apd) : primitive_t(apd) {}
-
-    status_t create_resource(
-            engine_t *engine, resource_mapper_t &mapper) const override {
-        if (mapper.has_resource(this)) return status::success;
-
-        auto r = utils::make_unique<acl_layer_normalization_resource_t>();
-        if (!r) return status::out_of_memory;
-
-        // Configure the resource based on information from primitive descriptor
-        CHECK(r->configure(pd()->anp, pd()));
-        mapper.add(this, std::move(r));
-
-        return status::success;
-    }
+    acl_layer_normalization_fwd_t(const pd_t *apd);
 
     status_t execute(const exec_ctx_t &ctx) const override {
         return execute_forward(ctx);
     }
+    status_t init(engine_t *engine) override;
 
 private:
-    // To guard the const execute_forward, the mutex must be 'mutable'
-    mutable std::mutex mtx;
     status_t execute_forward(const exec_ctx_t &ctx) const;
-    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-}; // acl_layer_normalization_fwd_t
+    const pd_t *pd() const;
+    std::unique_ptr<arm_compute::experimental::op::CpuMeanStdDevNormalization>
+            acl_obj_;
+};
 
 } // namespace aarch64
 } // namespace cpu
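
For reference, and also not part of the patch, the sketch below drives this path through the public oneDNN C++ API: forward-inference layer normalization, f32, no scale/shift, no user-provided stats, and a channels-last (tnc) layout, which is what pd_t::init() accepts. The shape is chosen so that C = 8192 exceeds the 4096 threshold in use_acl_heuristic() for the "simple:any", no-stats case, so the heuristic should favour ACL at any thread count. Whether this implementation is actually dispatched still depends on building oneDNN with ACL enabled (DNNL_AARCH64_USE_ACL=ON) and on the other implementations registered for the platform; the dimensions and epsilon here are illustrative.

// Illustrative sketch using the public oneDNN C++ API; not part of the patch.
#include <vector>

#include "oneapi/dnnl/dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    // T x N x C problem; C is the normalized (channel) dimension, X = T * N.
    const memory::dim T = 4, N = 8, C = 8192;
    memory::desc md({T, N, C}, memory::data_type::f32, memory::format_tag::tnc);

    // Plain forward-inference lnorm: no scale/shift flags, default attributes,
    // which matches the checks in pd_t::init() above.
    layer_normalization_forward::primitive_desc pd(eng,
            prop_kind::forward_inference, md, md, 1e-5f,
            normalization_flags::none);
    layer_normalization_forward lnorm(pd);

    std::vector<float> src_data(T * N * C, 1.0f), dst_data(T * N * C);
    memory src_mem(md, eng, src_data.data());
    memory dst_mem(md, eng, dst_data.data());

    lnorm.execute(strm, {{DNNL_ARG_SRC, src_mem}, {DNNL_ARG_DST, dst_mem}});
    strm.wait();
    return 0;
}
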