Commit a61ebae

cpu: aarch64: make softmax ops use stateless ACL interface
Signed-off-by: Ye Tao <ye.tao@arm.com>
1 parent 4392e04 commit a61ebae
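
In short: instead of building a stateful arm_compute::NESoftmaxLayer / NELogSoftmaxLayer bound to member tensors and fetched through the resource mapper on every execution, the primitive now configures a single arm_compute::experimental::op::CpuSoftmax and passes each call's I/O through an arm_compute::ITensorPack. Below is a minimal sketch of that stateless pattern, using only the ACL calls that appear in the diff; the function name, shape, data type, and beta value are illustrative assumptions, not part of the commit.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuSoftmax.h"

// Sketch: configure the operator once, then feed caller-owned buffers
// through a tensor pack on every call. Shape and beta are made up.
void stateless_softmax_sketch(float *src_data, float *dst_data) {
    using namespace arm_compute;

    // Illustrative shape: softmax over a 1000-element axis for 16 rows.
    TensorInfo info(TensorShape(1000U, 16U), 1, DataType::F32);

    experimental::op::CpuSoftmax softmax;
    // Same configure() signature as acl_softmax_fwd_t::init below.
    softmax.configure(&info, &info, /*beta=*/1.0f, /*axis=*/0,
            /*is_logsoftmax=*/false);

    // Per call: wrap the external memory; no copy, no state stored in the op.
    Tensor src, dst;
    src.allocator()->init(info);
    src.allocator()->import_memory(src_data);
    dst.allocator()->init(info);
    dst.allocator()->import_memory(dst_data);

    ITensorPack pack {{TensorType::ACL_SRC_0, &src},
            {TensorType::ACL_DST, &dst}};
    softmax.run(pack);
}

Because the operator keeps no pointers to user memory, the Tensor wrappers can live on the caller's stack, which is what lets the diff below drop both the resource mapper and the mutex that guarded it.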

File tree

src/cpu/aarch64/acl_softmax.cpp
src/cpu/aarch64/acl_softmax.hpp

2 files changed: +144 -193 lines changed

src/cpu/aarch64/acl_softmax.cpp (+134 -14)

@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2022 Arm Ltd. and affiliates
+* Copyright 2021-2022, 2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,27 +21,147 @@ namespace impl {
 namespace cpu {
 namespace aarch64 {

-status_t acl_softmax_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
+const acl_softmax_fwd_t::pd_t *acl_softmax_fwd_t::pd() const {
+    return static_cast<const pd_t *>(primitive_t::pd().get());
+}
+
+status_t acl_softmax_fwd_t::pd_t::init(engine_t *engine) {
+
+    bool ok = is_fwd()
+            && set_default_formats() == status::success
+            // ACL only supports matching src/dst (this must come after
+            // set_default_formats() to handle format_kind::any)
+            && *src_md() == *dst_md()
+            && utils::one_of(
+                    src_md()->data_type, data_type::f32, data_type::f16)
+            && attr()->has_default_values();
+    if (!ok) return status::unimplemented;
+
+    // Get memory desc to find sizes and dims
+    const memory_desc_wrapper src_d(src_md());
+    const data_type_t data_type = src_d.data_type();
+
+    // ACL only supports plain tensors, can be permuted but not blocked
+    if (!src_d.is_plain()) return status::unimplemented;
+
+    // Guards against a 0-sized dimension
+    if (src_d.has_zero_dim()) return status::unimplemented;
+
+    // No scaling
+    asp_.beta = 1;
+
+    asp_.is_logsoftmax = is_logsoftmax();
+
+    // The strides give us the in memory inner size
+    dim_t inner_size_ = src_d.blocking_desc().strides[axis()];
+
+    dim_t axis_size_ = axis_size();
+
+    // The outer size is any left-over dimensions not inner or on the axis
+    dim_t outer_size_ = src_d.nelems() / (inner_size_ * axis_size_);
+
+    // In this context, NHWC tells ACL that the logical and physical
+    // dimensions are the same
+    arm_compute::DataLayout acl_layout = arm_compute::DataLayout::NHWC;
+
+    const arm_compute::DataType acl_data_t
+            = acl_utils::get_acl_data_t(data_type);
+
+    const int threads = dnnl_get_max_threads();
+    if (inner_size_ == 1) {
+        // A rough empirical heuristic created by fitting a polynomial
+        // of the tensor sizes and thread count to the run time of the
+        // ref and ACL softmax. This variable is greater than zero when
+        // ref is faster, and less than zero when ACL is faster. We can
+        // interpret the constant term as the constant overhead
+        // associated with calling the external library and the negative
+        // coefficient on total_size as ACL being faster at processing
+        // each element
+        double acl_ref_performance_diff = 1 + 0.005 * outer_size_
+                - 0.0027 * axis_size_
+                        * std::ceil(double(outer_size_) / threads);
+        if (threads > 1 || outer_size_ > 1) {
+            // Using threads within ACL adds another constant overhead
+            acl_ref_performance_diff += 17;
+        }
+        if (acl_ref_performance_diff > 0) return status::unimplemented;
+
+        // If the inner size is 1, we can get rid of the dimension.
+        // This stops ACL doing a unnecessary permute
+        arm_compute::TensorShape acl_tensor_shape
+                = arm_compute::TensorShape(axis_size_, outer_size_);
+        asp_.axis = 0;

-    // Lock here is needed because resource_mapper does not support
-    // concurrent multithreaded access.
-    std::lock_guard<std::mutex> _lock {this->mtx};
+        asp_.src_info = arm_compute::TensorInfo(
+                acl_tensor_shape, 1, acl_data_t, acl_layout);
+        asp_.dst_info = arm_compute::TensorInfo(
+                acl_tensor_shape, 1, acl_data_t, acl_layout);
+    } else {
+        // A rough empirical heuristic, see comment above
+        // The only difference here is that ACL does a reorder, and so
+        // is considerably better
+        double acl_ref_performance_diff = 1 + 0.005 * outer_size_
+                - 0.01 * inner_size_ * axis_size_
+                        * std::ceil(double(outer_size_) / threads);
+        if (threads > 1 || outer_size_ > 1) {
+            // Using threads within ACL adds another constant overhead
+            acl_ref_performance_diff += 17;
+        }

+        if (acl_ref_performance_diff > 0) return status::unimplemented;
+
+        // Irrespective of the input dimensions, we construct a tensor
+        // with dimensions such that softmax can be applied over the
+        // middle axis (1), with the correct stride and vector length.
+        arm_compute::TensorShape acl_tensor_shape = arm_compute::TensorShape(
+                inner_size_, axis_size_, outer_size_);
+        asp_.axis = 1;
+
+        asp_.src_info = arm_compute::TensorInfo(
+                acl_tensor_shape, 1, acl_data_t, acl_layout);
+        asp_.dst_info = arm_compute::TensorInfo(
+                acl_tensor_shape, 1, acl_data_t, acl_layout);
+    }
+
+    // Validate manually to check for return status
+    ACL_CHECK_VALID(arm_compute::experimental::op::CpuSoftmax::validate(
+            &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis));
+
+    return status::success;
+}
+
+status_t acl_softmax_fwd_t::init(engine_t *engine) {
+    auto asp = pd()->asp_;
+
+    auto op = std::make_unique<arm_compute::experimental::op::CpuSoftmax>();
+
+    softmax_op_ = std::move(op);
+    // Configure softmax operation, mem allocation happens.
+    softmax_op_->configure(&asp.src_info, &asp.dst_info, asp.beta, asp.axis,
+            asp.is_logsoftmax);
+
+    return status::success;
+}
+
+status_t acl_softmax_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
     auto src = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
     auto dst = CTX_OUT_MEM(void *, DNNL_ARG_DST);

-    // Retrieve primitive resource and configured Compute Library objects
-    auto *acl_resource
-            = ctx.get_resource_mapper()->get<acl_softmax_resource_t>(this);
-    acl_softmax_obj_t &acl_obj = acl_resource->get_acl_obj();
+    auto asp = pd()->asp_;
+
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor dst_tensor;

-    acl_obj.src_tensor.allocator()->import_memory(const_cast<void *>(src));
-    acl_obj.dst_tensor.allocator()->import_memory(dst);
+    src_tensor.allocator()->init(asp.src_info);
+    src_tensor.allocator()->import_memory(const_cast<void *>(src));
+    dst_tensor.allocator()->init(asp.dst_info);
+    dst_tensor.allocator()->import_memory(dst);

-    acl_obj.softmax->run();
+    arm_compute::ITensorPack run_pack {
+            {arm_compute::TensorType::ACL_SRC_0, &src_tensor},
+            {arm_compute::TensorType::ACL_DST, &dst_tensor}};

-    acl_obj.src_tensor.allocator()->free();
-    acl_obj.dst_tensor.allocator()->free();
+    softmax_op_->run(run_pack);

     return status::success;
 }
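
To make the dispatch heuristic in pd_t::init above concrete, here is a standalone sketch of the inner_size_ == 1 branch; the constants come from the code above, while ref_beats_acl and the sample sizes are made up for illustration:

#include <cmath>
#include <cstdio>

// Mirror of the inner_size_ == 1 heuristic: a positive score means the
// reference softmax is predicted to be faster, so ACL returns unimplemented.
static bool ref_beats_acl(double outer_size, double axis_size, int threads) {
    double diff = 1 + 0.005 * outer_size
            - 0.0027 * axis_size * std::ceil(outer_size / threads);
    // Using threads within ACL adds another constant overhead.
    if (threads > 1 || outer_size > 1) diff += 17;
    return diff > 0;
}

int main() {
    // One row, 1000-element axis, single thread: ACL wins, prints 0
    // (score is 1 + 0.005 - 2.7 = -1.695).
    std::printf("%d\n", ref_beats_acl(1, 1000, 1));
    // Same shape with 16 threads: the +17 overhead flips it to ref, prints 1.
    std::printf("%d\n", ref_beats_acl(1, 1000, 16));
    return 0;
}

The else branch has the same structure, with -0.01 * inner_size_ * axis_size_ as the per-element term instead.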

src/cpu/aarch64/acl_softmax.hpp (+10 -179)

@@ -21,17 +21,15 @@

 #include "cpu/aarch64/acl_utils.hpp"

+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/IOperator.h"
+#include "arm_compute/runtime/experimental/operators/CpuSoftmax.h"
+
 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace aarch64 {

-struct acl_softmax_obj_t {
-    std::unique_ptr<arm_compute::IFunction> softmax;
-    arm_compute::Tensor src_tensor;
-    arm_compute::Tensor dst_tensor;
-};
-
 struct acl_softmax_conf_t {
     arm_compute::TensorInfo src_info;
     arm_compute::TensorInfo dst_info;
@@ -40,196 +38,29 @@ struct acl_softmax_conf_t {
     bool is_logsoftmax;
 };

-struct acl_softmax_resource_t : public resource_t {
-    acl_softmax_resource_t()
-        : acl_obj_(utils::make_unique<acl_softmax_obj_t>()) {}
-
-    status_t configure(const acl_softmax_conf_t &asp) {
-        if (!acl_obj_) return status::out_of_memory;
-
-        // Init Compute Library tensors based on info from descriptor
-        acl_obj_->src_tensor.allocator()->init(asp.src_info);
-        acl_obj_->dst_tensor.allocator()->init(asp.dst_info);
-
-        if (asp.is_logsoftmax) {
-            auto logsoftmax
-                    = std::make_unique<arm_compute::NELogSoftmaxLayer>();
-            // clang-format off
-            logsoftmax->configure(
-                    &acl_obj_->src_tensor,
-                    &acl_obj_->dst_tensor,
-                    asp.beta,
-                    asp.axis);
-            // clang-format on
-            acl_obj_->softmax = std::move(logsoftmax);
-        } else {
-            auto softmax = std::make_unique<arm_compute::NESoftmaxLayer>();
-            // clang-format off
-            softmax->configure(
-                    &acl_obj_->src_tensor,
-                    &acl_obj_->dst_tensor,
-                    asp.beta,
-                    asp.axis);
-            // clang-format on
-            acl_obj_->softmax = std::move(softmax);
-        }
-
-        return status::success;
-    }
-
-    acl_softmax_obj_t &get_acl_obj() const { return *acl_obj_; }
-
-    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_softmax_resource_t);
-
-private:
-    std::unique_ptr<acl_softmax_obj_t> acl_obj_;
-}; // acl_softmax_resource_t
-
 struct acl_softmax_fwd_t : public primitive_t {
     struct pd_t : public cpu_softmax_fwd_pd_t {
         using cpu_softmax_fwd_pd_t::cpu_softmax_fwd_pd_t;

         DECLARE_COMMON_PD_T("acl", acl_softmax_fwd_t);
-
-        status_t init(engine_t *engine) {
-
-            bool ok = is_fwd()
-                    && set_default_formats() == status::success
-                    // ACL only supports matching src/dst (this must come after
-                    // set_default_formats() to handle format_kind::any)
-                    && *src_md() == *dst_md()
-                    && utils::one_of(
-                            src_md()->data_type, data_type::f32, data_type::f16)
-                    && attr()->has_default_values();
-            if (!ok) return status::unimplemented;
-
-            // Get memory desc to find sizes and dims
-            const memory_desc_wrapper src_d(src_md());
-            const data_type_t data_type = src_d.data_type();
-
-            // ACL only supports plain tensors, can be permuted but not blocked
-            if (!src_d.is_plain()) return status::unimplemented;
-
-            // Guards against a 0-sized dimension
-            if (src_d.has_zero_dim()) return status::unimplemented;
-
-            // No scaling
-            asp_.beta = 1;
-
-            asp_.is_logsoftmax = is_logsoftmax();
-
-            // The strides give us the in memory inner size
-            dim_t inner_size_ = src_d.blocking_desc().strides[axis()];
-
-            dim_t axis_size_ = axis_size();
-
-            // The outer size is any left-over dimensions not inner or on the axis
-            dim_t outer_size_ = src_d.nelems() / (inner_size_ * axis_size_);
-
-            // In this context, NHWC tells ACL that the logical and physical
-            // dimensions are the same
-            arm_compute::DataLayout acl_layout = arm_compute::DataLayout::NHWC;
-
-            const arm_compute::DataType acl_data_t
-                    = acl_utils::get_acl_data_t(data_type);
-
-            const int threads = dnnl_get_max_threads();
-            if (inner_size_ == 1) {
-                // A rough empirical heuristic created by fitting a polynomial
-                // of the tensor sizes and thread count to the run time of the
-                // ref and ACL softmax. This variable is greater than zero when
-                // ref is faster, and less than zero when ACL is faster. We can
-                // interpret the constant term as the constant overhead
-                // associated with calling the external library and the negative
-                // coefficient on total_size as ACL being faster at processing
-                // each element
-                double acl_ref_performance_diff = 1 + 0.005 * outer_size_
-                        - 0.0027 * axis_size_
-                                * std::ceil(double(outer_size_) / threads);
-                if (threads > 1 || outer_size_ > 1) {
-                    // Using threads within ACL adds another constant overhead
-                    acl_ref_performance_diff += 17;
-                }
-                if (acl_ref_performance_diff > 0) return status::unimplemented;
-
-                // If the inner size is 1, we can get rid of the dimension.
-                // This stops ACL doing a unnecessary permute
-                arm_compute::TensorShape acl_tensor_shape
-                        = arm_compute::TensorShape(axis_size_, outer_size_);
-                asp_.axis = 0;
-
-                asp_.src_info = arm_compute::TensorInfo(
-                        acl_tensor_shape, 1, acl_data_t, acl_layout);
-                asp_.dst_info = arm_compute::TensorInfo(
-                        acl_tensor_shape, 1, acl_data_t, acl_layout);
-            } else {
-                // A rough empirical heuristic, see comment above
-                // The only difference here is that ACL does a reorder, and so
-                // is considerably better
-                double acl_ref_performance_diff = 1 + 0.005 * outer_size_
-                        - 0.01 * inner_size_ * axis_size_
-                                * std::ceil(double(outer_size_) / threads);
-                if (threads > 1 || outer_size_ > 1) {
-                    // Using threads within ACL adds another constant overhead
-                    acl_ref_performance_diff += 17;
-                }
-
-                if (acl_ref_performance_diff > 0) return status::unimplemented;
-
-                // Irrespective of the input dimensions, we construct a tensor
-                // with dimensions such that softmax can be applied over the
-                // middle axis (1), with the correct stride and vector length.
-                arm_compute::TensorShape acl_tensor_shape
-                        = arm_compute::TensorShape(
-                                inner_size_, axis_size_, outer_size_);
-                asp_.axis = 1;
-
-                asp_.src_info = arm_compute::TensorInfo(
-                        acl_tensor_shape, 1, acl_data_t, acl_layout);
-                asp_.dst_info = arm_compute::TensorInfo(
-                        acl_tensor_shape, 1, acl_data_t, acl_layout);
-            }
-
-            // Validate manually to check for return status
-            if (asp_.is_logsoftmax) {
-                ACL_CHECK_VALID(arm_compute::NELogSoftmaxLayer::validate(
-                        &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis));
-            } else {
-                ACL_CHECK_VALID(arm_compute::NESoftmaxLayer::validate(
-                        &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis));
-            }
-
-            return status::success;
-        }
+        status_t init(engine_t *engine);

         acl_softmax_conf_t asp_;
     }; // pd_t

+    // constructor
     acl_softmax_fwd_t(const pd_t *apd) : primitive_t(apd) {}

-    status_t create_resource(
-            engine_t *engine, resource_mapper_t &mapper) const override {
-        if (mapper.has_resource(this)) return status::success;
-
-        auto r = utils::make_unique<acl_softmax_resource_t>();
-        if (!r) return status::out_of_memory;
-
-        // Configure the resource based on information from primitive descriptor
-        auto st = r->configure(pd()->asp_);
-        if (st == status::success) { mapper.add(this, std::move(r)); }
-
-        return st;
-    }
-
     status_t execute(const exec_ctx_t &ctx) const override {
         return execute_forward(ctx);
     }

 private:
-    // To guard the const execute_forward, the mutex must be 'mutable'
-    mutable std::mutex mtx;
+    const pd_t *pd() const;
+
+    status_t init(engine_t *engine) override;
     status_t execute_forward(const exec_ctx_t &ctx) const;
-    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    std::unique_ptr<arm_compute::experimental::op::CpuSoftmax> softmax_op_;
 }; // acl_softmax_fwd_t

 } // namespace aarch64
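
The header changes mirror the move to the stateless interface: acl_softmax_obj_t, acl_softmax_resource_t, create_resource(), and the mutable mutex all disappear, leaving a single configured CpuSoftmax member. Assuming, as the dropped mutex suggests, that run() is safe to call concurrently when every caller supplies its own pack, the per-call wrapping reduces to a small stack-only helper; run_on and its arguments are hypothetical, not part of the commit:

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuSoftmax.h"

// Hypothetical helper mirroring execute_forward: every call owns its
// Tensor wrappers and pack on the stack, sharing only the configured op.
void run_on(arm_compute::experimental::op::CpuSoftmax &op,
        const arm_compute::TensorInfo &info, const void *src_buf,
        void *dst_buf) {
    arm_compute::Tensor src, dst;
    src.allocator()->init(info);
    src.allocator()->import_memory(const_cast<void *>(src_buf));
    dst.allocator()->init(info);
    dst.allocator()->import_memory(dst_buf);

    arm_compute::ITensorPack pack {
            {arm_compute::TensorType::ACL_SRC_0, &src},
            {arm_compute::TensorType::ACL_DST, &dst}};
    op.run(pack);
    // import_memory'd buffers are caller-owned; nothing to free here.
}

Two threads could then drive the same configured op with different buffers, e.g. std::thread t0(run_on, std::ref(op), std::cref(info), src0, dst0) alongside a second thread, with no shared mutable state between the calls.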
