21
21
22
22
#include " cpu/aarch64/acl_utils.hpp"
23
23
24
+ #include " arm_compute/core/TensorInfo.h"
25
+ #include " arm_compute/runtime/IOperator.h"
26
+ #include " arm_compute/runtime/experimental/operators/CpuSoftmax.h"
27
+
24
28
namespace dnnl {
25
29
namespace impl {
26
30
namespace cpu {
27
31
namespace aarch64 {
28
32
29
// Bundle of ACL objects used by one softmax execution: the configured
// ACL function plus the tensors it reads and writes.
struct acl_softmax_obj_t {
    // Concrete type is chosen at configure time (NESoftmaxLayer or
    // NELogSoftmaxLayer); stored through the IFunction interface.
    std::unique_ptr<arm_compute::IFunction> softmax;
    arm_compute::Tensor src_tensor;
    arm_compute::Tensor dst_tensor;
};
35
33
struct acl_softmax_conf_t {
36
34
arm_compute::TensorInfo src_info;
37
35
arm_compute::TensorInfo dst_info;
@@ -40,196 +38,29 @@ struct acl_softmax_conf_t {
40
38
bool is_logsoftmax;
41
39
};
42
40
43
// Per-stream resource holding the ACL function and tensors for one
// softmax primitive instance.
struct acl_softmax_resource_t : public resource_t {
    acl_softmax_resource_t()
        : acl_obj_(utils::make_unique<acl_softmax_obj_t>()) {}

    // Initialize the ACL tensors and build the (log)softmax function from
    // the configuration computed by the primitive descriptor.
    status_t configure(const acl_softmax_conf_t &asp) {
        if (!acl_obj_) return status::out_of_memory;

        // Init Compute Library tensors based on info from descriptor
        acl_obj_->src_tensor.allocator()->init(asp.src_info);
        acl_obj_->dst_tensor.allocator()->init(asp.dst_info);

        if (asp.is_logsoftmax) {
            auto logsoftmax
                    = std::make_unique<arm_compute::NELogSoftmaxLayer>();
            // clang-format off
            logsoftmax->configure(
                    &acl_obj_->src_tensor,
                    &acl_obj_->dst_tensor,
                    asp.beta,
                    asp.axis);
            // clang-format on
            acl_obj_->softmax = std::move(logsoftmax);
        } else {
            auto softmax = std::make_unique<arm_compute::NESoftmaxLayer>();
            // clang-format off
            softmax->configure(
                    &acl_obj_->src_tensor,
                    &acl_obj_->dst_tensor,
                    asp.beta,
                    asp.axis);
            // clang-format on
            acl_obj_->softmax = std::move(softmax);
        }

        return status::success;
    }

    acl_softmax_obj_t &get_acl_obj() const { return *acl_obj_; }

    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_softmax_resource_t);

private:
    std::unique_ptr<acl_softmax_obj_t> acl_obj_;
}; // acl_softmax_resource_t
87
-
88
41
struct acl_softmax_fwd_t : public primitive_t {
89
42
struct pd_t : public cpu_softmax_fwd_pd_t {
90
43
using cpu_softmax_fwd_pd_t ::cpu_softmax_fwd_pd_t ;
91
44
92
45
DECLARE_COMMON_PD_T (" acl" , acl_softmax_fwd_t );
93
-
94
- status_t init (engine_t *engine) {
95
-
96
- bool ok = is_fwd ()
97
- && set_default_formats () == status::success
98
- // ACL only supports matching src/dst (this must come after
99
- // set_default_formats() to handle format_kind::any)
100
- && *src_md () == *dst_md ()
101
- && utils::one_of (
102
- src_md ()->data_type , data_type::f32, data_type::f16)
103
- && attr ()->has_default_values ();
104
- if (!ok) return status::unimplemented;
105
-
106
- // Get memory desc to find sizes and dims
107
- const memory_desc_wrapper src_d (src_md ());
108
- const data_type_t data_type = src_d.data_type ();
109
-
110
- // ACL only supports plain tensors, can be permuted but not blocked
111
- if (!src_d.is_plain ()) return status::unimplemented;
112
-
113
- // Guards against a 0-sized dimension
114
- if (src_d.has_zero_dim ()) return status::unimplemented;
115
-
116
- // No scaling
117
- asp_.beta = 1 ;
118
-
119
- asp_.is_logsoftmax = is_logsoftmax ();
120
-
121
- // The strides give us the in memory inner size
122
- dim_t inner_size_ = src_d.blocking_desc ().strides [axis ()];
123
-
124
- dim_t axis_size_ = axis_size ();
125
-
126
- // The outer size is any left-over dimensions not inner or on the axis
127
- dim_t outer_size_ = src_d.nelems () / (inner_size_ * axis_size_);
128
-
129
- // In this context, NHWC tells ACL that the logical and physical
130
- // dimensions are the same
131
- arm_compute::DataLayout acl_layout = arm_compute::DataLayout::NHWC;
132
-
133
- const arm_compute::DataType acl_data_t
134
- = acl_utils::get_acl_data_t (data_type);
135
-
136
- const int threads = dnnl_get_max_threads ();
137
- if (inner_size_ == 1 ) {
138
- // A rough empirical heuristic created by fitting a polynomial
139
- // of the tensor sizes and thread count to the run time of the
140
- // ref and ACL softmax. This variable is greater than zero when
141
- // ref is faster, and less than zero when ACL is faster. We can
142
- // interpret the constant term as the constant overhead
143
- // associated with calling the external library and the negative
144
- // coefficient on total_size as ACL being faster at processing
145
- // each element
146
- double acl_ref_performance_diff = 1 + 0.005 * outer_size_
147
- - 0.0027 * axis_size_
148
- * std::ceil (double (outer_size_) / threads);
149
- if (threads > 1 || outer_size_ > 1 ) {
150
- // Using threads within ACL adds another constant overhead
151
- acl_ref_performance_diff += 17 ;
152
- }
153
- if (acl_ref_performance_diff > 0 ) return status::unimplemented;
154
-
155
- // If the inner size is 1, we can get rid of the dimension.
156
- // This stops ACL doing a unnecessary permute
157
- arm_compute::TensorShape acl_tensor_shape
158
- = arm_compute::TensorShape (axis_size_, outer_size_);
159
- asp_.axis = 0 ;
160
-
161
- asp_.src_info = arm_compute::TensorInfo (
162
- acl_tensor_shape, 1 , acl_data_t , acl_layout);
163
- asp_.dst_info = arm_compute::TensorInfo (
164
- acl_tensor_shape, 1 , acl_data_t , acl_layout);
165
- } else {
166
- // A rough empirical heuristic, see comment above
167
- // The only difference here is that ACL does a reorder, and so
168
- // is considerably better
169
- double acl_ref_performance_diff = 1 + 0.005 * outer_size_
170
- - 0.01 * inner_size_ * axis_size_
171
- * std::ceil (double (outer_size_) / threads);
172
- if (threads > 1 || outer_size_ > 1 ) {
173
- // Using threads within ACL adds another constant overhead
174
- acl_ref_performance_diff += 17 ;
175
- }
176
-
177
- if (acl_ref_performance_diff > 0 ) return status::unimplemented;
178
-
179
- // Irrespective of the input dimensions, we construct a tensor
180
- // with dimensions such that softmax can be applied over the
181
- // middle axis (1), with the correct stride and vector length.
182
- arm_compute::TensorShape acl_tensor_shape
183
- = arm_compute::TensorShape (
184
- inner_size_, axis_size_, outer_size_);
185
- asp_.axis = 1 ;
186
-
187
- asp_.src_info = arm_compute::TensorInfo (
188
- acl_tensor_shape, 1 , acl_data_t , acl_layout);
189
- asp_.dst_info = arm_compute::TensorInfo (
190
- acl_tensor_shape, 1 , acl_data_t , acl_layout);
191
- }
192
-
193
- // Validate manually to check for return status
194
- if (asp_.is_logsoftmax ) {
195
- ACL_CHECK_VALID (arm_compute::NELogSoftmaxLayer::validate (
196
- &asp_.src_info , &asp_.dst_info , asp_.beta , asp_.axis ));
197
- } else {
198
- ACL_CHECK_VALID (arm_compute::NESoftmaxLayer::validate (
199
- &asp_.src_info , &asp_.dst_info , asp_.beta , asp_.axis ));
200
- }
201
-
202
- return status::success;
203
- }
46
+ status_t init (engine_t *engine);
204
47
205
48
acl_softmax_conf_t asp_;
206
49
}; // pd_t
207
50
51
+ // constructor
208
52
acl_softmax_fwd_t (const pd_t *apd) : primitive_t (apd) {}
209
53
210
- status_t create_resource (
211
- engine_t *engine, resource_mapper_t &mapper) const override {
212
- if (mapper.has_resource (this )) return status::success;
213
-
214
- auto r = utils::make_unique<acl_softmax_resource_t >();
215
- if (!r) return status::out_of_memory;
216
-
217
- // Configure the resource based on information from primitive descriptor
218
- auto st = r->configure (pd ()->asp_ );
219
- if (st == status::success) { mapper.add (this , std::move (r)); }
220
-
221
- return st;
222
- }
223
-
224
54
status_t execute (const exec_ctx_t &ctx) const override {
225
55
return execute_forward (ctx);
226
56
}
227
57
228
58
private:
229
- // To guard the const execute_forward, the mutex must be 'mutable'
230
- mutable std::mutex mtx;
59
+ const pd_t *pd () const ;
60
+
61
+ status_t init (engine_t *engine) override ;
231
62
status_t execute_forward (const exec_ctx_t &ctx) const ;
232
- const pd_t * pd () const { return ( const pd_t *) primitive_t::pd (). get (); }
63
+ std::unique_ptr<arm_compute::experimental::op::CpuSoftmax> softmax_op_;
233
64
}; // acl_softmax_fwd_t
234
65
235
66
} // namespace aarch64
0 commit comments