
Commit 2cfff2b

theComputeKid authored and vpirogov committed
cpu: aarch64: Enable ACL stateless API for indirect conv
- Bump ACL requirements to 24.07 and document.
- Call stateless ACL APIs from oneDNN for indirect convolution.
- Update gitignore to handle the .cache folder for clangd code navigation.

Signed-off-by: Hamza Butt <hamza.butt@arm.com>
1 parent 02b794d commit 2cfff2b

8 files changed (+232 −150 lines)
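The central change is the move from ACL's stateful runtime functions (configured per primitive and guarded by a resource mapper) to the stateless operator API: the operator is configured once against tensor descriptors, reports its auxiliary memory through workspace(), and every run() receives all inputs, outputs, and temporaries through an ITensorPack. Below is a minimal sketch of that pattern, not taken from the commit itself; ConvOp stands in for the stateless ACL operator that oneDNN aliases as Op in acl_indirect_gemm_convolution.hpp (not shown in this diff), and configure() is assumed to have been called on it already.

#include <cstdint>
#include <vector>

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

// Sketch only: ConvOp is a placeholder for the stateless ACL conv operator.
template <typename ConvOp>
void run_stateless_conv(ConvOp &conv, const arm_compute::TensorInfo &src_info,
        const arm_compute::TensorInfo &wei_info,
        const arm_compute::TensorInfo &dst_info, void *src_buf, void *wei_buf,
        void *dst_buf) {
    // Wrap caller-owned buffers; import_memory() only acquires the pointer,
    // it does not allocate or copy.
    arm_compute::Tensor src, wei, dst;
    src.allocator()->init(src_info);
    wei.allocator()->init(wei_info);
    dst.allocator()->init(dst_info);
    src.allocator()->import_memory(src_buf);
    wei.allocator()->import_memory(wei_buf);
    dst.allocator()->import_memory(dst_buf);

    arm_compute::ITensorPack pack
            = {{arm_compute::TensorType::ACL_SRC_0, &src},
                    {arm_compute::TensorType::ACL_SRC_1, &wei},
                    {arm_compute::TensorType::ACL_DST, &dst}};

    // The operator reports its auxiliary memory instead of allocating it
    // internally. Plain vectors stand in for the oneDNN scratchpad here;
    // the real code books aligned scratchpad buffers per reported slot.
    const auto aux_mem = conv.workspace();
    std::vector<std::vector<uint8_t>> buffers(aux_mem.size());
    std::vector<arm_compute::Tensor> tmp(aux_mem.size());
    for (size_t i = 0; i < aux_mem.size(); ++i) {
        if (aux_mem[i].size == 0) continue;
        buffers[i].resize(aux_mem[i].size);
        const arm_compute::TensorInfo info(
                arm_compute::TensorShape(aux_mem[i].size), 1,
                arm_compute::DataType::U8);
        tmp[i].allocator()->init(info);
        tmp[i].allocator()->import_memory(buffers[i].data());
        pack.add_tensor(aux_mem[i].slot, &tmp[i]);
    }

    conv.prepare(pack); // one-off work such as weight pre-transposition
    conv.run(pack); // every run gets all I/O and temporaries from the pack
}

Because the operator holds no pointers to user memory, it can be shared across threads without the mutex and resource mapper that the old stateful path needed; that is what the deleted locking code in acl_indirect_gemm_convolution.cpp below reflects.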

.gitignore (+2 −1)

@@ -1,5 +1,6 @@
 #===============================================================================
 # Copyright 2019-2021 Intel Corporation
+# Copyright 2024 Arm Limited and affiliates.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,4 +30,4 @@ compile_commands.json
 .git-blame-ignore-revs
 **/.DS_Store
 __pycache__
-
+.cache

README.md (+1 −1)

@@ -171,7 +171,7 @@ On a CPU based on Arm AArch64 architecture, oneDNN CPU engine can be built with
 machine learning applications and provides AArch64 optimized implementations
 of core functions. This functionality currently requires that ACL is downloaded
 and built separately. See [Build from Source] section of the Developer Guide for
-details. oneDNN only supports Compute Library versions 24.04 or later.
+details. oneDNN only supports Compute Library versions 24.07 or later.
 
 [Arm Compute Library (ACL)]: https://github.com/arm-software/ComputeLibrary
 

cmake/ACL.cmake (+1 −1)

@@ -31,7 +31,7 @@ endif()
 
 find_package(ACL REQUIRED)
 
-set(ACL_MINIMUM_VERSION "24.04")
+set(ACL_MINIMUM_VERSION "24.07")
 
 if(ACL_FOUND)
     file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed)

src/common/memory_tracking.hpp (+2)

@@ -199,6 +199,7 @@ enum {
     key_conv_gemm_zp_src_comp,
     key_conv_int_dat_in_acc_dt,
     key_conv_padded_bias,
+    key_conv_permuted_weights,
     key_conv_rtus_space,
     key_conv_store_wsp,
     key_conv_tails,
@@ -225,6 +226,7 @@ enum {
     key_gemm_blocked_a,
     key_gemm_blocked_b,
     key_gemm_accumulator,
+    key_gemm_pretranspose,
     key_generic_acc,
     key_gnorm_cvt,
     key_gnorm_reduction,

src/cpu/aarch64/acl_convolution_utils.cpp (−37)

@@ -310,43 +310,6 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
     return status::success;
 }
 
-status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
-        memory_desc_t &weights_md, memory_desc_t &dst_md,
-        memory_desc_t &bias_md, const convolution_desc_t &cd,
-        const primitive_attr_t &attr) {
-    if (weights_md.ndims != 4) return status::unimplemented;
-
-    // Indirect is slower for small convolution kernels, except when src, weight and dst are BF16
-    if (weights_md.dims[2] == 1 && weights_md.dims[3] == 1
-            && !everyone_is(data_type::bf16, src_md.data_type,
-                    weights_md.data_type, dst_md.data_type))
-        return status::unimplemented;
-
-    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
-
-    // If we do not need to pad input channels for fast math mode then it would
-    // be faster to run convolution with im2row instead of using indirect kernel
-    int block_by = arm_compute::block_by(acp.weights_info.weight_format());
-    int ic = src_md.dims[1];
-    if (acp.fast_math && ic % block_by == 0) return status::unimplemented;
-
-    // clang-format off
-    // NOTE: indirect convolution method supports only nhwc layout.
-    ACL_CHECK_VALID(arm_compute::NEGEMMConv2d::validate(
-        &acp.src_tensor_info,
-        &acp.wei_tensor_info,
-        acp.with_bias ? &acp.bia_tensor_info : nullptr,
-        &acp.dst_tensor_info,
-        arm_compute::Conv2dInfo(acp.padstride_info,
-                                acp.dilation_info,
-                                acp.act_info,
-                                acp.fast_math,
-                                1, acp.weights_info)));
-    // clang-format on
-
-    return status::success;
-}
-
 status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &weights_md, memory_desc_t &dst_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,

src/cpu/aarch64/acl_convolution_utils.hpp (+109 −1)

@@ -34,6 +34,7 @@ struct acl_obj_t {
     arm_compute::Tensor wei_tensor;
     arm_compute::Tensor bia_tensor;
     arm_compute::Tensor dst_tensor;
+    arm_compute::experimental::MemoryRequirements aux_mem_req;
 };
 
 struct acl_conv_conf_t {
@@ -65,7 +66,7 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,
         const primitive_attr_t &attr);
 
-status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
+status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &weights_md, memory_desc_t &dst_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,
         const primitive_attr_t &attr);
@@ -81,6 +82,113 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
         const primitive_attr_t &attr);
 } // namespace acl_convolution_utils
 
+// Keys are anonymous with local linkage. So deduce the type automagically.
+using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);
+
+template <typename op_t, typename post_ops_t>
+status_t init_scratchpad(op_t &conv, memory_tracking::registrar_t &scratchpad,
+        const std::map<int, conv_key_t> &conv_keys, engine_t *engine,
+        post_ops_t &post_ops, dnnl::impl::post_ops_t &attr_post_ops,
+        arm_compute::ActivationLayerInfo &act_info, bool &use_dst_acc_for_sum,
+        const dnnl::impl::memory_desc_t &dst_md) {
+
+    // Book temp mem.
+    const auto aux_mem_req = conv.workspace();
+    for (const auto &key : conv_keys) {
+        const auto id = key.first;
+        if (aux_mem_req[id].size > 0) {
+            scratchpad.book(key.second, aux_mem_req[id].size, 1,
+                    aux_mem_req[id].alignment, aux_mem_req[id].alignment);
+        }
+    }
+
+    CHECK(post_ops.init(engine, attr_post_ops, dst_md, act_info));
+    use_dst_acc_for_sum = post_ops.has_sum();
+
+    if (use_dst_acc_for_sum) {
+        const memory_desc_wrapper dst_d(&dst_md);
+        scratchpad.book(memory_tracking::names::key_generic_acc, dst_d.nelems(),
+                dst_d.data_type_size());
+    }
+
+    return status::success;
+}
+
+template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
+        typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
+        typename bia_data_t = src_data_t>
+status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
+        conv_obj_t *acl_conv_obj, const conv_pd_t *pd,
+        const std::map<int, conv_key_t> &conv_keys) {
+
+    auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
+    auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
+
+    // import_memory() and free() methods do not allocate/free any additional
+    // memory, only acquire/release pointers.
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor wei_tensor;
+    arm_compute::Tensor bia_tensor = nullptr;
+    arm_compute::Tensor dst_tensor;
+
+    auto const acp = pd->acp_;
+
+    src_tensor.allocator()->init(acp.src_tensor_info);
+    wei_tensor.allocator()->init(acp.wei_tensor_info);
+    dst_tensor.allocator()->init(acp.dst_tensor_info);
+
+    src_tensor.allocator()->import_memory(const_cast<src_data_t *>(src_base));
+    wei_tensor.allocator()->import_memory(const_cast<wei_data_t *>(wei_base));
+
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = acp.use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
+    dst_tensor.allocator()->import_memory(dst_base);
+
+    if (acp.with_bias) {
+        auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
+        bia_tensor.allocator()->init(acp.bia_tensor_info);
+        bia_tensor.allocator()->import_memory(
+                const_cast<bia_data_t *>(bia_base));
+    }
+
+    arm_compute::ITensorPack pack
+            = {{arm_compute::TensorType::ACL_SRC_0, &src_tensor},
+                    {arm_compute::TensorType::ACL_SRC_1, &wei_tensor},
+                    {arm_compute::TensorType::ACL_SRC_2, &bia_tensor},
+                    {arm_compute::TensorType::ACL_DST, &dst_tensor}};
+
+    // Get temp workspaces.
+    const auto aux_mem = acl_conv_obj->aux_mem_req;
+
+    // Hold onto tmp tensors while we need pack.
+    std::vector<arm_compute::Tensor> tmp_tensors(aux_mem.size());
+    for (const auto &key : conv_keys) {
+        const auto id = key.first;
+        if (aux_mem[id].size > 0) {
+            const auto info = arm_compute::TensorInfo(
+                    arm_compute::TensorShape(aux_mem[id].size), 1,
+                    arm_compute::DataType::U8);
+            auto buffer = scratchpad.get<void>(key.second);
+            tmp_tensors[id].allocator()->init(info, aux_mem[id].alignment);
+            tmp_tensors[id].allocator()->import_memory(buffer);
+            pack.add_tensor(aux_mem[id].slot, &tmp_tensors[id]);
+        }
+    }
+
+    acl_conv_obj->conv.prepare(pack);
+    acl_conv_obj->conv.run(pack);
+
+    void *dst = dst_tensor.buffer();
+    pd->post_ops.execute(ctx, dst);
+
+    return status::success;
+}
+
 template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
         typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
         typename bia_data_t = src_data_t>

src/cpu/aarch64/acl_indirect_gemm_convolution.cpp (+102 −14)

@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2022 Arm Ltd. and affiliates
+* Copyright 2021-2022, 2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,27 +14,115 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/aarch64/acl_indirect_gemm_convolution.hpp"
+#include "acl_indirect_gemm_convolution.hpp"
+#include "acl_convolution_utils.hpp"
+#include "common/memory_tracking.hpp"
+#include "common/utils.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace aarch64 {
 
+namespace {
+// Keys are anonymous. So deduce the type automagically.
+using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);
+
+// Map: [slot , key]
+const std::map<int, conv_key_t> indirect_conv_keys
+        = {{0, conv_key_t::key_gemm_tmp_buffer},
+                {2, conv_key_t::key_gemm_pretranspose},
+                {3, conv_key_t::key_conv_permuted_weights}};
+} // namespace
+
+status_t acl_indirect_gemm_convolution_fwd_t::init(engine_t *engine) {
+    auto acp_ = pd()->acp_;
+    acl_obj_->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info,
+            arm_compute::Conv2dInfo(acp_.padstride_info, acp_.dilation_info,
+                    acp_.act_info, acp_.fast_math, 1, acp_.weights_info));
+    acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
+    return status::success;
+}
+
 status_t acl_indirect_gemm_convolution_fwd_t::execute_forward(
         const exec_ctx_t &ctx) const {
-    // Lock here is needed because resource_mapper does not support
-    // concurrent multithreaded access.
-    std::lock_guard<std::mutex> _lock {this->mtx};
-    // Retrieve primitive resource and configured Compute Library objects
-    auto *acl_resource
-            = ctx.get_resource_mapper()->get<acl_indirect_gemm_resource_t>(
-                    this);
-    acl_obj_t<arm_compute::NEGEMMConv2d> &acl_indirect_gemm_obj
-            = acl_resource->get_acl_obj();
-
-    return execute_forward_conv_acl<acl_obj_t<arm_compute::NEGEMMConv2d>, pd_t,
-            data_t>(ctx, acl_indirect_gemm_obj, pd());
+    return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, data_t>(
+            ctx, acl_obj_.get(), pd(), indirect_conv_keys);
+}
+
+status_t acl_indirect_gemm_convolution_fwd_t::create_resource(
+        engine_t *engine, resource_mapper_t &mapper) const {
+
+    CHECK(pd()->post_ops.create_resource(engine, mapper));
+    return status::success;
+}
+
+status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init_conf() {
+    if (weights_md_.ndims != 4) return status::unimplemented;
+
+    // Indirect is slower for small convolution kernels, except when src, weight and dst are BF16
+    if (weights_md_.dims[2] == 1 && weights_md_.dims[3] == 1
+            && !dnnl::impl::utils::everyone_is(data_type::bf16,
+                    src_md_.data_type, weights_md_.data_type,
+                    dst_md_.data_type))
+        return status::unimplemented;
+
+    CHECK(acl_convolution_utils::acl_init_conf(
+            acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));
+
+    // If we do not need to pad input channels for fast math mode then it would
+    // be faster to run convolution with im2row instead of using indirect kernel
+    int block_by = arm_compute::block_by(acp_.weights_info.weight_format());
+    int ic = src_md_.dims[1];
+    if (acp_.fast_math && ic % block_by == 0) return status::unimplemented;
+
+    // clang-format off
+    // NOTE: indirect convolution method supports only nhwc layout.
+    ACL_CHECK_VALID(Op::validate(
+        &acp_.src_tensor_info,
+        &acp_.wei_tensor_info,
+        acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+        &acp_.dst_tensor_info,
+        arm_compute::Conv2dInfo(acp_.padstride_info,
+                                acp_.dilation_info,
+                                acp_.act_info,
+                                acp_.fast_math,
+                                1, acp_.weights_info)));
+    // clang-format on
+
+    return status::success;
+}
+
+status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init(engine_t *engine) {
+    using namespace data_type;
+    using smask_t = primitive_attr_t::skip_mask_t;
+
+    const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
+            && attr()->has_default_values(smask_t::post_ops, f16);
+    const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
+            && attr()->has_default_values(
+                    smask_t::post_ops | smask_t::fpmath_mode, f32);
+    bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct)
+            && utils::one_of(true, is_fp16_ok, is_fp32_ok)
+            && !has_zero_dim_memory();
+    if (!ok) return status::unimplemented;
+
+    CHECK(init_conf());
+
+    // Book memory.
+    Op conv;
+    conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info,
+            arm_compute::Conv2dInfo(acp_.padstride_info, acp_.dilation_info,
+                    acp_.act_info, acp_.fast_math, 1, acp_.weights_info));
+
+    auto scratchpad = scratchpad_registry().registrar();
+    return init_scratchpad(conv, scratchpad, indirect_conv_keys, engine,
+            post_ops, attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum,
+            dst_md_);
 }
 
 } // namespace aarch64
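For context, a hedged usage sketch that is not part of the commit: applications never call these classes directly. They create a convolution through the public oneDNN API, and on an AArch64 build with ACL the dispatcher may select the indirect GEMM implementation above for nhwc f32/f16 shapes such as the one below. The shapes are illustrative and the chosen implementation is not guaranteed.

#include "dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    // 1x64x56x56 input, 64 3x3 filters, stride 1, padding 1 (same spatial dims).
    const memory::dim N = 1, IC = 64, IH = 56, IW = 56, OC = 64, KH = 3, KW = 3;
    memory::desc src_md({N, IC, IH, IW}, memory::data_type::f32,
            memory::format_tag::nhwc);
    memory::desc wei_md({OC, IC, KH, KW}, memory::data_type::f32,
            memory::format_tag::any);
    memory::desc dst_md({N, OC, IH, IW}, memory::data_type::f32,
            memory::format_tag::nhwc);

    // The indirect conv path requires nhwc; which implementation is picked
    // still depends on the build and the shape.
    auto pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            src_md, wei_md, dst_md, {1, 1}, {1, 1}, {1, 1});

    memory src(pd.src_desc(), eng), wei(pd.weights_desc(), eng),
            dst(pd.dst_desc(), eng);
    convolution_forward(pd).execute(strm,
            {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, wei},
                    {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}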
