/*******************************************************************************
* Copyright 2020-2024 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "acl_winograd_convolution.hpp"
#include "common/memory_tracking.hpp"
#include "common/utils.hpp"
namespace dnnl {
namespace impl {
namespace cpu {
namespace aarch64 {

namespace {
using data_t = prec_traits<data_type::f32>::type;

// Keys are anonymous, so deduce the type automagically.
using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);

// Map: [slot, key], i.e. from an ACL auxiliary-memory slot index to the
// corresponding oneDNN scratchpad key.
const std::map<int, conv_key_t> wino_conv_keys
        = {{0, conv_key_t::key_gemm_asm_tmp_buffer},
                {1, conv_key_t::key_gemm_pretranspose_b},
                {2, conv_key_t::key_gemm_pretranspose},
                {3, conv_key_t::key_gemm_interleaved_lhs},
                {4, conv_key_t::key_gemm_pretransposed_rhs},
                {5, conv_key_t::key_gemm_transposed_1xwrhs},
                {6, conv_key_t::key_gemm_tmp_buffer},
                {7, conv_key_t::key_conv_permuted_outputs},
                {8, conv_key_t::key_conv_permuted_inputs},
                {9, conv_key_t::key_wino_workspace},
                {10, conv_key_t::key_wino_transformed_weights},
                {11, conv_key_t::key_conv_permuted_weights}};
} // namespace
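
// Initialize the primitive descriptor: check that the requested forward
// convolution is supported here (f32 or f16, attributes limited to post
// ops), pick Winograd for convolution_auto, and configure a throwaway ACL
// operator so its workspace requirements can size the oneDNN scratchpad.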
status_t acl_wino_convolution_fwd_t::pd_t::init(engine_t *engine) {
    using namespace data_type;

    const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
            && attr()->has_default_values(
                    primitive_attr_t::skip_mask_t::post_ops, f16);
    const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
            && attr()->has_default_values(
                    primitive_attr_t::skip_mask_t::post_ops, f32);

    bool ok = is_fwd()
            && utils::one_of(desc()->alg_kind, alg_kind::convolution_auto,
                    alg_kind::convolution_winograd)
            && utils::one_of(true, is_fp16_ok, is_fp32_ok)
            && !has_zero_dim_memory();
    ok = ok && DNNL_CPU_THREADING_RUNTIME != DNNL_RUNTIME_THREADPOOL;
    if (!ok) return status::unimplemented;

    CHECK(init_conf());

    set_default_alg_kind(alg_kind::convolution_winograd);

    Op conv;
    conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
            &acp_.dst_tensor_info, acp_.padstride_info, acp_.act_info,
            true); // to support 5x5, 7x7 filter shapes in addition to 3x3

    auto scratchpad = scratchpad_registry().registrar();
    const auto aux_mem = conv.workspace();
    return init_scratchpad(conv, scratchpad, wino_conv_keys, engine, post_ops,
            attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_);
}

status_t acl_wino_convolution_fwd_t::init(engine_t *engine) {
    // Commented out as part of a temporary hotfix for the stateless API,
    // which should be replaced soon: the ACL operator is instead re-created
    // on every execution by reinitialize_acl_obj() (see execute_forward()).
    // auto acp = pd()->acp_;
    // acl_obj_->conv.configure(&acp.src_tensor_info, &acp.wei_tensor_info,
    //         acp.with_bias ? &acp.bia_tensor_info : nullptr,
    //         &acp.dst_tensor_info, acp.padstride_info, acp.act_info,
    //         true); // to support 5x5, 7x7 filter shapes in addition to 3x3
    // acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
    return status::success;
}
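
// Populate the ACL convolution parameters (acp_) and decide whether this
// problem should go through Winograd at all; a rejection here falls through
// to the next implementation on the list.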
status_t acl_wino_convolution_fwd_t::pd_t::init_conf() {
    // Under these conditions, fall back to the faster GEMM-based convolution
    // unless the user explicitly requested the Winograd algorithm.
    if (utils::one_of(true, src_md_.dims[2] > 112, // ih
                src_md_.dims[3] > 112, // iw
                src_md_.dims[1] < 64, // ic
                dst_md_.dims[1] < 64, // oc
                dnnl_get_max_threads() > 28)
            && desc()->alg_kind == alg_kind::convolution_auto) {
        return status::unimplemented;
    }
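
    // Illustrative example (not from the original source): under
    // convolution_auto, a 3x3 convolution taking 1x64x56x56 to 1x128x56x56
    // stays on Winograd when at most 28 threads are available, since
    // ih = iw = 56 <= 112, ic = 64 >= 64 and oc = 128 >= 64.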

    // General Compute Library checks; memory tags are also set there
    acp_.alg_winograd = true;
    CHECK(acl_convolution_utils::acl_init_conf(
            acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));

    const bool shape_ok
            // only unit strides allowed
            = (acp_.padstride_info.stride()
                      == std::pair<unsigned int, unsigned int> {1, 1})
            // Note: Compute Library supports arbitrary padding for wino
            // kernels, but we only allow small padding to be consistent
            // with oneDNN
            && (acp_.padstride_info.pad().first <= 1) // padding left/right
            && (acp_.padstride_info.pad().second <= 1) // padding top/bottom
            // only non-dilated convolutions allowed
            && (acp_.dilation_info == arm_compute::Size2D(1, 1));
    ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels");

    // Validate the convolution manually to check the return status
    ACL_CHECK_VALID(Op::validate(&acp_.src_tensor_info, &acp_.wei_tensor_info,
            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
            &acp_.dst_tensor_info, acp_.padstride_info, acp_.act_info,
            true)); // enable_fast_math flag in ACL Winograd

    return status::success;
}
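
// Build a fresh ACL operator and query its auxiliary memory requirements;
// called on every execution as part of the hotfix described in
// execute_forward() below.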
std::unique_ptr<acl_obj_t<acl_wino_convolution_fwd_t::Op>>
acl_wino_convolution_fwd_t::reinitialize_acl_obj() const {
    auto acp = pd()->acp_;
    auto acl_obj = std::make_unique<acl_obj_t<Op>>();
    acl_obj->conv.configure(&acp.src_tensor_info, &acp.wei_tensor_info,
            acp.with_bias ? &acp.bia_tensor_info : nullptr,
            &acp.dst_tensor_info, acp.padstride_info, acp.act_info,
            true); // to support 5x5, 7x7 filter shapes in addition to 3x3
    acl_obj->aux_mem_req = acl_obj->conv.workspace();
    return acl_obj;
}

status_t acl_wino_convolution_fwd_t::execute_forward(
        const exec_ctx_t &ctx) const {
    // Temporary hotfix: use a local acl_obj instance here instead of the
    // class member acl_obj_. This bypasses the persistent auxiliary memory
    // requirements but is not the ideal solution; it should be refactored
    // or removed once a permanent fix is implemented.
    const auto acl_obj = reinitialize_acl_obj();
    return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, data_t>(
            ctx, acl_obj.get(), pd(), wino_conv_keys);
}

} // namespace aarch64
} // namespace cpu
} // namespace impl
} // namespace dnnl
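
// A minimal usage sketch (illustrative only, not part of this file): this
// implementation is reached through the public oneDNN C++ API when the
// checks above pass, e.g. on an aarch64 CPU engine `eng` (assumed already
// created):
//
//     using namespace dnnl;
//     memory::desc src_md({1, 64, 56, 56}, memory::data_type::f32,
//             memory::format_tag::any);
//     memory::desc wei_md({128, 64, 3, 3}, memory::data_type::f32,
//             memory::format_tag::any);
//     memory::desc dst_md({1, 128, 56, 56}, memory::data_type::f32,
//             memory::format_tag::any);
//     auto conv_pd = convolution_forward::primitive_desc(eng,
//             prop_kind::forward_inference, algorithm::convolution_winograd,
//             src_md, wei_md, dst_md, /*strides=*/{1, 1},
//             /*padding_l=*/{1, 1}, /*padding_r=*/{1, 1});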