@@ -34,6 +34,7 @@ struct acl_obj_t {
    arm_compute::Tensor wei_tensor;
    arm_compute::Tensor bia_tensor;
    arm_compute::Tensor dst_tensor;
+    arm_compute::experimental::MemoryRequirements aux_mem_req;
};

struct acl_conv_conf_t {
@@ -65,7 +66,7 @@ status_t init_conf_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
        memory_desc_t &bias_md, const convolution_desc_t &cd,
        const primitive_attr_t &attr);

-status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
+status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
        memory_desc_t &weights_md, memory_desc_t &dst_md,
        memory_desc_t &bias_md, const convolution_desc_t &cd,
        const primitive_attr_t &attr);
@@ -81,6 +82,113 @@ status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
        const primitive_attr_t &attr);

} // namespace acl_convolution_utils

+// Keys are anonymous with local linkage. So deduce the type automagically.
+using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);
+
+template <typename op_t, typename post_ops_t>
+status_t init_scratchpad(op_t &conv, memory_tracking::registrar_t &scratchpad,
+        const std::map<int, conv_key_t> &conv_keys, engine_t *engine,
+        post_ops_t &post_ops, dnnl::impl::post_ops_t &attr_post_ops,
+        arm_compute::ActivationLayerInfo &act_info, bool &use_dst_acc_for_sum,
+        const dnnl::impl::memory_desc_t &dst_md) {
+
+    // Book temp mem.
+    const auto aux_mem_req = conv.workspace();
+    for (const auto &key : conv_keys) {
+        const auto id = key.first;
+        if (aux_mem_req[id].size > 0) {
+            scratchpad.book(key.second, aux_mem_req[id].size, 1,
+                    aux_mem_req[id].alignment, aux_mem_req[id].alignment);
+        }
+    }
+
+    CHECK(post_ops.init(engine, attr_post_ops, dst_md, act_info));
+    use_dst_acc_for_sum = post_ops.has_sum();
+
+    if (use_dst_acc_for_sum) {
+        const memory_desc_wrapper dst_d(&dst_md);
+        scratchpad.book(memory_tracking::names::key_generic_acc, dst_d.nelems(),
+                dst_d.data_type_size());
+    }
+
+    return status::success;
+}
+
+template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
+        typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
+        typename bia_data_t = src_data_t>
+status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
+        conv_obj_t *acl_conv_obj, const conv_pd_t *pd,
+        const std::map<int, conv_key_t> &conv_keys) {
+
+    auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
+    auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
+
+    // import_memory() and free() methods do not allocate/free any additional
+    // memory, only acquire/release pointers.
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor wei_tensor;
+    arm_compute::Tensor bia_tensor = nullptr;
+    arm_compute::Tensor dst_tensor;
+
+    auto const acp = pd->acp_;
+
+    src_tensor.allocator()->init(acp.src_tensor_info);
+    wei_tensor.allocator()->init(acp.wei_tensor_info);
+    dst_tensor.allocator()->init(acp.dst_tensor_info);
+
+    src_tensor.allocator()->import_memory(const_cast<src_data_t *>(src_base));
+    wei_tensor.allocator()->import_memory(const_cast<wei_data_t *>(wei_base));
+
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = acp.use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
+    dst_tensor.allocator()->import_memory(dst_base);
+
+    if (acp.with_bias) {
+        auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
+        bia_tensor.allocator()->init(acp.bia_tensor_info);
+        bia_tensor.allocator()->import_memory(
+                const_cast<bia_data_t *>(bia_base));
+    }
+
+    arm_compute::ITensorPack pack
+            = {{arm_compute::TensorType::ACL_SRC_0, &src_tensor},
+                    {arm_compute::TensorType::ACL_SRC_1, &wei_tensor},
+                    {arm_compute::TensorType::ACL_SRC_2, &bia_tensor},
+                    {arm_compute::TensorType::ACL_DST, &dst_tensor}};
+
+    // Get temp workspaces.
+    const auto aux_mem = acl_conv_obj->aux_mem_req;
+
+    // Hold onto tmp tensors while we need pack.
+    std::vector<arm_compute::Tensor> tmp_tensors(aux_mem.size());
+    for (const auto &key : conv_keys) {
+        const auto id = key.first;
+        if (aux_mem[id].size > 0) {
+            const auto info = arm_compute::TensorInfo(
+                    arm_compute::TensorShape(aux_mem[id].size), 1,
+                    arm_compute::DataType::U8);
+            auto buffer = scratchpad.get<void>(key.second);
+            tmp_tensors[id].allocator()->init(info, aux_mem[id].alignment);
+            tmp_tensors[id].allocator()->import_memory(buffer);
+            pack.add_tensor(aux_mem[id].slot, &tmp_tensors[id]);
+        }
+    }
+
+    acl_conv_obj->conv.prepare(pack);
+    acl_conv_obj->conv.run(pack);
+
+    void *dst = dst_tensor.buffer();
+    pd->post_ops.execute(ctx, dst);
+
+    return status::success;
+}
+
template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
        typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
        typename bia_data_t = src_data_t>
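
Usage sketch (not part of the patch): a convolution primitive built on these helpers calls init_scratchpad() once when its pd is created and execute_forward_conv_acl() on every run. Only init_scratchpad(), execute_forward_conv_acl(), conv_key_t, acl_obj_t and key_gemm_tmp_buffer come from the diff above; the slot-to-key map, the acl_op / post_ops / acp_ / dst_md_ / acl_obj_ names and the data_t typedef are illustrative assumptions.

    // Hypothetical slot -> scratchpad-key map. Slot ids must line up with the
    // MemoryRequirements the wrapped ACL operator reports via workspace().
    const std::map<int, conv_key_t> conv_keys
            = {{0, memory_tracking::names::key_gemm_tmp_buffer}};

    // In pd_t::init(): book the ACL workspace and, when a sum post op is
    // present, the extra accumulator buffer.
    auto scratchpad = scratchpad_registry().registrar();
    CHECK(init_scratchpad(acl_op, scratchpad, conv_keys, engine, post_ops,
            attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_));

    // In execute_forward(): hand the oneDNN buffers, pd and key map to the
    // shared helper, which imports memory, runs the operator and applies the
    // remaining post ops.
    return execute_forward_conv_acl<acl_obj_t, pd_t, data_t>(
            ctx, acl_obj_.get(), pd(), conv_keys);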