Skip to content

Commit 0482f4a

Browse files
committed
add fc acl executor
1 parent 69dea7c commit 0482f4a

File tree

12 files changed

+489
-128
lines changed

12 files changed

+489
-128
lines changed

src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp

+6-6
Original file line numberDiff line numberDiff line change
@@ -220,18 +220,18 @@ bool CpuBlockedMemoryDesc::isBlockedCFormat(size_t blk_size) const {
220220
}
221221

222222
bool CpuBlockedMemoryDesc::isTailCFormat() const {
223-
if (shape.getRank() < 3) {
224-
return false;
225-
}
223+
// if (shape.getRank() < 3) {
224+
// return false;
225+
// }
226226
if (shape.getRank() != order.size()) {
227227
return false;
228228
}
229229
if (!std::is_sorted(order.begin(), --order.end())) {
230230
return false;
231231
}
232-
if (order.back() != 1) {
233-
return false;
234-
}
232+
// if (order.back() != 1) {
233+
// return false;
234+
// }
235235
return true;
236236
}
237237

src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp

+19-102
Original file line numberDiff line numberDiff line change
@@ -361,66 +361,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
361361
return acl_op;
362362
};
363363
break;
364-
case Algorithm::EltwiseRelu:
365-
if (aclEltwiseAttrs.alpha == 0) {
366-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
367-
ActivationLayerInfo::ActivationFunction::RELU))
368-
return false;
369-
} else {
370-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
371-
{ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha}))
372-
return false;
373-
}
374-
exec_func = [this]() -> std::unique_ptr<IFunction> {
375-
auto acl_op = std::make_unique<NEActivationLayer>();
376-
if (aclEltwiseAttrs.alpha == 0) {
377-
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::RELU);
378-
} else {
379-
acl_op->configure(&srcTensors[0], &dstTensors[0],
380-
{ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha});
381-
}
382-
return acl_op;
383-
};
384-
break;
385-
case Algorithm::EltwiseGeluErf:
386-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::GELU))
387-
return false;
388-
exec_func = [this]() -> std::unique_ptr<IFunction> {
389-
auto acl_op = std::make_unique<NEActivationLayer>();
390-
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::GELU);
391-
return acl_op;
392-
};
393-
break;
394-
case Algorithm::EltwiseElu:
395-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
396-
{ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha}))
397-
return false;
398-
exec_func = [this]() -> std::unique_ptr<IFunction> {
399-
auto acl_op = std::make_unique<NEActivationLayer>();
400-
acl_op->configure(&srcTensors[0], &dstTensors[0], {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha});
401-
return acl_op;
402-
};
403-
break;
404-
case Algorithm::EltwiseTanh:
405-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
406-
{ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}))
407-
return false;
408-
exec_func = [this]() -> std::unique_ptr<IFunction> {
409-
auto acl_op = std::make_unique<NEActivationLayer>();
410-
acl_op->configure(&srcTensors[0], &dstTensors[0],
411-
{ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f});
412-
return acl_op;
413-
};
414-
break;
415-
case Algorithm::EltwiseSigmoid:
416-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::LOGISTIC))
417-
return false;
418-
exec_func = [this]() -> std::unique_ptr<IFunction> {
419-
auto acl_op = std::make_unique<NEActivationLayer>();
420-
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::LOGISTIC);
421-
return acl_op;
422-
};
423-
break;
424364
case Algorithm::EltwiseAbs:
425365
if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
426366
return false;
@@ -430,24 +370,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
430370
return acl_op;
431371
};
432372
break;
433-
case Algorithm::EltwiseSqrt:
434-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SQRT))
435-
return false;
436-
exec_func = [this]() -> std::unique_ptr<IFunction> {
437-
auto acl_op = std::make_unique<NEActivationLayer>();
438-
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SQRT);
439-
return acl_op;
440-
};
441-
break;
442-
case Algorithm::EltwiseSoftRelu:
443-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU))
444-
return false;
445-
exec_func = [this]() -> std::unique_ptr<IFunction> {
446-
auto acl_op = std::make_unique<NEActivationLayer>();
447-
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU);
448-
return acl_op;
449-
};
450-
break;
451373
case Algorithm::EltwiseExp:
452374
if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
453375
return false;
@@ -457,28 +379,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
457379
return acl_op;
458380
};
459381
break;
460-
case Algorithm::EltwiseClamp:
461-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
462-
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha}))
463-
return false;
464-
exec_func = [this]() -> std::unique_ptr<IFunction> {
465-
auto acl_op = std::make_unique<NEActivationLayer>();
466-
acl_op->configure(&srcTensors[0], &dstTensors[0],
467-
{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha});
468-
return acl_op;
469-
};
470-
break;
471-
case Algorithm::EltwiseSwish:
472-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
473-
{ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha}))
474-
return false;
475-
exec_func = [this]() -> std::unique_ptr<IFunction> {
476-
auto acl_op = std::make_unique<NEActivationLayer>();
477-
acl_op->configure(&srcTensors[0], &dstTensors[0],
478-
{ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha});
479-
return acl_op;
480-
};
481-
break;
482382
case Algorithm::EltwisePrelu:
483383
if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
484384
return false;
@@ -488,12 +388,29 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
488388
return acl_op;
489389
};
490390
break;
391+
case Algorithm::EltwiseRelu:
392+
case Algorithm::EltwiseGeluErf:
393+
case Algorithm::EltwiseElu:
394+
case Algorithm::EltwiseTanh:
395+
case Algorithm::EltwiseSigmoid:
396+
case Algorithm::EltwiseSqrt:
397+
case Algorithm::EltwiseSoftRelu:
398+
case Algorithm::EltwiseClamp:
399+
case Algorithm::EltwiseSwish:
491400
case Algorithm::EltwiseHswish:
492-
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH))
401+
if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
402+
getActivationLayerInfo(aclEltwiseAttrs.algorithm,
403+
aclEltwiseAttrs.alpha,
404+
aclEltwiseAttrs.beta,
405+
aclEltwiseAttrs.gamma)))
493406
return false;
494407
exec_func = [this]() -> std::unique_ptr<IFunction> {
495408
auto acl_op = std::make_unique<NEActivationLayer>();
496-
acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH);
409+
acl_op->configure(&srcTensors[0], &dstTensors[0],
410+
getActivationLayerInfo(aclEltwiseAttrs.algorithm,
411+
aclEltwiseAttrs.alpha,
412+
aclEltwiseAttrs.beta,
413+
aclEltwiseAttrs.gamma));
497414
return acl_op;
498415
};
499416
break;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// Copyright (C) 2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "acl_executor.hpp"
6+
#include "acl_utils.hpp"
7+
#include "nodes/executors/executor.hpp"
8+
#include "nodes/executors/memory_arguments.hpp"
9+
#include "utils/debug_capabilities.h"
10+
11+
namespace ov {
12+
namespace intel_cpu {
13+
14+
bool ACLCommonExecutor::update(const MemoryArgs &memory) {
15+
std::unordered_map<int, arm_compute::DataType> acl_tensors_types_list;
16+
std::unordered_map<int, arm_compute::DataLayout> acl_tensors_layouts_list;
17+
for (auto& cpu_mem_ptr : memory) {
18+
acl_tensors_types_list[cpu_mem_ptr.first] = precisionToAclDataType(cpu_mem_ptr.second->getPrecision());
19+
acl_tensors_layouts_list[cpu_mem_ptr.first] = getAclDataLayoutByMemoryDesc(cpu_mem_ptr.second->getDescPtr());
20+
}
21+
22+
for (auto& cpu_mem_ptr : memory) {
23+
if (acl_tensors_types_list[cpu_mem_ptr.first] == arm_compute::DataType::UNKNOWN) {
24+
list_acl_tensors_infos[cpu_mem_ptr.first] = arm_compute::TensorInfo();
25+
continue;
26+
}
27+
28+
auto collapsed_dims = collapse_dims_to_max_rank(cpu_mem_ptr.second->getStaticDims(),
29+
aclTensorAttrs.maxDimsShape);
30+
auto acl_tensor_shape = shapeCast(collapsed_dims);
31+
if (aclTensorAttrs.enableNHWCReshape) {
32+
changeLayoutToNH_C({&acl_tensor_shape});
33+
}
34+
list_acl_tensors_infos[cpu_mem_ptr.first] = arm_compute::TensorInfo(acl_tensor_shape, 1,
35+
acl_tensors_types_list[cpu_mem_ptr.first],
36+
acl_tensors_layouts_list[cpu_mem_ptr.first]);
37+
}
38+
39+
auto status = prepare_tensors_info();
40+
if (!status) {
41+
DEBUG_LOG("ACL operator validation was failed: ", status.error_description());
42+
return false;
43+
}
44+
45+
for (auto& acl_tensor_info : list_acl_tensors_infos) {
46+
list_acl_tensors[acl_tensor_info.first].allocator()->init(acl_tensor_info.second);
47+
}
48+
49+
configureThreadSafe([&] { ifunc = configure_function();});
50+
return true;
51+
}
52+
53+
void ACLCommonExecutor::execute(const MemoryArgs &memory) {
54+
for (auto& acl_tensor : list_acl_tensors) {
55+
acl_tensor.second.allocator()->import_memory(memory.at(acl_tensor.first)->getData());
56+
}
57+
ifunc->run();
58+
for (auto& acl_tensor : list_acl_tensors) {
59+
acl_tensor.second.allocator()->free();
60+
}
61+
}
62+
63+
} // namespace intel_cpu
64+
} // namespace ov
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Copyright (C) 2018-2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#pragma once
6+
7+
#include "cpu_memory.h"
8+
#include "nodes/executors/executor.hpp"
9+
#include "arm_compute/runtime/NEON/NEFunctions.h"
10+
11+
namespace ov {
12+
namespace intel_cpu {
13+
14+
// Attributes that control how CPU memory descriptors are translated into
// ACL (Arm Compute Library) tensor infos by ACLCommonExecutor.
struct ACLTensorAttrs {
    // When true, tensor shapes are permuted towards ACL's NHWC ordering
    // before the TensorInfo is built.
    bool enableNHWCReshape = false;
    // Upper bound on tensor rank; higher-rank shapes are collapsed down.
    size_t maxDimsShape = arm_compute::MAX_DIMS;
};
18+
19+
// Common base for ACL (Arm Compute Library) backed executors.
// The base class drives the generic flow (build TensorInfos from CPU memory,
// validate, configure, import buffers, run); concrete executors supply the
// operator-specific pieces via the two pure-virtual hooks below.
class ACLCommonExecutor : public Executor {
public:
    // Adjust the per-argument TensorInfos (shape fix-ups etc.) and run the
    // ACL validation for the concrete operator. A non-OK status aborts
    // update() and the executor reports failure.
    virtual arm_compute::Status prepare_tensors_info() = 0;
    // Create and configure the concrete ACL function object.
    // Invoked from update() through configureThreadSafe.
    virtual std::unique_ptr<arm_compute::IFunction> configure_function() = 0;

protected:
    // Configured ACL function: set by update(), run by execute().
    std::unique_ptr<arm_compute::IFunction> ifunc = nullptr;
    // ACL runtime tensors and their infos, keyed by memory argument id
    // (ARG_SRC, ARG_WEI, ...). Tensors wrap caller memory via import_memory;
    // they never own the buffers.
    std::unordered_map<int, arm_compute::Tensor> list_acl_tensors;
    std::unordered_map<int, arm_compute::TensorInfo> list_acl_tensors_infos;
    ACLTensorAttrs aclTensorAttrs;

private:
    // Executor interface; kept private so callers go through the base class.
    void execute(const MemoryArgs& memory) override;
    bool update(const MemoryArgs& memory) override;
};
34+
35+
using ACLCommonExecutorPtr = std::shared_ptr<ACLCommonExecutor>;
36+
37+
} // namespace intel_cpu
38+
} // namespace ov
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
// Copyright (C) 2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "acl_fullyconnected.hpp"
6+
#include "acl_utils.hpp"
7+
#include "nodes/executors/executor.hpp"
8+
#include "nodes/executors/memory_arguments.hpp"
9+
#include "utils/debug_capabilities.h"
10+
11+
namespace ov {
12+
namespace intel_cpu {
13+
14+
// Build the ACL FullyConnected configuration from node attributes, fused
// post-ops and the argument memory. `context` is part of the common executor
// constructor signature and is not used by the ACL path.
ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const PostOps &postOps,
                                                     const MemoryArgs &memory,
                                                     const ExecutorContext::CPtr context) : withBias(attrs.withBias) {
    // Reshape to NHWC only when the source memory is already channels-last.
    aclTensorAttrs.enableNHWCReshape = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
    fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr());
    fullyConnectedLayerInfo.transpose_weights = !attrs.weightsNonTransposed;
    // f16 input: let ACL accumulate in mixed (higher) precision.
    if (memory.at(ARG_SRC)->getPrecision() == ov::element::f16) {
        fullyConnectedLayerInfo.fp_mixed_precision = true;
    }

    // Fuse the single supported post-op as an ACL activation.
    // size() == 1 already implies non-empty, so no separate empty() check.
    if (postOps.size() == 1) {
        if (const auto activation = std::dynamic_pointer_cast<ActivationPostOp>(postOps[0])) {
            fullyConnectedLayerInfo.activation_info = getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()),
                                                                             activation->alpha(),
                                                                             activation->beta(),
                                                                             activation->gamma());
        }
    }
}
34+
35+
bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
36+
if (!config.postOps.empty() && config.postOps.size() != 1) {
37+
DEBUG_LOG("ACLFullyConnectedExecutor supports only 1 post op");
38+
return false;
39+
}
40+
41+
const auto& srcDesc = config.descs.at(ARG_SRC);
42+
if (!one_of(srcDesc->getShape().getDims().size(), 2, 3, 4)) {
43+
DEBUG_LOG("ACLFullyConnectedExecutor supports only 2, 3 or 4 dimensions for inputs");
44+
return false;
45+
}
46+
47+
const auto& weiDesc = config.descs.at(ARG_WEI);
48+
if (!one_of(weiDesc->getShape().getDims().size(), 2, 3, 4)) {
49+
DEBUG_LOG("ACLFullyConnectedExecutor supports only 2, 3 or 4 dimensions for weights");
50+
return false;
51+
}
52+
return true;
53+
}
54+
55+
// Collapse 3D src/wei/dst shapes to the 2D GEMM view ACL expects, probe for
// an optimized (fixed-weight-format) implementation, and run ACL validation.
// Returns the ACL status; any failure aborts ACLCommonExecutor::update().
arm_compute::Status ACLFullyConnectedExecutor::prepare_tensors_info() {
    // 3D weights: fold the two outer dims into one -> {ic, oc0*oc1}.
    auto wei_shape = list_acl_tensors_infos.at(ARG_WEI).tensor_shape();
    if (wei_shape.num_dimensions() == 3) {
        list_acl_tensors_infos.at(ARG_WEI).set_tensor_shape({wei_shape[0], wei_shape[1] * wei_shape[2]});
        wei_shape = list_acl_tensors_infos.at(ARG_WEI).tensor_shape();
    }

    // 3D source: reshape to {ic, total/ic} using the weights' inner dim as
    // the reduction axis. Assumes total_size() is divisible by wei_shape[0]
    // — TODO(review): confirm for all supported layouts.
    auto src_shape = list_acl_tensors_infos.at(ARG_SRC).tensor_shape();
    if (src_shape.num_dimensions() == 3) {
        list_acl_tensors_infos.at(ARG_SRC).set_tensor_shape({wei_shape[0], src_shape.total_size() / wei_shape[0]});
        src_shape = list_acl_tensors_infos.at(ARG_SRC).tensor_shape();
    }

    // 3D destination: {oc, batch} to match the collapsed src/wei views.
    if (list_acl_tensors_infos.at(ARG_DST).tensor_shape().num_dimensions() == 3) {
        list_acl_tensors_infos.at(ARG_DST).set_tensor_shape({wei_shape[1], src_shape[1]});
    }

    // Ask ACL whether an optimized kernel exists; ANY lets it pick a format.
    // NOTE(review): weightsInfo captures WeightFormat::ANY here, while
    // has_opt_impl updates only the local expected_weight_format — verify that
    // validate()/configure() are intended to see ANY rather than the chosen
    // format.
    auto expected_weight_format = arm_compute::WeightFormat::ANY;
    weightsInfo = arm_compute::WeightsInfo(false, 1, 1,
                                           list_acl_tensors_infos.at(ARG_WEI).tensor_shape().total_size(),
                                           false, expected_weight_format);

    auto opt_impl_status = arm_compute::NEFullyConnectedLayer::has_opt_impl(
            expected_weight_format,
            &list_acl_tensors_infos.at(ARG_SRC),
            &list_acl_tensors_infos.at(ARG_WEI),
            withBias ? &list_acl_tensors_infos.at(ARG_BIAS) : nullptr,
            &list_acl_tensors_infos.at(ARG_DST),
            fullyConnectedLayerInfo,
            weightsInfo);
    if (!opt_impl_status) { return opt_impl_status; }
    // Fast math is only enabled when ACL selected a fixed-format fast-math
    // weight layout.
    fullyConnectedLayerInfo.enable_fast_math = arm_compute::is_fixed_format_fast_math(expected_weight_format);

    // Weights arrive pre-transposed: swap the two dims of the info so the
    // shape ACL validates against matches the actual memory.
    if (!fullyConnectedLayerInfo.transpose_weights) {
        arm_compute::TensorShape temp_weights_shape = list_acl_tensors_infos.at(ARG_WEI).tensor_shape();
        std::swap(temp_weights_shape[0], temp_weights_shape[1]);
        list_acl_tensors_infos.at(ARG_WEI).set_tensor_shape(temp_weights_shape);
    }

    return arm_compute::NEFullyConnectedLayer::validate(&list_acl_tensors_infos.at(ARG_SRC),
                                                        &list_acl_tensors_infos.at(ARG_WEI),
                                                        withBias ? &list_acl_tensors_infos.at(ARG_BIAS) : nullptr,
                                                        &list_acl_tensors_infos.at(ARG_DST),
                                                        fullyConnectedLayerInfo,
                                                        weightsInfo);
}
101+
102+
// Create and configure the ACL FullyConnected function against the runtime
// tensors prepared by the base class. Called under configureThreadSafe.
std::unique_ptr<arm_compute::IFunction> ACLFullyConnectedExecutor::configure_function() {
    // std:: qualification is required: unqualified make_unique has no ADL
    // candidate here (the call has no arguments from namespace std), so the
    // unqualified form does not resolve.
    auto fc_func = std::make_unique<arm_compute::NEFullyConnectedLayer>();
    fc_func->configure(&list_acl_tensors.at(ARG_SRC),
                       &list_acl_tensors.at(ARG_WEI),
                       withBias ? &list_acl_tensors.at(ARG_BIAS) : nullptr,
                       &list_acl_tensors.at(ARG_DST),
                       fullyConnectedLayerInfo,
                       weightsInfo);
    return fc_func;
}
112+
113+
} // namespace intel_cpu
114+
} // namespace ov

0 commit comments

Comments
 (0)