-
Notifications
You must be signed in to change notification settings - Fork 2.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ARM CPU] Add ACL FC executor for FP32/FP16 precision #24123
Merged
dmitry-gorokhov
merged 6 commits into
openvinotoolkit:master
from
allnes:an/fc_acl_executor
Aug 13, 2024
+969
−127
Merged
Changes from 1 commit
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
972a62f
Add FullyConnected ACL executor
allnes 202be30
Merge branch 'master' into an/fc_acl_executor
allnes 3bfea24
Merge branch 'master' into an/fc_acl_executor
allnes 31482c3
Merge branch 'master' into an/fc_acl_executor
allnes 12b633e
Merge branch 'master' into an/fc_acl_executor
allnes db3c971
Merge branch 'master' into an/fc_acl_executor
allnes File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next
Next commit
Loading status checks…
Add FullyConnected ACL executor
commit 972a62fe106b379aa35055fac20448b54f461e6a
There are no files selected for viewing
134 changes: 134 additions & 0 deletions
134
src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
// Copyright (C) 2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "acl_common_executor.hpp" | ||
#include "acl_utils.hpp" | ||
#include "nodes/executors/memory_arguments.hpp" | ||
#include "utils/debug_capabilities.h" | ||
|
||
namespace ov { | ||
namespace intel_cpu { | ||
|
||
// Maps the plugin's generic executor argument ids (ARG_*) to the ACL-specific
// positional indices (ACL_*) used to address the fixed-size per-argument arrays
// in this file. Arguments outside these six are not supported by this executor.
static const std::unordered_map<int, ACLArgs> argConvert = {
    {ARG_SRC_0, ACL_SRC_0},
    {ARG_SRC_1, ACL_SRC_1},
    {ARG_SRC_2, ACL_SRC_2},
    {ARG_BIAS,  ACL_BIAS},
    {ARG_WEI,   ACL_WEI},
    {ARG_DST,   ACL_DST},
};
|
||
// Per-argument data-type and data-layout tables, indexed by ACLArgs.
using ACLTypes   = std::array<arm_compute::DataType, ACLArgs::COUNT_OF_ARGS>;
using ACLLayouts = std::array<arm_compute::DataLayout, ACLArgs::COUNT_OF_ARGS>;
|
||
static void initACLTensorParams(const MemoryPtr& memoryPtr, | ||
const ACLTensorAttrs& attrs, | ||
arm_compute::TensorShape& tensorShape, | ||
arm_compute::DataType& dataType, | ||
arm_compute::DataLayout& dataLayout) { | ||
dataType = precisionToAclDataType(memoryPtr->getPrecision()); | ||
dataLayout = getAclDataLayoutByMemoryDesc(memoryPtr->getDescPtr()); | ||
if (dataType != arm_compute::DataType::UNKNOWN) { | ||
auto collapsed_dims = collapse_dims_to_max_rank(memoryPtr->getStaticDims(), attrs.maxDimsShape); | ||
tensorShape = shapeCast(collapsed_dims); | ||
if (attrs.hasLayoutTypeNHWC) { | ||
changeLayoutToNH_C({&tensorShape}); | ||
} | ||
} | ||
} | ||
|
||
// Builds an arm_compute::TensorInfo for one argument, or returns nullptr when
// the data type is UNKNOWN (i.e. the argument is absent for this operation).
static std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
                                                               const arm_compute::DataType& dataType,
                                                               const arm_compute::DataLayout& dataLayout) {
    if (dataType == arm_compute::DataType::UNKNOWN) {
        return nullptr;
    }
    return std::make_shared<arm_compute::TensorInfo>(tensorShape, 1, dataType, dataLayout);
}
|
||
// Creates an arm_compute::Tensor bound to the given TensorInfo, or propagates
// nullptr for absent arguments. init() attaches only metadata here; the actual
// memory is imported later, right before execution.
static std::shared_ptr<arm_compute::Tensor> initTensor(const std::shared_ptr<arm_compute::TensorInfo>& aclMemoryInfo) {
    if (!aclMemoryInfo) {
        return nullptr;
    }
    auto tensor = std::make_shared<arm_compute::Tensor>();
    tensor->allocator()->init(*aclMemoryInfo);
    return tensor;
}
|
||
// Until update() creates the tensors, no argument has imported memory, so all
// usage indicators start cleared.
ACLCommonExecutor::ACLCommonExecutor() {
    aclTensorAttrs.memoryUsageIndicator.fill(false);
}
|
||
// Rebuilds all ACL metadata for the current shapes: derives per-argument
// shape/type/layout, lets the derived class adjust the shapes, creates
// TensorInfo and Tensor objects, validates them against the concrete ACL
// function and finally configures that function.
// Returns false when ACL rejects the tensor configuration.
bool ACLCommonExecutor::update(const MemoryArgs &memory) {
    // Initialize ACL tensors params
    ACLShapes aclMemoryShapes;
    ACLTypes aclDataType{};
    ACLLayouts aclDataLayout{};
    for (auto& cpu_mem_ptr : memory) {
        // NOTE(review): argConvert.at() throws std::out_of_range for any argument
        // id outside the six mapped ones — assumes callers never pass other ids;
        // confirm against the FC executor's MemoryArgs contents.
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs,
                            aclMemoryShapes[index],
                            aclDataType[index],
                            aclDataLayout[index]);
    }

    // Update ACL tensors shapes (operation-specific hook, e.g. collapsing /
    // transposing for FullyConnected)
    updateTensorsShapes(aclMemoryShapes);

    // Initialize arm_compute::TensorInfo objects; entries stay nullptr for
    // absent arguments (UNKNOWN data type)
    ACLInfos aclMemoryInfos;
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        aclMemoryInfos[i] = initTensorInfo(aclMemoryShapes[i], aclDataType[i], aclDataLayout[i]);
    }

    // Validate arm_compute::TensorInfo objects for specific ACL function
    auto tensorsInfoValidateStatus = validateTensorsInfo(aclMemoryInfos);
    if (!tensorsInfoValidateStatus) {
        DEBUG_LOG("ACL operator validation failed: ", tensorsInfoValidateStatus.error_description());
        return false;
    }

    // Initialize arm_compute::Tensor objects
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        aclMemoryTensors[i] = initTensor(aclMemoryInfos[i]);
        // Indicate that arm_compute::Tensor object can use import_memory function
        if (aclMemoryTensors[i]) {
            aclTensorAttrs.memoryUsageIndicator[i] = true;
        }
    }

    // Configure arm_compute::IFunction object
    // (configureThreadSafe: ACL configuration is not thread-safe, so it is
    // serialized by the helper)
    configureThreadSafe([&] {
        iFunction = configureFunction(aclMemoryTensors);
    });
    return true;
}
|
||
void ACLCommonExecutor::execute(const MemoryArgs &memory) { | ||
// TODO: Move import_memory() to update() function - CVS-145871 | ||
for (auto& cpu_mem_ptr : memory) { | ||
const ACLArgs index = argConvert.at(cpu_mem_ptr.first); | ||
if (aclTensorAttrs.memoryUsageIndicator[index]) { | ||
aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData()); | ||
} | ||
} | ||
iFunction->run(); | ||
} | ||
|
||
// Releases allocator state only for the tensors that actually imported
// external memory during execute().
ACLCommonExecutor::~ACLCommonExecutor() {
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        if (!aclTensorAttrs.memoryUsageIndicator[i]) {
            continue;
        }
        aclMemoryTensors[i]->allocator()->free();
    }
}
|
||
} // namespace intel_cpu | ||
} // namespace ov |
58 changes: 58 additions & 0 deletions
58
src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
// Copyright (C) 2018-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#pragma once | ||
|
||
#include "cpu_memory.h" | ||
#include "nodes/executors/executor.hpp" | ||
#include "arm_compute/runtime/NEON/NEFunctions.h" | ||
|
||
namespace ov { | ||
namespace intel_cpu { | ||
|
||
// Positional indices of the operation arguments inside the fixed-size
// per-argument containers below (ACLShapes / ACLInfos / ACLTensors).
enum ACLArgs {
    ACL_SRC_0,
    ACL_SRC_1,
    ACL_SRC_2,
    ACL_BIAS,
    ACL_WEI,
    ACL_DST,
    COUNT_OF_ARGS  // number of entries; used as the array extent — keep last
};
|
||
// Owning handle for a configured ACL operation.
using ACLFunction = std::unique_ptr<arm_compute::IFunction>;
// Per-argument containers, all indexed by ACLArgs; entries for absent
// arguments hold default/empty values (or nullptr for the pointer arrays).
using ACLShapes = std::array<arm_compute::TensorShape, ACLArgs::COUNT_OF_ARGS>;
using ACLInfos = std::array<std::shared_ptr<arm_compute::TensorInfo>, ACLArgs::COUNT_OF_ARGS>;
using ACLTensors = std::array<std::shared_ptr<arm_compute::Tensor>, ACLArgs::COUNT_OF_ARGS>;
|
||
struct ACLTensorAttrs { | ||
bool hasLayoutTypeNHWC = false; | ||
size_t maxDimsShape = arm_compute::MAX_DIMS; | ||
std::array<bool, ACLArgs::COUNT_OF_ARGS> memoryUsageIndicator; | ||
}; | ||
|
||
// Base class for ACL-backed executors: owns the arm_compute tensors and the
// configured IFunction, while derived classes supply the operation-specific
// shape handling, validation and function configuration.
class ACLCommonExecutor : public Executor {
public:
    ACLCommonExecutor();
    // Lets the derived executor adjust the per-argument shapes before
    // TensorInfo objects are created (called from update()).
    virtual void updateTensorsShapes(ACLShapes& aclMemoryShapes) = 0;
    // Runs the concrete ACL function's ::validate() against the tensor infos.
    virtual arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) = 0;
    // Creates and configures the concrete arm_compute::IFunction.
    virtual ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) = 0;
    impl_desc_type implType() const override {
        return impl_desc_type::acl;
    }
    void execute(const MemoryArgs& memory) override;
    bool update(const MemoryArgs& memory) override;
    // NOTE(review): instances are deleted through base-class pointers
    // (ACLCommonExecutorPtr / Executor) — safe only if Executor declares a
    // virtual destructor; confirm, otherwise mark this destructor virtual.
    ~ACLCommonExecutor();

protected:
    // Shared tensor attributes; derived classes may tweak before update().
    ACLTensorAttrs aclTensorAttrs;
private:
    ACLTensors aclMemoryTensors;
    ACLFunction iFunction = nullptr;
};
|
||
using ACLCommonExecutorPtr = std::shared_ptr<ACLCommonExecutor>; | ||
|
||
} // namespace intel_cpu | ||
} // namespace ov |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I propose to leave a todo regarding the fact, that actually it should be enough to import_memory just once in scope of "update()" method, but it is not working for some reason and should be investigated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@EgorDuplensky added