
Commit 5ce3c96

[CPU] Add dumping of the memory statistics (#28441)
### Details:

Add yet another debug capability: dumping the following memory statistics:

1. Memory statistics for specific memory managers: type of the manager, number of memory regions, number of unique memory blocks, total memory size, theoretically optimal total memory size, and the size of the largest memory region
2. The size of memory allocated for scratchpads
3. Weight cache statistics per socket: total size and the number of memory objects

Standard output and `*.csv` file dump are supported.

### ToDo:

- [x] Add corresponding documentation

### Tickets:

- *ticket-id*
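The statistics listed above group naturally into one record per memory manager plus a couple of model-wide totals. The sketch below only illustrates that grouping; the type and field names are hypothetical and do not mirror the plugin's internal data structures.

```cpp
// Illustrative grouping of the dumped statistics; names are hypothetical.
#include <cstddef>
#include <string>
#include <vector>

struct MemoryManagerStats {
    std::string managerType;      // type of the memory manager
    size_t regionCount = 0;       // number of memory regions
    size_t uniqueBlockCount = 0;  // number of unique memory blocks
    size_t totalSize = 0;         // total memory size, bytes
    size_t optimalTotalSize = 0;  // theoretically optimal total size, bytes
    size_t maxRegionSize = 0;     // size of the largest memory region, bytes
};

struct CompiledModelMemoryStats {
    std::vector<MemoryManagerStats> managers;  // one entry per memory manager
    size_t scratchpadsSize = 0;                // memory allocated for scratchpads
    size_t weightCacheSize = 0;                // weight cache total size (per socket)
    size_t weightCacheObjects = 0;             // number of cached weight objects
};
```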
1 parent f74f914 commit 5ce3c96

18 files changed: +532 -48 lines

src/plugins/intel_cpu/docs/debug_capabilities/README.md  (+5)

```diff
@@ -26,3 +26,8 @@ Use the following cmake option to enable debug capabilities:
   Internal performance counter will be enabled automatically.
 * [Average counters](average_counters.md)
   `OV_CPU_AVERAGE_COUNTERS=filename`
+* Memory statistics
+  `OV_CPU_MEMORY_STATISTICS_PATH=cout`
+  Set this environment variable to dump memory usage statistics to the standard output when the compiled model is destructed.
+  `OV_CPU_MEMORY_STATISTICS_PATH=<file_path>.csv`
+  Set this environment variable to dump memory usage statistics to *.csv files. The `file_path` will be enhanced with the name of each compiled model: `file_path_<model_name>.csv`.
```
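For illustration, a minimal sketch of how an option like this is typically resolved at dump time is shown below. The helper name, the parsing, and the exact CSV naming are assumptions made for the example, not the plugin's actual code.

```cpp
// Sketch only: resolves OV_CPU_MEMORY_STATISTICS_PATH the way the README
// describes ("cout" -> standard output, otherwise a per-model *.csv file).
// The helper name and the CSV naming details are illustrative assumptions.
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>

void dump_stats_example(const std::string& model_name, const std::string& csv_body) {
    const char* path = std::getenv("OV_CPU_MEMORY_STATISTICS_PATH");
    if (path == nullptr) {
        return;  // the capability is disabled
    }
    if (std::string(path) == "cout") {
        std::cout << csv_body;  // dump to standard output
        return;
    }
    // "<file_path>.csv" becomes "<file_path>_<model_name>.csv" per compiled model
    std::string file_name(path);
    const auto ext = file_name.rfind(".csv");
    if (ext != std::string::npos) {
        file_name.insert(ext, "_" + model_name);
    }
    std::ofstream(file_name) << csv_body;
}
```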

src/plugins/intel_cpu/src/compiled_model.cpp  (+10 -2)

```diff
@@ -24,8 +24,8 @@
 #include "openvino/runtime/threading/cpu_streams_info.hpp"
 #include "openvino/runtime/threading/executor_manager.hpp"
 #include "openvino/util/common_util.hpp"
-#include "transformations/transformation_pipeline.h"
-#include "transformations/utils/utils.hpp"
+#include "utils/debug_capabilities.h"
+#include "utils/memory_stats_dump.hpp"
 #include "utils/serialize.hpp"

 #if defined(OV_CPU_WITH_ACL)
@@ -44,6 +44,14 @@ struct ImmediateSerialExecutor : public ov::threading::ITaskExecutor {
     std::mutex _mutex;
 };

+CompiledModel::~CompiledModel() {
+    if (m_has_sub_compiled_models) {
+        m_sub_compiled_models.clear();
+        m_sub_memory_manager->_memorys_table.clear();
+    }
+    CPU_DEBUG_CAP_ENABLE(dumpMemoryStats(m_cfg.debugCaps, m_name, m_graphs, m_socketWeights));
+}
+
 CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
                              const std::shared_ptr<const ov::IPlugin>& plugin,
                              Config cfg,
```
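The dump call in the destructor is wrapped in `CPU_DEBUG_CAP_ENABLE`, so it is compiled in only for builds with CPU debug capabilities enabled. Below is a sketch of how such a guard macro commonly works; the expansion is an assumption based on the usual pattern, not a quote of `utils/debug_capabilities.h`.

```cpp
// Assumed compile-time guard pattern, for illustration only:
// the wrapped statement is kept when CPU_DEBUG_CAPS is defined and
// disappears entirely from release builds.
#ifdef CPU_DEBUG_CAPS
#    define CPU_DEBUG_CAP_ENABLE(_x) _x
#else
#    define CPU_DEBUG_CAP_ENABLE(_x)
#endif
```

With the guard compiled out, the destructor reduces to the sub-model cleanup and adds no overhead to release builds.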

src/plugins/intel_cpu/src/compiled_model.h  (+3 -10)

```diff
@@ -21,6 +21,8 @@ namespace intel_cpu {

 class CompiledModel : public ov::ICompiledModel {
 public:
+    typedef std::shared_ptr<CompiledModel> Ptr;
+
     struct GraphGuard : public Graph {
         std::mutex _mutex;
         struct Lock : public std::unique_lock<std::mutex> {
@@ -30,22 +32,13 @@ class CompiledModel : public ov::ICompiledModel {
     };

 public:
-    typedef std::shared_ptr<CompiledModel> Ptr;
-
     CompiledModel(const std::shared_ptr<ov::Model>& model,
                   const std::shared_ptr<const ov::IPlugin>& plugin,
                   Config cfg,
                   const bool loaded_from_cache,
                   std::shared_ptr<SubMemoryManager> sub_memory_manager = nullptr);

-    ~CompiledModel() {
-        if (m_has_sub_compiled_models) {
-            m_sub_compiled_models.clear();
-            m_sub_memory_manager->_memorys_table.clear();
-        }
-        auto streamsExecutor = std::dynamic_pointer_cast<ov::threading::IStreamsExecutor>(m_task_executor);
-        streamsExecutor->cpu_reset();
-    }
+    ~CompiledModel();

     std::shared_ptr<ov::IAsyncInferRequest> create_infer_request() const override;
```

src/plugins/intel_cpu/src/cpu_memory.cpp  (+4)

```diff
@@ -268,6 +268,10 @@ void MemoryBlockWithReuse::free() {
     m_useExternalStorage = false;
 }

+size_t MemoryBlockWithReuse::size() const {
+    return m_memUpperBound;
+}
+
 void MemoryBlockWithReuse::release(void* ptr) {}

 void MemoryBlockWithReuse::destroy(void* ptr) {
```

src/plugins/intel_cpu/src/cpu_memory.h  (+1)

```diff
@@ -80,6 +80,7 @@ class MemoryBlockWithReuse : public IMemoryBlock {
     bool resize(size_t size) override;
     bool hasExtBuffer() const noexcept override;
     void free();
+    size_t size() const;  // in bytes

 private:
     bool m_useExternalStorage = false;
```
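`MemoryBlockWithReuse::size()` reports `m_memUpperBound`, i.e. the block's high-water mark rather than the size of the currently requested tensor. A minimal sketch of that bookkeeping is given below; it assumes the upper bound is bumped on every grow request, which is the usual pattern for reusable blocks, and it is not the plugin's actual implementation.

```cpp
// Hypothetical high-water-mark bookkeeping for a reusable memory block.
#include <cstddef>

class ReusableBlockSketch {
public:
    bool resize(size_t size) {
        if (size > m_memUpperBound) {
            m_memUpperBound = size;  // remember the largest size ever requested
            // the underlying buffer would be (re)allocated here
            return true;             // the data pointer has changed
        }
        return false;                // the existing buffer is reused
    }

    size_t size() const {
        return m_memUpperBound;  // in bytes, independent of the current request
    }

private:
    size_t m_memUpperBound = 0;
};
```

This is what makes the statistic useful: it reflects the peak memory the block ever had to provide, not whatever happened to be requested last.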

src/plugins/intel_cpu/src/dnnl_scratch_pad.h  (+10 -1)

```diff
@@ -15,16 +15,25 @@ namespace intel_cpu {

 class DnnlScratchPad {
     MemoryBlockPtr blockPtr;
+    MemoryBlockWithReuse* baseBlockPtr = nullptr;
     dnnl::engine eng;

 public:
     DnnlScratchPad(dnnl::engine eng, int numa_node = -1) : eng(std::move(eng)) {
-        blockPtr = std::make_shared<DnnlMemoryBlock>(make_unique<MemoryBlockWithReuse>(numa_node));
+        auto baseMemoryBlock = make_unique<MemoryBlockWithReuse>(numa_node);
+        baseBlockPtr = baseMemoryBlock.get();
+        blockPtr = std::make_shared<DnnlMemoryBlock>(std::move(baseMemoryBlock));
     }

     MemoryPtr createScratchPadMem(const MemoryDescPtr& md) {
         return std::make_shared<Memory>(eng, md, blockPtr);
     }
+
+    size_t size() const {
+        if (baseBlockPtr)
+            return baseBlockPtr->size();
+        return 0;
+    }
 };

 using DnnlScratchPadPtr = std::shared_ptr<DnnlScratchPad>;
```
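The scratchpad now stores a non-owning `baseBlockPtr` to the concrete `MemoryBlockWithReuse` alongside the type-erased `blockPtr`; this lets `size()` report the underlying upper bound without widening the generic memory-block interface, and the raw pointer stays valid because `blockPtr` owns the block for the scratchpad's whole lifetime. The helper below is a hypothetical illustration of how the new accessor could feed the statistics dump; it is not part of the commit.

```cpp
// Hypothetical aggregation over scratchpads using the new size() accessor.
// DnnlScratchPadPtr comes from dnnl_scratch_pad.h; the helper is illustrative.
#include <cstddef>
#include <vector>

size_t total_scratchpad_bytes(const std::vector<DnnlScratchPadPtr>& scratchPads) {
    size_t total = 0;
    for (const auto& pad : scratchPads) {
        if (pad) {
            total += pad->size();  // upper bound of the scratchpad's base block
        }
    }
    return total;
}
```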

src/plugins/intel_cpu/src/graph.cpp  (+7 -7)

```diff
@@ -253,7 +253,7 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model>& model,
     }

     // update output precisions of producers to avoid extra reorders
-    // do this only in case output configration is not provided explicitly
+    // do this only in case output configuration is not provided explicitly
     if (outputConfigs.empty()) {
         for (auto& output : outputNodesMap) {
             const auto& outputNode = output.second;
@@ -1669,7 +1669,7 @@ void Graph::Infer(SyncInferRequest* request) {
 void Graph::SortTopologically() {
     OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::SortTopologically");

-    // Set execIndex of all nodes to default invaild value
+    // Set execIndex of all nodes to default invalid value
     for (auto& node : graphNodes) {
         node->execIndex = -1;
     }
@@ -2004,8 +2004,8 @@ void Graph::EnforceInferencePrecision() {
         if (one_of(parent->getType(),
                    Type::Convolution,     // conv nets
                    Type::FullyConnected,  // conv / bert nets
-                   Type::RNNCell,         // recurent nets
-                   Type::RNNSeq,          // recurent nets
+                   Type::RNNCell,         // recurrent nets
+                   Type::RNNSeq,          // recurrent nets
                    Type::MatMul,          // bert nets
                    Type::ROIPooling,      // object detection nets
                    Type::Interpolate,     // super resolution nets
@@ -2038,7 +2038,7 @@ void Graph::EnforceInferencePrecision() {

     /* Skip low-precision float point enforcement for tail of the graph by forming set of nodes to skip.
      * Necessary to maintain accuracy.
-     * Experiments show zero peformance impact on average */
+     * Experiments show zero performance impact on average */
     std::unordered_set<NodePtr> nodesToSkip;
     // starting from output nodes
     for (const auto& entry : outputNodesMap) {
@@ -2093,7 +2093,7 @@ void Graph::EnforceInferencePrecision() {
             return true;
         }

-        // exclude Convert after Range since it may cause precision loss when integter type to LP.
+        // exclude Convert after Range since it may cause precision loss when integer type to LP.
         if (parent->getType() == Type::Range && node->getType() == Type::Convert) {
             return true;
         }
@@ -2124,7 +2124,7 @@ void Graph::EnforceInferencePrecision() {
             continue;
         }

-        // exclude Convert before Range since it may cause precision loss when integter type to LP.
+        // exclude Convert before Range since it may cause precision loss when integer type to LP.
         // TODO: Incorrect subgraph is generated by ONNX FE + ticket 117861.
         const auto& child = node->getChildEdgeAt(i)->getChild();
         if (child->getType() == Type::Range && node->getType() == Type::Convert) {
```

src/plugins/intel_cpu/src/graph_context.cpp  (+1 -1)

```diff
@@ -25,7 +25,7 @@ GraphContext::GraphContext(Config config,

       m_memoryStatesRegister(std::make_shared<node::MemoryStatesRegister>()),
       m_auxiliaryNetworkMemoryControl(std::make_shared<NetworkMemoryControl>()),
-      m_memoryControl(m_auxiliaryNetworkMemoryControl->createMemoryControlUnit()) {
+      m_memoryControl(m_auxiliaryNetworkMemoryControl->createMemoryControlUnit("main")) {
     if (m_streamExecutor) {
         m_cpuStreamExecutor = std::dynamic_pointer_cast<ov::threading::CPUStreamsExecutor>(m_streamExecutor);
         m_numaNodeId = m_cpuStreamExecutor ? std::max(0, m_cpuStreamExecutor->get_numa_node_id()) : 0;
```
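Passing an explicit name ("main") to the memory control unit lets the statistics output tell the main graph's unit apart from units created for sub-graphs. The registry below is purely illustrative of why a name parameter helps with labelling; it does not reproduce `NetworkMemoryControl`'s real interface.

```cpp
// Illustrative only: a named unit gives each statistics row a label
// ("main" vs. sub-graph units). Not NetworkMemoryControl's real API.
#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct ControlUnitSketch {
    std::string name;      // label used in the statistics output
    size_t totalSize = 0;  // bytes managed by this unit
};

class MemoryControlRegistrySketch {
public:
    std::shared_ptr<ControlUnitSketch> createMemoryControlUnit(std::string name) {
        auto unit = std::make_shared<ControlUnitSketch>();
        unit->name = std::move(name);
        m_units.push_back(unit);
        return unit;
    }

    const std::vector<std::shared_ptr<ControlUnitSketch>>& units() const {
        return m_units;  // enumerated when the statistics are dumped
    }

private:
    std::vector<std::shared_ptr<ControlUnitSketch>> m_units;
};
```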
