
Commit 5ce3c96

[CPU] Add dumping of the memory statistics (#28441)
### Details:

Add yet another debug capability: dumping the following memory statistics:

1. Memory statistics for specific memory managers: type of the manager, number of memory regions, number of unique memory blocks, total memory size, theoretically optimal total memory size, and the size of the largest memory region
2. The size of memory allocated for scratchpads
3. Weight cache statistics per socket: total size and the number of memory objects

Standard output and `*.csv` file dump are supported.

### ToDo:

- [x] Add corresponding documentation

### Tickets:

- *ticket-id*
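The statistics listed above group naturally into one record per memory manager plus a couple of model-wide totals. The sketch below only illustrates that grouping; the type and field names are hypothetical and do not mirror the plugin's internal data structures.

```cpp
// Illustrative grouping of the dumped statistics; names are hypothetical.
#include <cstddef>
#include <string>
#include <vector>

struct MemoryManagerStats {
    std::string managerType;      // type of the memory manager
    size_t regionCount = 0;       // number of memory regions
    size_t uniqueBlockCount = 0;  // number of unique memory blocks
    size_t totalSize = 0;         // total memory size, bytes
    size_t optimalTotalSize = 0;  // theoretically optimal total size, bytes
    size_t maxRegionSize = 0;     // size of the largest memory region, bytes
};

struct CompiledModelMemoryStats {
    std::vector<MemoryManagerStats> managers;  // one entry per memory manager
    size_t scratchpadsSize = 0;                // memory allocated for scratchpads
    size_t weightCacheSize = 0;                // weight cache total size (per socket)
    size_t weightCacheObjects = 0;             // number of cached weight objects
};
```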
1 parent f74f914 commit 5ce3c96

18 files changed: +532 -48 lines

src/plugins/intel_cpu/docs/debug_capabilities/README.md  (+5)

```diff
@@ -26,3 +26,8 @@ Use the following cmake option to enable debug capabilities:
   Internal performance counter will be enabled automatically.
 * [Average counters](average_counters.md)
   `OV_CPU_AVERAGE_COUNTERS=filename`
+* Memory statistics
+  `OV_CPU_MEMORY_STATISTICS_PATH=cout`
+  Set this environment variable to dump memory usage statistics to the standard output when the compiled model is destructed.
+  `OV_CPU_MEMORY_STATISTICS_PATH=<file_path>.csv`
+  Set this environment variable to dump memory usage statistics to *.csv files. The `file_path` will be enhanced with the name of each compiled model: `file_path_<model_name>.csv`.
```
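For illustration, a minimal sketch of how an option like this is typically resolved at dump time is shown below. The helper name, the parsing, and the exact CSV naming are assumptions made for the example, not the plugin's actual code.

```cpp
// Sketch only: resolves OV_CPU_MEMORY_STATISTICS_PATH the way the README
// describes ("cout" -> standard output, otherwise a per-model *.csv file).
// The helper name and the CSV naming details are illustrative assumptions.
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>

void dump_stats_example(const std::string& model_name, const std::string& csv_body) {
    const char* path = std::getenv("OV_CPU_MEMORY_STATISTICS_PATH");
    if (path == nullptr) {
        return;  // the capability is disabled
    }
    if (std::string(path) == "cout") {
        std::cout << csv_body;  // dump to standard output
        return;
    }
    // "<file_path>.csv" becomes "<file_path>_<model_name>.csv" per compiled model
    std::string file_name(path);
    const auto ext = file_name.rfind(".csv");
    if (ext != std::string::npos) {
        file_name.insert(ext, "_" + model_name);
    }
    std::ofstream(file_name) << csv_body;
}
```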

src/plugins/intel_cpu/src/compiled_model.cpp  (+10 -2)

```diff
@@ -24,8 +24,8 @@
 #include "openvino/runtime/threading/cpu_streams_info.hpp"
 #include "openvino/runtime/threading/executor_manager.hpp"
 #include "openvino/util/common_util.hpp"
-#include "transformations/transformation_pipeline.h"
-#include "transformations/utils/utils.hpp"
+#include "utils/debug_capabilities.h"
+#include "utils/memory_stats_dump.hpp"
 #include "utils/serialize.hpp"

 #if defined(OV_CPU_WITH_ACL)
@@ -44,6 +44,14 @@ struct ImmediateSerialExecutor : public ov::threading::ITaskExecutor {
     std::mutex _mutex;
 };

+CompiledModel::~CompiledModel() {
+    if (m_has_sub_compiled_models) {
+        m_sub_compiled_models.clear();
+        m_sub_memory_manager->_memorys_table.clear();
+    }
+    CPU_DEBUG_CAP_ENABLE(dumpMemoryStats(m_cfg.debugCaps, m_name, m_graphs, m_socketWeights));
+}
+
 CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
                              const std::shared_ptr<const ov::IPlugin>& plugin,
                              Config cfg,
```
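The dump call in the destructor is wrapped in `CPU_DEBUG_CAP_ENABLE`, so it is compiled in only for builds with CPU debug capabilities enabled. Below is a sketch of how such a guard macro commonly works; the expansion is an assumption based on the usual pattern, not a quote of `utils/debug_capabilities.h`.

```cpp
// Assumed compile-time guard pattern, for illustration only:
// the wrapped statement is kept when CPU_DEBUG_CAPS is defined and
// disappears entirely from release builds.
#ifdef CPU_DEBUG_CAPS
#    define CPU_DEBUG_CAP_ENABLE(_x) _x
#else
#    define CPU_DEBUG_CAP_ENABLE(_x)
#endif
```

With the guard compiled out, the destructor reduces to the sub-model cleanup and adds no overhead to release builds.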

src/plugins/intel_cpu/src/compiled_model.h  (+3 -10)

```diff
@@ -21,6 +21,8 @@ namespace intel_cpu {

 class CompiledModel : public ov::ICompiledModel {
 public:
+    typedef std::shared_ptr<CompiledModel> Ptr;
+
     struct GraphGuard : public Graph {
         std::mutex _mutex;
         struct Lock : public std::unique_lock<std::mutex> {
@@ -30,22 +32,13 @@ class CompiledModel : public ov::ICompiledModel {
     };

 public:
-    typedef std::shared_ptr<CompiledModel> Ptr;
-
     CompiledModel(const std::shared_ptr<ov::Model>& model,
                   const std::shared_ptr<const ov::IPlugin>& plugin,
                   Config cfg,
                   const bool loaded_from_cache,
                   std::shared_ptr<SubMemoryManager> sub_memory_manager = nullptr);

-    ~CompiledModel() {
-        if (m_has_sub_compiled_models) {
-            m_sub_compiled_models.clear();
-            m_sub_memory_manager->_memorys_table.clear();
-        }
-        auto streamsExecutor = std::dynamic_pointer_cast<ov::threading::IStreamsExecutor>(m_task_executor);
-        streamsExecutor->cpu_reset();
-    }
+    ~CompiledModel();

     std::shared_ptr<ov::IAsyncInferRequest> create_infer_request() const override;
```

src/plugins/intel_cpu/src/cpu_memory.cpp  (+4)

```diff
@@ -268,6 +268,10 @@ void MemoryBlockWithReuse::free() {
     m_useExternalStorage = false;
 }

+size_t MemoryBlockWithReuse::size() const {
+    return m_memUpperBound;
+}
+
 void MemoryBlockWithReuse::release(void* ptr) {}

 void MemoryBlockWithReuse::destroy(void* ptr) {
```

src/plugins/intel_cpu/src/cpu_memory.h  (+1)

```diff
@@ -80,6 +80,7 @@ class MemoryBlockWithReuse : public IMemoryBlock {
     bool resize(size_t size) override;
     bool hasExtBuffer() const noexcept override;
     void free();
+    size_t size() const;  // in bytes

 private:
     bool m_useExternalStorage = false;
```
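`MemoryBlockWithReuse::size()` reports `m_memUpperBound`, i.e. the block's high-water mark rather than the size of the currently requested tensor. A minimal sketch of that bookkeeping is given below; it assumes the upper bound is bumped on every grow request, which is the usual pattern for reusable blocks, and it is not the plugin's actual implementation.

```cpp
// Hypothetical high-water-mark bookkeeping for a reusable memory block.
#include <cstddef>

class ReusableBlockSketch {
public:
    bool resize(size_t size) {
        if (size > m_memUpperBound) {
            m_memUpperBound = size;  // remember the largest size ever requested
            // the underlying buffer would be (re)allocated here
            return true;             // the data pointer has changed
        }
        return false;                // the existing buffer is reused
    }

    size_t size() const {
        return m_memUpperBound;  // in bytes, independent of the current request
    }

private:
    size_t m_memUpperBound = 0;
};
```

This is what makes the statistic useful: it reflects the peak memory the block ever had to provide, not whatever happened to be requested last.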

src/plugins/intel_cpu/src/dnnl_scratch_pad.h  (+10 -1)

```diff
@@ -15,16 +15,25 @@ namespace intel_cpu {

 class DnnlScratchPad {
     MemoryBlockPtr blockPtr;
+    MemoryBlockWithReuse* baseBlockPtr = nullptr;
     dnnl::engine eng;

 public:
     DnnlScratchPad(dnnl::engine eng, int numa_node = -1) : eng(std::move(eng)) {
-        blockPtr = std::make_shared<DnnlMemoryBlock>(make_unique<MemoryBlockWithReuse>(numa_node));
+        auto baseMemoryBlock = make_unique<MemoryBlockWithReuse>(numa_node);
+        baseBlockPtr = baseMemoryBlock.get();
+        blockPtr = std::make_shared<DnnlMemoryBlock>(std::move(baseMemoryBlock));
     }

     MemoryPtr createScratchPadMem(const MemoryDescPtr& md) {
         return std::make_shared<Memory>(eng, md, blockPtr);
     }
+
+    size_t size() const {
+        if (baseBlockPtr)
+            return baseBlockPtr->size();
+        return 0;
+    }
 };

 using DnnlScratchPadPtr = std::shared_ptr<DnnlScratchPad>;
```
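The scratchpad now stores a non-owning `baseBlockPtr` to the concrete `MemoryBlockWithReuse` alongside the type-erased `blockPtr`; this lets `size()` report the underlying upper bound without widening the generic memory-block interface, and the raw pointer stays valid because `blockPtr` owns the block for the scratchpad's whole lifetime. The helper below is a hypothetical illustration of how the new accessor could feed the statistics dump; it is not part of the commit.

```cpp
// Hypothetical aggregation over scratchpads using the new size() accessor.
// DnnlScratchPadPtr comes from dnnl_scratch_pad.h; the helper is illustrative.
#include <cstddef>
#include <vector>

size_t total_scratchpad_bytes(const std::vector<DnnlScratchPadPtr>& scratchPads) {
    size_t total = 0;
    for (const auto& pad : scratchPads) {
        if (pad) {
            total += pad->size();  // upper bound of the scratchpad's base block
        }
    }
    return total;
}
```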

src/plugins/intel_cpu/src/graph.cpp  (+7 -7)

```diff
@@ -253,7 +253,7 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model>& model,
     }

     // update output precisions of producers to avoid extra reorders
-    // do this only in case output configration is not provided explicitly
+    // do this only in case output configuration is not provided explicitly
     if (outputConfigs.empty()) {
         for (auto& output : outputNodesMap) {
             const auto& outputNode = output.second;
@@ -1669,7 +1669,7 @@ void Graph::Infer(SyncInferRequest* request) {
 void Graph::SortTopologically() {
     OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::SortTopologically");

-    // Set execIndex of all nodes to default invaild value
+    // Set execIndex of all nodes to default invalid value
     for (auto& node : graphNodes) {
         node->execIndex = -1;
     }
@@ -2004,8 +2004,8 @@ void Graph::EnforceInferencePrecision() {
         if (one_of(parent->getType(),
                    Type::Convolution,     // conv nets
                    Type::FullyConnected,  // conv / bert nets
-                   Type::RNNCell,         // recurent nets
-                   Type::RNNSeq,          // recurent nets
+                   Type::RNNCell,         // recurrent nets
+                   Type::RNNSeq,          // recurrent nets
                    Type::MatMul,          // bert nets
                    Type::ROIPooling,      // object detection nets
                    Type::Interpolate,     // super resolution nets
@@ -2038,7 +2038,7 @@ void Graph::EnforceInferencePrecision() {

     /* Skip low-precision float point enforcement for tail of the graph by forming set of nodes to skip.
      * Necessary to maintain accuracy.
-     * Experiments show zero peformance impact on average */
+     * Experiments show zero performance impact on average */
     std::unordered_set<NodePtr> nodesToSkip;
     // starting from output nodes
     for (const auto& entry : outputNodesMap) {
@@ -2093,7 +2093,7 @@ void Graph::EnforceInferencePrecision() {
             return true;
         }

-        // exclude Convert after Range since it may cause precision loss when integter type to LP.
+        // exclude Convert after Range since it may cause precision loss when integer type to LP.
         if (parent->getType() == Type::Range && node->getType() == Type::Convert) {
             return true;
         }
@@ -2124,7 +2124,7 @@ void Graph::EnforceInferencePrecision() {
             continue;
         }

-        // exclude Convert before Range since it may cause precision loss when integter type to LP.
+        // exclude Convert before Range since it may cause precision loss when integer type to LP.
         // TODO: Incorrect subgraph is generated by ONNX FE + ticket 117861.
         const auto& child = node->getChildEdgeAt(i)->getChild();
         if (child->getType() == Type::Range && node->getType() == Type::Convert) {
```

src/plugins/intel_cpu/src/graph_context.cpp  (+1 -1)

```diff
@@ -25,7 +25,7 @@ GraphContext::GraphContext(Config config,

       m_memoryStatesRegister(std::make_shared<node::MemoryStatesRegister>()),
       m_auxiliaryNetworkMemoryControl(std::make_shared<NetworkMemoryControl>()),
-      m_memoryControl(m_auxiliaryNetworkMemoryControl->createMemoryControlUnit()) {
+      m_memoryControl(m_auxiliaryNetworkMemoryControl->createMemoryControlUnit("main")) {
     if (m_streamExecutor) {
         m_cpuStreamExecutor = std::dynamic_pointer_cast<ov::threading::CPUStreamsExecutor>(m_streamExecutor);
         m_numaNodeId = m_cpuStreamExecutor ? std::max(0, m_cpuStreamExecutor->get_numa_node_id()) : 0;
```
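Passing an explicit name ("main") to the memory control unit lets the statistics output tell the main graph's unit apart from units created for sub-graphs. The registry below is purely illustrative of why a name parameter helps with labelling; it does not reproduce `NetworkMemoryControl`'s real interface.

```cpp
// Illustrative only: a named unit gives each statistics row a label
// ("main" vs. sub-graph units). Not NetworkMemoryControl's real API.
#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct ControlUnitSketch {
    std::string name;      // label used in the statistics output
    size_t totalSize = 0;  // bytes managed by this unit
};

class MemoryControlRegistrySketch {
public:
    std::shared_ptr<ControlUnitSketch> createMemoryControlUnit(std::string name) {
        auto unit = std::make_shared<ControlUnitSketch>();
        unit->name = std::move(name);
        m_units.push_back(unit);
        return unit;
    }

    const std::vector<std::shared_ptr<ControlUnitSketch>>& units() const {
        return m_units;  // enumerated when the statistics are dumped
    }

private:
    std::vector<std::shared_ptr<ControlUnitSketch>> m_units;
};
```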
