From b75cfb7e9bc5c45766cf17b7404aa775670ef81e Mon Sep 17 00:00:00 2001
From: Anton Voronov <anton.voronov@intel.com>
Date: Mon, 13 Feb 2023 12:23:07 +0000
Subject: [PATCH 1/2] [CPU] Sparse weights decompression: fixed property name
 and primitive caching

---
 .../python/tests/test_runtime/test_properties.py |  2 +-
 .../openvino/runtime/intel_cpu/properties.hpp    | 16 +++++++++++++++-
 .../intel_cpu/src/nodes/fullyconnected.cpp       |  5 ++---
 src/plugins/intel_cpu/src/nodes/fullyconnected.h |  2 --
 .../ov_executable_network/get_metric.cpp         |  8 ++++++++
 src/plugins/intel_cpu/thirdparty/onednn          |  2 +-
 6 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py
index 53b82ffa25a140..fb615c77d2b745 100644
--- a/src/bindings/python/tests/test_runtime/test_properties.py
+++ b/src/bindings/python/tests/test_runtime/test_properties.py
@@ -195,7 +195,7 @@ def test_properties_ro(ov_property_ro, expected_value):
         ),
         (
             properties.intel_cpu.sparse_weights_decompression_rate,
-            "SPARSE_WEIGHTS_DECOMPRESSION_RATE",
+            "CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE",
             (
                 (0.1, np.float32(0.1)),
                 (2.0, 2.0),
diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index 119f83fe7a3670..f184027f086c45 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -47,7 +47,21 @@ namespace intel_cpu {
  */
 static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATION"};
 
-static constexpr Property<float> sparse_weights_decompression_rate{"SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
+/**
+ * @brief This property defines the threshold for sparse weights decompression feature activation
+ * @ingroup ov_runtime_cpu_prop_cpp_api
+ *
+ * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in
+ * the CPU plugin at the model compilation stage and storing non-zero values in a special packed format. Then, during
+ * the execution of the model, the weights are unpacked and used in the computational kernel. Since the weights are
+ * loaded from DDR/L3 cache in the packed format, this significantly decreases memory consumption and, as a
+ * consequence, improves inference performance. The following code shows how to set the sparse rate value.
+ *
+ * @code
+ * core.set_property(ov::intel_cpu::sparse_weights_decompression_rate(0.8));
+ * @endcode
+ */
+static constexpr Property<float> sparse_weights_decompression_rate{"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
 
 } // namespace intel_cpu
 } // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index dbcb4d8eb59137..a78de155db85ff 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -591,7 +591,7 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDes
     dnnl::memory::desc wgh_candidate;
     if (useSparseWeights) {
         wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
-                        wdt, memory::desc::packed(nnzCount) };
+                        wdt, memory::desc::packed() };
     } else {
         wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
                         wdt, dnnl::memory::format_tag::any };
@@ -930,10 +930,9 @@ bool FullyConnected::useSparseWeightsDecompression() {
                 zerosCounts++;
             }
         }
-        nnzCount = elementsCount - zerosCounts;
 
         DEBUG_LOG(getName(), ", weightsData.size() = ", elementsCount, ", zerosCounts = ",
-            zerosCounts, ", nnzCount = ", nnzCount);
+            zerosCounts, ", nnzCount = ", elementsCount - zerosCounts);
 
         weiSparseRate = static_cast<float>(zerosCounts) / static_cast<float>(elementsCount);
 
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index adedc27ac3da00..7f01e0e6dbc2ad 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -42,7 +42,6 @@ class FullyConnected : public Node {
     void initSupportedPrimitiveDescriptors() override;
     void initOptimalPrimitiveDescriptor() override;
 
-    // void createPrimitive() override;
     std::shared_ptr<MemoryDesc> getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
     std::shared_ptr<MemoryDesc> getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
 
@@ -112,7 +111,6 @@ class FullyConnected : public Node {
 
     // sparse weights
     bool useSparseWeights = false;
-    int nnzCount = -1;
     float minSparseRate = 1.f;
     float weiSparseRate = 0.f;
     bool useSparseWeightsDecompression();
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
index 64f00ad041ade0..255dc118f7ea73 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
@@ -9,6 +9,7 @@
 #include "openvino/runtime/core.hpp"
 #include "openvino/runtime/compiled_model.hpp"
 #include "openvino/runtime/properties.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
 
 #include <gtest/gtest.h>
 
@@ -113,6 +114,13 @@ TEST_F(OVClassConfigTestCPU, smoke_CheckModelStreamsHasHigherPriorityThanThrough
     ASSERT_EQ(streams, value);
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CheckSparseWeightsDecompressionRate) {
+    ov::Core core;
+
+    core.set_property(deviceName, ov::intel_cpu::sparse_weights_decompression_rate(0.8));
+    ASSERT_NO_THROW(ov::CompiledModel compiledModel = core.compile_model(model, deviceName));
+}
+
 const std::vector<ov::AnyMap> multiDevicePriorityConfigs = {
     {ov::device::priorities(CommonTestUtils::DEVICE_CPU)}};

diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn
index 44de3c3698b687..1746a9766bcef2 160000
--- a/src/plugins/intel_cpu/thirdparty/onednn
+++ b/src/plugins/intel_cpu/thirdparty/onednn
@@ -1 +1 @@
-Subproject commit 44de3c3698b687c26e487fc8f0213fa487e8fe2c
+Subproject commit 1746a9766bcef2b6af51d85ac01a980f1bff156f

From a396e1e2dc065fbb52afc09f0c180a1e1365f8ee Mon Sep 17 00:00:00 2001
From: Anton Voronov <anton.voronov@intel.com>
Date: Mon, 13 Feb 2023 12:33:02 +0000
Subject: [PATCH 2/2] removed property description

---
 .../openvino/runtime/intel_cpu/properties.hpp | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index f184027f086c45..c561b598d965c6 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -47,20 +47,6 @@ namespace intel_cpu {
  */
 static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATION"};
 
-/**
- * @brief This property defines the threshold for sparse weights decompression feature activation
- * @ingroup ov_runtime_cpu_prop_cpp_api
- *
- * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in
- * the CPU plugin at the model compilation stage and storing non-zero values in a special packed format. Then, during
- * the execution of the model, the weights are unpacked and used in the computational kernel. Since the weights are
- * loaded from DDR/L3 cache in the packed format, this significantly decreases memory consumption and, as a
- * consequence, improves inference performance. The following code shows how to set the sparse rate value.
- *
- * @code
- * core.set_property(ov::intel_cpu::sparse_weights_decompression_rate(0.8));
- * @endcode
- */
 static constexpr Property<float> sparse_weights_decompression_rate{"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
 
 } // namespace intel_cpu
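
Usage sketch (supplementary, not part of the patches): a minimal standalone
program showing how an application could drive the renamed property. The
property name and header come from the diffs above; the "model.xml" path and
the 0.8 threshold are illustrative assumptions. Per the logic in
FullyConnected::useSparseWeightsDecompression(), the rate is the fraction of
zero-valued weight elements a layer must reach before the packed weights
format is used.

// sparse_rate_example.cpp -- a sketch, assuming an IR model at "model.xml"
#include <openvino/runtime/core.hpp>
#include <openvino/runtime/intel_cpu/properties.hpp>

int main() {
    ov::Core core;
    // Enable sparse weights decompression for FullyConnected layers whose
    // weights are at least 80% zeros (key: CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE).
    core.set_property("CPU", ov::intel_cpu::sparse_weights_decompression_rate(0.8f));
    // Weights are analyzed and packed during compilation; "model.xml" is hypothetical.
    auto compiled = core.compile_model("model.xml", "CPU");
    return 0;
}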