From b75cfb7e9bc5c45766cf17b7404aa775670ef81e Mon Sep 17 00:00:00 2001
From: Anton Voronov <anton.voronov@intel.com>
Date: Mon, 13 Feb 2023 12:23:07 +0000
Subject: [PATCH 1/2] [CPU] Sparse weights decompression: fixed property name
 and primitive caching

---
 .../python/tests/test_runtime/test_properties.py |  2 +-
 .../openvino/runtime/intel_cpu/properties.hpp    | 16 +++++++++++++++-
 .../intel_cpu/src/nodes/fullyconnected.cpp       |  5 ++---
 src/plugins/intel_cpu/src/nodes/fullyconnected.h |  2 --
 .../ov_executable_network/get_metric.cpp         |  8 ++++++++
 src/plugins/intel_cpu/thirdparty/onednn          |  2 +-
 6 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/bindings/python/tests/test_runtime/test_properties.py b/src/bindings/python/tests/test_runtime/test_properties.py
index 53b82ffa25a140..fb615c77d2b745 100644
--- a/src/bindings/python/tests/test_runtime/test_properties.py
+++ b/src/bindings/python/tests/test_runtime/test_properties.py
@@ -195,7 +195,7 @@ def test_properties_ro(ov_property_ro, expected_value):
         ),
         (
             properties.intel_cpu.sparse_weights_decompression_rate,
-            "SPARSE_WEIGHTS_DECOMPRESSION_RATE",
+            "CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE",
             (
                 (0.1, np.float32(0.1)),
                 (2.0, 2.0),
diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index 119f83fe7a3670..f184027f086c45 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -47,7 +47,21 @@ namespace intel_cpu {
  */
 static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATION"};
 
-static constexpr Property<float> sparse_weights_decompression_rate{"SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
+/**
+ * @brief This property defines the threshold for sparse weights decompression feature activation
+ * @ingroup ov_runtime_cpu_prop_cpp_api
+ *
+ * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in
+ * the CPU plugin at the model compilation stage and storing non-zero values in a special packed format. Then, during
+ * the execution of the model, the weights are unpacked and used in the computational kernel. Since the weights are
+ * loaded from DDR/L3 cache in the packed format, this significantly decreases memory consumption and, as a
+ * consequence, improves inference performance. The following code shows how to set the sparse rate value.
+ *
+ * @code
+ * core.set_property(ov::intel_cpu::sparse_weights_decompression_rate(0.8));
+ * @endcode
+ */
+static constexpr Property<float> sparse_weights_decompression_rate{"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
 
 } // namespace intel_cpu
 } // namespace ov
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index dbcb4d8eb59137..a78de155db85ff 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -591,7 +591,7 @@ void FullyConnected::createDescriptorInternal(const dnnl::memory::desc &inputDes
     dnnl::memory::desc wgh_candidate;
     if (useSparseWeights) {
         wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
-                        wdt, memory::desc::packed(nnzCount) };
+                        wdt, memory::desc::packed() };
     } else {
         wgh_candidate = { DnnlExtensionUtils::convertToDnnlDims(getInputShapeAtPort(WEIGHTS_ID).getStaticDims()),
                         wdt, dnnl::memory::format_tag::any };
@@ -930,10 +930,9 @@ bool FullyConnected::useSparseWeightsDecompression() {
                 zerosCounts++;
             }
         }
-        nnzCount = elementsCount - zerosCounts;
 
         DEBUG_LOG(getName(), ", weightsData.size() = ", elementsCount, ", zerosCounts = ",
-            zerosCounts, ", nnzCount = ", nnzCount);
+            zerosCounts, ", nnzCount = ", elementsCount - zerosCounts);
 
         weiSparseRate = static_cast<float>(zerosCounts) / static_cast<float>(elementsCount);
 
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index adedc27ac3da00..7f01e0e6dbc2ad 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -42,7 +42,6 @@ class FullyConnected : public Node {
     void initSupportedPrimitiveDescriptors() override;
     void initOptimalPrimitiveDescriptor() override;
 
-    // void createPrimitive() override;
     std::shared_ptr<MemoryDesc> getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
     std::shared_ptr<MemoryDesc> getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
 
@@ -112,7 +111,6 @@ class FullyConnected : public Node {
 
     // sparse weights
     bool useSparseWeights = false;
-    int nnzCount = -1;
     float minSparseRate = 1.f;
     float weiSparseRate = 0.f;
     bool useSparseWeightsDecompression();
diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
index 64f00ad041ade0..255dc118f7ea73 100644
--- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/behavior/ov_executable_network/get_metric.cpp
@@ -9,6 +9,7 @@
 #include "openvino/runtime/core.hpp"
 #include "openvino/runtime/compiled_model.hpp"
 #include "openvino/runtime/properties.hpp"
+#include "openvino/runtime/intel_cpu/properties.hpp"
 
 #include <gtest/gtest.h>
 
@@ -113,6 +114,13 @@ TEST_F(OVClassConfigTestCPU, smoke_CheckModelStreamsHasHigherPriorityThanThrough
     ASSERT_EQ(streams, value);
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CheckSparseWeightsDecompressionRate) {
+    ov::Core core;
+
+    core.set_property(deviceName, ov::intel_cpu::sparse_weights_decompression_rate(0.8));
+    ASSERT_NO_THROW(ov::CompiledModel compiledModel = core.compile_model(model, deviceName));
+}
+
 const std::vector<ov::AnyMap> multiDevicePriorityConfigs = {
     {ov::device::priorities(CommonTestUtils::DEVICE_CPU)}};

diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn
index 44de3c3698b687..1746a9766bcef2 160000
--- a/src/plugins/intel_cpu/thirdparty/onednn
+++ b/src/plugins/intel_cpu/thirdparty/onednn
@@ -1 +1 @@
-Subproject commit 44de3c3698b687c26e487fc8f0213fa487e8fe2c
+Subproject commit 1746a9766bcef2b6af51d85ac01a980f1bff156f

From a396e1e2dc065fbb52afc09f0c180a1e1365f8ee Mon Sep 17 00:00:00 2001
From: Anton Voronov <anton.voronov@intel.com>
Date: Mon, 13 Feb 2023 12:33:02 +0000
Subject: [PATCH 2/2] removed property description

---
 .../openvino/runtime/intel_cpu/properties.hpp | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
index f184027f086c45..c561b598d965c6 100644
--- a/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
+++ b/src/inference/include/openvino/runtime/intel_cpu/properties.hpp
@@ -47,20 +47,6 @@ namespace intel_cpu {
  */
 static constexpr Property<bool> denormals_optimization{"CPU_DENORMALS_OPTIMIZATION"};
 
-/**
- * @brief This property defines the threshold for sparse weights decompression feature activation
- * @ingroup ov_runtime_cpu_prop_cpp_api
- *
- * The sparse weights decompression feature allows packing weights for Matrix Multiplication operations directly in
- * the CPU plugin at the model compilation stage and storing non-zero values in a special packed format. Then, during
- * the execution of the model, the weights are unpacked and used in the computational kernel. Since the weights are
- * loaded from DDR/L3 cache in the packed format, this significantly decreases memory consumption and, as a
- * consequence, improves inference performance. The following code shows how to set the sparse rate value.
- *
- * @code
- * core.set_property(ov::intel_cpu::sparse_weights_decompression_rate(0.8));
- * @endcode
- */
 static constexpr Property<float> sparse_weights_decompression_rate{"CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE"};
 
 } // namespace intel_cpu
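
Usage sketch (supplementary, not part of the patches): a minimal standalone
program showing how an application could drive the renamed property. The
property name and header come from the diffs above; the "model.xml" path and
the 0.8 threshold are illustrative assumptions. Per the logic in
FullyConnected::useSparseWeightsDecompression(), the rate is the fraction of
zero-valued weight elements a layer must reach before the packed weights
format is used.

// sparse_rate_example.cpp -- a sketch, assuming an IR model at "model.xml"
#include <openvino/runtime/core.hpp>
#include <openvino/runtime/intel_cpu/properties.hpp>

int main() {
    ov::Core core;
    // Enable sparse weights decompression for FullyConnected layers whose
    // weights are at least 80% zeros (key: CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE).
    core.set_property("CPU", ov::intel_cpu::sparse_weights_decompression_rate(0.8f));
    // Weights are analyzed and packed during compilation; "model.xml" is hypothetical.
    auto compiled = core.compile_model("model.xml", "CPU");
    return 0;
}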