diff --git a/src/common/transformations/include/transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp b/src/common/transformations/include/transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp
index 7576abfac15e9e..7d17f3ffc6e4b2 100644
--- a/src/common/transformations/include/transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp
+++ b/src/common/transformations/include/transformations/fp16_compression/mark_decompression_convert_constant_folding.hpp
@@ -14,6 +14,7 @@ namespace pass {
 class TRANSFORMATIONS_API EnableDecompressionConvertConstantFolding;
 class TRANSFORMATIONS_API DisableDecompressionConvertConstantFolding;
 class TRANSFORMATIONS_API KeepConstAndDecompression;
+class TRANSFORMATIONS_API KeepConstFP32Unfolded;
 class TRANSFORMATIONS_API KeepConstantsPrecisionAndAddConverts;
 
 }  // namespace pass
@@ -49,6 +50,17 @@ class ov::pass::KeepConstAndDecompression : public MatcherPass {
     KeepConstAndDecompression();
 };
 
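+/**
+ * @ingroup ie_transformation_common_api
+ * @brief Disables ConstantFolding and fp16 compression for f32 Constants on
+ * MatMul weight inputs, so the weights stay unfolded in f32
+ */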
+class ov::pass::KeepConstFP32Unfolded : public MatcherPass {
+public:
+    OPENVINO_RTTI("KeepConstFP32Unfolded", "0");
+    KeepConstFP32Unfolded();
+};
+
 /**
  * @ingroup ie_transformation_common_api
  * @brief Prevents Consts precision conversion and adds Convert with disabled ConstantFolding
diff --git a/src/common/transformations/include/transformations/rt_info/decompression.hpp b/src/common/transformations/include/transformations/rt_info/decompression.hpp
index cda46272ca1da4..092186c5de7ef1 100644
--- a/src/common/transformations/include/transformations/rt_info/decompression.hpp
+++ b/src/common/transformations/include/transformations/rt_info/decompression.hpp
@@ -23,6 +23,12 @@ TRANSFORMATIONS_API void unmark_as_decompression(const std::shared_ptr<Node>& node);
 
 TRANSFORMATIONS_API bool is_decompression(const std::shared_ptr<Node>& node);
 
+TRANSFORMATIONS_API void mark_as_compression(const std::shared_ptr<Node>& node);
+
+TRANSFORMATIONS_API void unmark_as_compression(const std::shared_ptr<Node>& node);
+
+TRANSFORMATIONS_API bool is_compression(const std::shared_ptr<Node>& node);
+
 /**
  * @ingroup ie_runtime_attr_api
  * @brief Decompression class represents runtime info attribute that marks operation
@@ -43,4 +49,24 @@ class TRANSFORMATIONS_API Decompression : public RuntimeAttribute {
     }
 };
 
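+/**
+ * @ingroup ie_runtime_attr_api
+ * @brief Compression class represents runtime info attribute that marks operation
+ * as a compressing (e.g. f32 -> f16) Convert
+ */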
+class TRANSFORMATIONS_API Compression : public RuntimeAttribute {
+public:
+    OPENVINO_RTTI("Compression", "0");
+
+    Compression() = default;
+
+    bool visit_attributes(AttributeVisitor& visitor) override {
+        return true;
+    }
+
+    bool is_copyable() const override {
+        return false;
+    }
+};
+
 }  // namespace ov
diff --git a/src/common/transformations/src/transformations/fp16_compression/align_mixed_fp32_fp16_types.cpp b/src/common/transformations/src/transformations/fp16_compression/align_mixed_fp32_fp16_types.cpp
index 990f85fc6eea80..d52b33fa7dad62 100644
--- a/src/common/transformations/src/transformations/fp16_compression/align_mixed_fp32_fp16_types.cpp
+++ b/src/common/transformations/src/transformations/fp16_compression/align_mixed_fp32_fp16_types.cpp
@@ -10,6 +10,7 @@
 #include "openvino/op/result.hpp"
 #include "openvino/op/util/precision_sensitive_attribute.hpp"
 #include "openvino/pass/constant_folding.hpp"
+#include "transformations/rt_info/decompression.hpp"
 #include "transformations/rt_info/disable_fp16_compression.hpp"
 
 using namespace ov;
@@ -48,6 +49,9 @@ bool ov::pass::AlignMixedFP32FP16Types::run_on_model(const std::shared_ptr<ov::Model>& model) {
             auto init_name = node->get_friendly_name() + "_compressed_to_f16";
             convert->set_friendly_name(generate_uniq_name(init_name));
             out_inputs.replace_source_output(convert);
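+            // Mark the inserted f32 -> f16 Convert as a compression op so that
+            // downstream passes (e.g. ConvertMatMulToFC) can recognize and bypass it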
+            mark_as_compression(convert);
             pass::disable_constant_folding(convert);
             is_changed = true;
         }
diff --git a/src/common/transformations/src/transformations/fp16_compression/mark_decompression_convert_constant_folding.cpp b/src/common/transformations/src/transformations/fp16_compression/mark_decompression_convert_constant_folding.cpp
index 26505dee5278b8..c63b4dd2d88e17 100644
--- a/src/common/transformations/src/transformations/fp16_compression/mark_decompression_convert_constant_folding.cpp
+++ b/src/common/transformations/src/transformations/fp16_compression/mark_decompression_convert_constant_folding.cpp
@@ -77,6 +77,34 @@ pass::KeepConstAndDecompression::KeepConstAndDecompression() {
     register_matcher(m, callback);
 }
 
+pass::KeepConstFP32Unfolded::KeepConstFP32Unfolded() {
+    MATCHER_SCOPE(KeepConstFP32Unfolded);
+
+    auto node_pattern = pattern::wrap_type<ov::op::v0::MatMul>();
+
+    matcher_pass_callback callback = [=](pattern::Matcher& m) {
+        auto node = m.get_match_root();
+
+        if (transformation_callback(node)) {
+            return false;
+        }
+
+        auto constNode = node->get_input_node_shared_ptr(1);
+        if (!is_type<ov::op::v0::Constant>(constNode) || constNode->get_output_element_type(0) != element::f32)
+            return false;
+
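+        // Keep the f32 weights Constant intact: no folding, original precision,
+        // and no fp16 compression, so FullyConnected can consume f32 weights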
+        disable_constant_folding(constNode);
+        enable_keep_const_precision(constNode);
+        disable_fp16_compression(constNode);
+
+        return false;
+    };
+    auto m = std::make_shared<pattern::Matcher>(node_pattern, matcher_name);
+    register_matcher(m, callback);
+}
+
 pass::KeepConstantsPrecisionAndAddConverts::KeepConstantsPrecisionAndAddConverts() {
     MATCHER_SCOPE(KeepConstantsPrecisionAndAddConverts);
     auto const_pattern = pattern::wrap_type<ov::op::v0::Constant>();
diff --git a/src/common/transformations/src/transformations/rt_info/decompression.cpp b/src/common/transformations/src/transformations/rt_info/decompression.cpp
index b87589aee0af1a..273b658530f98d 100644
--- a/src/common/transformations/src/transformations/rt_info/decompression.cpp
+++ b/src/common/transformations/src/transformations/rt_info/decompression.cpp
@@ -18,3 +18,18 @@ bool ov::is_decompression(const std::shared_ptr<Node>& node) {
     const auto& rt_info = node->get_rt_info();
     return rt_info.count(Decompression::get_type_info_static());
 }
+
+void ov::mark_as_compression(const std::shared_ptr<Node>& node) {
+    auto& rt_info = node->get_rt_info();
+    rt_info[Compression::get_type_info_static()] = Compression();
+}
+
+void ov::unmark_as_compression(const std::shared_ptr<Node>& node) {
+    auto& rt_info = node->get_rt_info();
+    rt_info.erase(Compression::get_type_info_static());
+}
+
+bool ov::is_compression(const std::shared_ptr<Node>& node) {
+    const auto& rt_info = node->get_rt_info();
+    return rt_info.count(Compression::get_type_info_static());
+}
diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index 63c7e1cc8d10df..28188a4dc645cb 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -949,8 +949,8 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
                    && parent->getChildEdges().size() == 1
                    && parent->getChildEdgeAt(0)->getOutputNum() == 1
                    && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected
-                   && one_of(parent->getOriginalInputPrecisionAtPort(0), ov::element::f16)
-                   && one_of(parent->getOriginalOutputPrecisionAtPort(0), ov::element::f32, ov::element::bf16)
+                   && one_of(parent->getOriginalInputPrecisionAtPort(0), ov::element::f32, ov::element::bf16, ov::element::f16)
+                   && one_of(parent->getOriginalOutputPrecisionAtPort(0), ov::element::f32, ov::element::bf16, ov::element::f16)
                    && parent->isConstant();
         return res;
     };
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index d11ed228d9922c..934585c7c6559f 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -464,11 +464,36 @@ void FullyConnected::prepareWeightsUsingDummyShape() {
     if (selected_pd == nullptr)
         OPENVINO_THROW("Preferable primitive descriptor is not set for node ", getName(), ".");
 
-    auto inDesc = MemoryDescUtils::convertToDnnlMemoryDesc(MemoryDescUtils::makeDummyDesc(*getBaseMemDescAtInputPort(DATA_ID)));
+    DnnlMemoryDescPtr inDesc = nullptr;
     auto weightDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weightDescIP);
     auto biasDesc = withBiases ? MemoryDescUtils::convertToDnnlMemoryDesc(getBaseMemDescAtInputPort(BIAS_ID)) : nullptr;
     auto outDesc = MemoryDescUtils::convertToDnnlMemoryDesc(MemoryDescUtils::makeDummyDesc(*getBaseMemDescAtOutputPort(0)));
 
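+    // For dynamic shapes, fill undefined input dims from what is already known
+    // (N from the output shape, K from the weights shape) before building the
+    // dummy input descriptor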
+    Shape newInShape = getBaseMemDescAtInputPort(DATA_ID)->getShape();
+    if (isDynamicNode()) {
+        auto originalInDesc = getBaseMemDescAtInputPort(DATA_ID);
+        auto originalInDims = originalInDesc->getShape().getDims();
+        size_t dimIdx = originalInDims.size() == 3 ? 1 : 0;
+        // Propagate N dim from the output shape to the input shape
+        if (newInShape.getDims()[dimIdx] == Shape::UNDEFINED_DIM &&
+            getBaseMemDescAtOutputPort(0)->getShape().getDims()[dimIdx] != Shape::UNDEFINED_DIM) {
+            newInShape = cloneShapeWithNewDim(newInShape, getBaseMemDescAtOutputPort(0)->getShape().getDims()[dimIdx], dimIdx);
+        }
+        // Propagate K dim from the weights shape to the input shape
+        if (newInShape.getDims()[dimIdx + 1] == Shape::UNDEFINED_DIM &&
+            weightDesc->getShape().getDims()[1] != Shape::UNDEFINED_DIM) {
+            newInShape = cloneShapeWithNewDim(newInShape, weightDesc->getShape().getDims()[1], dimIdx + 1);
+        }
+
+        auto newInDesc = DnnlBlockedMemoryDesc(originalInDesc->getPrecision(), MemoryDescUtils::makeDummyShape(newInShape));
+        inDesc = MemoryDescUtils::convertToDnnlMemoryDesc(MemoryDescUtils::makeDummyDesc(newInDesc));
+    } else {
+        inDesc = MemoryDescUtils::convertToDnnlMemoryDesc(MemoryDescUtils::makeDummyDesc(*getBaseMemDescAtInputPort(DATA_ID)));
+    }
+
     const FCKey key = {inDesc,
                        weightDesc,
                        biasDesc,
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp
index 8079461b73a803..96fd378c7de08c 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp
@@ -37,7 +37,7 @@ ov::intel_cpu::ConvertMatMulToFC::ConvertMatMulToFC() {
         auto fc_input_b = pattern_map.at(weights_m);
         bool is_convert = false;
         if (auto convert_node = std::dynamic_pointer_cast<ov::op::v0::Convert>(fc_input_b.get_node_shared_ptr())) {
-            if (is_decompression(convert_node)) {
+            if (is_decompression(convert_node) || fp16_compression_is_disabled(convert_node) || is_compression(convert_node)) {
                 is_convert = true;
                 fc_input_b = convert_node->get_input_node_shared_ptr(0);
             } else {
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index b4761b903411ac..35637d21f235a7 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -300,6 +300,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions) {
     // It cannot be static data, because it may be difference for different inferencePrecision
     const auto precisions = get_convert_precisions();
     if (inferencePrecision == ov::element::f16) {
+        CPU_REGISTER_PASS_ARM(manager, ov::pass::KeepConstFP32Unfolded);
         precisions_map fp_convert_precision_map = {{ov::element::f32, ov::element::f16}};
         type_to_fuse_map empty_fuse_map = {};
         const bool keep_precision_sensitive_in_fp32 = true;
diff --git a/src/plugins/intel_cpu/src/utils/cpu_utils.hpp b/src/plugins/intel_cpu/src/utils/cpu_utils.hpp
index c2f7e867956382..a89a44721bdaad 100644
--- a/src/plugins/intel_cpu/src/utils/cpu_utils.hpp
+++ b/src/plugins/intel_cpu/src/utils/cpu_utils.hpp
@@ -48,6 +48,23 @@ inline std::vector<size_t> getNormalizedDimsBySize(const VectorDims &dims, size_t dimsSize) {
     return normalizedDims;
 }
 
+/**
+* @brief Clones the passed shape and replaces one of its dimensions.
+* @param originalShape
+* shape to clone
+* @param newDimValue
+* new dimension value
+* @param dim
+* dimension index
+* @return cloned shape
+*/
+inline Shape cloneShapeWithNewDim(Shape originalShape, Dim newDimValue, size_t dim) {
+    VectorDims newDims = originalShape.getDims();
+    assert(dim < newDims.size());
+    newDims[dim] = newDimValue;
+    return Shape(originalShape.getMinDims(), newDims);
+}
+
 /**
  * @brief Checked that secondInputDims unidirectional broadcastable per tensor or per channel to firstInputDims
  * @param firstInputDims
diff --git a/src/plugins/intel_cpu/tests/functional/CMakeLists.txt b/src/plugins/intel_cpu/tests/functional/CMakeLists.txt
index 2d837abb25eef3..ee5f1abf28a260 100644
--- a/src/plugins/intel_cpu/tests/functional/CMakeLists.txt
+++ b/src/plugins/intel_cpu/tests/functional/CMakeLists.txt
@@ -37,7 +37,7 @@ else()
     file(GLOB_RECURSE TMP_LIST_OF_TEST_CLASSES ${CMAKE_CURRENT_SOURCE_DIR}/single_layer_tests/classes/*.cpp)
     file(GLOB_RECURSE TMP_LIST_OF_COMMON_TEST_INSTANCES ${CMAKE_CURRENT_SOURCE_DIR}/single_layer_tests/instances/common/*.cpp)
     file(GLOB_RECURSE TMP_LIST_OF_ARM_TEST_INSTANCES ${CMAKE_CURRENT_SOURCE_DIR}/single_layer_tests/instances/arm/*.cpp)
-    file(GLOB_RECURSE TMP_LIST_OF_ARM_SUBGRAPH_TESTS ${CMAKE_CURRENT_SOURCE_DIR}/subgraph_tests/arm/*.cpp)
+    file(GLOB_RECURSE TMP_LIST_OF_ARM_SUBGRAPH_TESTS ${CMAKE_CURRENT_SOURCE_DIR}/subgraph_tests/src/arm/*.cpp)
     list(APPEND TMP_LIST_OF_EXPLICITLY_ENABLED_TESTS
             ${TMP_LIST_OF_TEST_CLASSES} ${TMP_LIST_OF_COMMON_TEST_INSTANCES} ${TMP_LIST_OF_ARM_TEST_INSTANCES} ${TMP_LIST_OF_ARM_SUBGRAPH_TESTS})
     set(TMP_EXPLICITLY_ENABLED_TESTS "${TMP_LIST_OF_EXPLICITLY_ENABLED_TESTS}")
diff --git a/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/arm/matmul_compress_convert.cpp b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/arm/matmul_compress_convert.cpp
new file mode 100644
index 00000000000000..dab6e1f1e85a35
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/subgraph_tests/src/arm/matmul_compress_convert.cpp
@@ -0,0 +1,194 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include <exec_graph_info.hpp>
+#include "shared_test_classes/base/ov_subgraph.hpp"
+#include "common_test_utils/ov_tensor_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
+#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
+
+#include "test_utils/cpu_test_utils.hpp"
+#include "test_utils/convolution_params.hpp"
+
+using namespace CPUTestUtils;
+
+namespace ov {
+namespace test {
+
+using MatMulCompressConvertParams = std::tuple<
+    std::vector<InputShape>,  // input shapes
+    std::pair<bool, bool>,    // transposeA, transposeB
+    element::Type,            // inference precision
+    CPUSpecificParams
+>;
+
+class MatMulCompressConvertCPUTest : public testing::WithParamInterface<MatMulCompressConvertParams>,
+                                     virtual public SubgraphBaseTest,
+                                     public CPUTestsBase {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<MatMulCompressConvertParams> obj) {
+        std::vector<InputShape> inputShapes;
+        std::pair<bool, bool> transpose;
+        element::Type inferPrecision;
+        CPUSpecificParams cpuParams;
+
+        std::tie(inputShapes, transpose, inferPrecision, cpuParams) = obj.param;
+
+        std::ostringstream result;
+        for (const auto& shape : inputShapes) {
+            result << ov::test::utils::partialShape2str({shape.first}) << "_";
+        }
+        result << "TS=";
+        for (const auto& shape : inputShapes) {
+            result << "(";
+            if (!shape.second.empty()) {
+                auto itr = shape.second.begin();
+                do {
+                    result << ov::test::utils::vec2str(*itr);
+                } while (++itr != shape.second.end() && result << "_");
+            }
+            result << ")_";
+        }
+        result << "transpose_a=" << transpose.first << "_";
+        result << "transpose_b=" << transpose.second << "_";
+
+        result << "infer_precision=" << inferPrecision << "_";
+
+        result << CPUTestsBase::getTestCaseName(cpuParams);
+
+        return result.str();
+    }
+
+protected:
+    template <typename T>
+    void transposeShape(T& shape) {
+        IE_ASSERT(shape.size() > 1);
+        std::swap(*(shape.end() - 1), *(shape.end() - 2));
+    }
+
+    void CheckFCWeightsPrecision(element::Type expectedWeiElemType) const {
+        auto getExecValue = [](const ov::Node::RTMap& rtInfo, const std::string& paramName) -> std::string {
+            auto it = rtInfo.find(paramName);
+            IE_ASSERT(rtInfo.end() != it);
+            return it->second.as<std::string>();
+        };
+
+        const auto execFunction = compiledModel.get_runtime_model();
+        ASSERT_NE(nullptr, execFunction);
+        for (const auto& fcNode : execFunction->get_ops()) {
+            if (getExecValue(fcNode->get_rt_info(), ExecGraphInfoSerialization::LAYER_TYPE) == "FullyConnected") {
+                const auto& constNode = fcNode->get_input_node_shared_ptr(1);
+                element::Type expectedType(getExecValue(constNode->get_rt_info(), ExecGraphInfoSerialization::OUTPUT_PRECISIONS));
+                ASSERT_EQ(expectedType, expectedWeiElemType);
+            }
+        }
+    }
+
+    void SetUp() override {
+        targetDevice = ov::test::utils::DEVICE_CPU;
+
+        std::vector<InputShape> inputShapes;
+        std::pair<bool, bool> transpose;
+        element::Type inferPrecision;
+        CPUSpecificParams cpuParams;
+
+        std::tie(inputShapes, transpose, inferPrecision, cpuParams) = this->GetParam();
+        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
+
+        init_input_shapes(inputShapes);
+
+        bool transpA = transpose.first;
+        bool transpB = transpose.second;
+
+        if (transpA) {
+            transposeShape(inputDynamicShapes[0]);
+            for (auto& shapes : targetStaticShapes) {
+                transposeShape(shapes[0]);
+            }
+        }
+        if (transpB) {
+            transposeShape(inputDynamicShapes[1]);
+            for (auto& shapes : targetStaticShapes) {
+                transposeShape(shapes[1]);
+            }
+        }
+
+        if (inferPrecision == element::f16) {
+            convertCount = 2;  // convert f32->f16 on the activation input and convert f16->f32 on the output
+        }
+
+        const auto& inShapeA = inputDynamicShapes[0];
+        const auto& inShapeB = inputDynamicShapes[1];
+
+        configuration.emplace(ov::hint::inference_precision(inferPrecision));
+
+        element::Type netType = element::f32;
+        inType = outType = netType;
+
+        std::string cpuNodeType = "FullyConnected";
+        selectedType = makeSelectedTypeStr(selectedType, outType);
+
+        ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(inType, inShapeA)};
+        auto tensor = ov::test::utils::create_and_fill_tensor(element::f32, inShapeB.get_shape());
+        std::shared_ptr<ov::Node> inputB = std::make_shared<ov::op::v0::Constant>(tensor);
+
+        auto matMul = std::make_shared<ov::op::v0::MatMul>(params[0], inputB, transpA, transpB);
+
+        function = CPUTestsBase::makeNgraphFunction(netType, params, matMul, cpuNodeType);
+    }
+
+    void CheckExecutionGraph() {
+        CheckNumberOfNodesWithType(compiledModel, "FullyConnected", 1);
+        CheckNumberOfNodesWithType(compiledModel, "Convert", convertCount);
+        CheckFCWeightsPrecision(element::f32);
+    }
+
+    size_t convertCount = 0;
+};
+
+TEST_P(MatMulCompressConvertCPUTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED();
+    run();
+    CheckExecutionGraph();
+}
+
+namespace {
+
+const std::vector<std::pair<bool, bool>> transposeParams = {
+    {false, true},
+};
+
+const std::vector<std::vector<InputShape>> inputShapes2D = {
+    static_shapes_to_test_representation({{2, 3}, {3, 4}}),
+    {
+        {{-1, -1}, {{2, 3}, {5, 3}}},
+        {{3, 4}, {{3, 4}, {3, 4}}}
+    },
+};
+
+const std::vector<element::Type> inferPrecisions = {
+    element::f32,
+#if defined(OV_CPU_ARM_ENABLE_FP16)
+    element::f16,
+#endif
+};
+
+const auto testParams2D_ARM_smoke = ::testing::Combine(
+    ::testing::ValuesIn(inputShapes2D),
+    ::testing::ValuesIn(transposeParams),
+    ::testing::ValuesIn(inferPrecisions),
+    ::testing::Values(CPUSpecificParams{}));
+
+INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_ARM, MatMulCompressConvertCPUTest, testParams2D_ARM_smoke,
+                         MatMulCompressConvertCPUTest::getTestCaseName);
+
+}  // namespace
+
+}  // namespace test
+}  // namespace ov
diff --git a/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake b/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake
index ef0427aa9c2168..7281daf5dd3898 100644
--- a/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake
+++ b/src/plugins/intel_cpu/thirdparty/ACLConfig.cmake
@@ -166,10 +166,10 @@ elseif(NOT TARGET arm_compute::arm_compute)
         list(APPEND ARM_COMPUTE_OPTIONS --jobs=${ARM_COMPUTE_SCONS_JOBS})
     endif()
 
-    set(ARM_COMPUTE_DEBUG_OPTIONS
-        debug=1
-        asserts=1
-        logging=1)
+    # set(ARM_COMPUTE_DEBUG_OPTIONS
+    #     debug=1
+    #     asserts=1
+    #     logging=1)
 
     # cmake older 3.20 does not support generator expressions in add_custom_command
     # https://cmake.org/cmake/help/latest/command/add_custom_command.html#examples-generating-files
diff --git a/src/plugins/intel_cpu/thirdparty/onednn b/src/plugins/intel_cpu/thirdparty/onednn
index cc986b3180d3e2..ee65efcd49a622 160000
--- a/src/plugins/intel_cpu/thirdparty/onednn
+++ b/src/plugins/intel_cpu/thirdparty/onednn
@@ -1 +1 @@
-Subproject commit cc986b3180d3e241f68c16249b7a6fac0dc54859
+Subproject commit ee65efcd49a622a8097ab990be7c141a7c4cd673