Commit 540a560

Merge remote-tracking branch 'upstream/master' into attribute_pattern_matching
Signed-off-by: Evgeniia Nugmanova <evgeniia.nugmanova@intel.com>
2 parents: 56ac8ed + a8e776b

File tree

23 files changed: +342 -185 lines changed


src/common/transformations/include/transformations/utils/utils.hpp (+2)

@@ -193,6 +193,8 @@ TRANSFORMATIONS_API bool constantIsEqualTo(const std::shared_ptr<ov::op::v0::Con
 
 TRANSFORMATIONS_API bool has_f16_constants(const std::shared_ptr<const ov::Model>& function);
 
+TRANSFORMATIONS_API bool is_large_language_model(const ov::Model& model);
+
 /**
  * \brief Check if 'other_shape' can be broadcasted to 'ref_shape'
  *

src/common/transformations/src/transformations/utils/utils.cpp (+29)

@@ -12,11 +12,15 @@
 #include "openvino/core/validation_util.hpp"
 #include "openvino/op/add.hpp"
 #include "openvino/op/broadcast.hpp"
+#include "openvino/op/concat.hpp"
 #include "openvino/op/constant.hpp"
+#include "openvino/op/convert.hpp"
 #include "openvino/op/divide.hpp"
 #include "openvino/op/gather.hpp"
 #include "openvino/op/multiply.hpp"
+#include "openvino/op/paged_attention.hpp"
 #include "openvino/op/parameter.hpp"
+#include "openvino/op/read_value.hpp"
 #include "openvino/op/relu.hpp"
 #include "openvino/op/reshape.hpp"
 #include "openvino/op/shape_of.hpp"
@@ -25,6 +29,9 @@
 #include "openvino/op/tanh.hpp"
 #include "openvino/op/util/multi_subgraph_base.hpp"
 #include "openvino/op/util/shape_of_base.hpp"
+#include "openvino/pass/pattern/op/optional.hpp"
+#include "openvino/pass/pattern/op/or.hpp"
+#include "openvino/pass/pattern/op/wrap_type.hpp"
 
 namespace ov {
 namespace op {
@@ -133,6 +140,28 @@ bool has_f16_constants(const std::shared_ptr<const ov::Model>& function) {
     return false;
 }
 
+bool is_large_language_model(const ov::Model& model) {
+    using namespace ov::pass::pattern;
+
+    const auto past = wrap_type<ov::op::v6::ReadValue>();
+    const auto convert_past = ov::pass::pattern::optional<ov::op::v0::Convert>(past);
+    const auto beam_idx = wrap_type<ov::op::v0::Parameter>();
+    const auto gather_past = wrap_type<ov::op::v8::Gather>({convert_past, beam_idx, wrap_type<ov::op::v0::Constant>()});
+    const auto gather_convert = ov::pass::pattern::optional<ov::op::v0::Convert>(gather_past);
+    const auto concat_past_input =
+        std::make_shared<ov::pass::pattern::op::Or>(OutputVector{convert_past, gather_convert});
+    const auto concat = wrap_type<ov::op::v0::Concat>({concat_past_input, any_input()});
+    const auto convert_present = ov::pass::pattern::optional<ov::op::v0::Convert>(concat);
+    const auto present = wrap_type<ov::op::v6::Assign>({convert_present});
+    const auto kvcache_matcher = std::make_shared<ov::pass::pattern::Matcher>(present, "KVCacheMatcher");
+
+    for (const auto& op : model.get_ops()) {
+        if (kvcache_matcher->match(op->output(0)) || ov::is_type<ov::op::PagedAttentionExtension>(op))
+            return true;
+    }
+    return false;
+}
+
 bool check_for_broadcast(const ov::PartialShape& ref_shape, const ov::PartialShape& other_shape) {
     if (ref_shape.rank().is_dynamic() || other_shape.rank().is_dynamic()) {
         return false;
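
Note: the matcher above encodes the canonical stateful KV-cache subgraph — ReadValue (past KV) -> optional Convert -> Gather over a beam_idx Parameter -> Concat with the current step's KV -> Assign — so a model is classified as an LLM if any op output matches it, or if the model contains a PagedAttentionExtension op. A minimal caller sketch follows; the model path and the gated branch are illustrative assumptions, and the helper is assumed to live in ov::op::util alongside the other utilities in this header:

// Hypothetical usage of the new helper; "llm.xml" is a placeholder path.
#include "openvino/openvino.hpp"
#include "transformations/utils/utils.hpp"

int main() {
    ov::Core core;
    const auto model = core.read_model("llm.xml");
    if (ov::op::util::is_large_language_model(*model)) {
        // e.g. enable KV-cache-aware or PagedAttention-specific handling here
    }
}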

src/plugins/intel_cpu/src/cpu_memory.cpp (+39 -13)

@@ -9,6 +9,7 @@
 #include "memory_desc/cpu_memory_desc_utils.h"
 #include "nodes/common/cpu_memcpy.h"
 #include "nodes/reorder.h"
+#include "utils/bfloat16.hpp"
 #include "utils/debug_capabilities.h"
 #if defined(__linux__)
 #    include <sys/syscall.h> /* Definition of SYS_* constants */
@@ -30,19 +31,44 @@ BlockedMemoryDescPtr IMemory::getDescWithType<BlockedMemoryDesc, 0, 0>() const {
 }
 
 namespace {
-inline void setSubnormalsToZero(float* data, size_t size) {
+inline void setSubnormalsToZeroAndbf16Saturation(float* data, size_t size, bool ftz, bool bf16saturation) {
     uint32_t* u32data = reinterpret_cast<uint32_t*>(data);
-    for (size_t i = 0; i < size; ++i) {
-        if ((u32data[i] & (0xFF << 23)) == 0) {
-            u32data[i] = 0;
+    float* floatdata = reinterpret_cast<float*>(data);
+    if (ftz && bf16saturation) {
+        for (size_t i = 0; i < size; ++i) {
+            if ((u32data[i] & (0xFF << 23)) == 0) {
+                u32data[i] = 0;
+            } else if (!std::isnan(floatdata[i]) && !std::isinf(floatdata[i])) {
+                floatdata[i] = (floatdata[i] < static_cast<float>(std::numeric_limits<ov::bfloat16>::lowest()))
+                                   ? static_cast<float>(std::numeric_limits<ov::bfloat16>::lowest())
+                               : (floatdata[i] > static_cast<float>(std::numeric_limits<ov::bfloat16>::max()))
+                                   ? static_cast<float>(std::numeric_limits<ov::bfloat16>::max())
+                                   : floatdata[i];
+            }
+        }
+    } else if (ftz) {
+        for (size_t i = 0; i < size; ++i) {
+            if ((u32data[i] & (0xFF << 23)) == 0) {
+                u32data[i] = 0;
+            }
+        }
+    } else if (bf16saturation) {
+        for (size_t i = 0; i < size; ++i) {
+            if (!std::isnan(floatdata[i]) && !std::isinf(floatdata[i])) {
+                floatdata[i] = (floatdata[i] < static_cast<float>(std::numeric_limits<ov::bfloat16>::lowest()))
+                                   ? static_cast<float>(std::numeric_limits<ov::bfloat16>::lowest())
+                               : (floatdata[i] > static_cast<float>(std::numeric_limits<ov::bfloat16>::max()))
+                                   ? static_cast<float>(std::numeric_limits<ov::bfloat16>::max())
+                                   : floatdata[i];
+            }
         }
     }
 }
 
-void transferData(const IMemory& src, const IMemory& dst, bool ftz) {
+void transferData(const IMemory& src, const IMemory& dst, bool ftz, bool bf16saturation) {
     node::Reorder::reorderData(src, dst);
 
-    if (!ftz) {
+    if (!ftz && !bf16saturation) {
         return;
     }
     if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() != ov::element::f32) {
@@ -62,7 +88,7 @@ void transferData(const IMemory& src, const IMemory& dst, bool ftz) {
     // actual FTZ
     auto* memData = static_cast<float*>(dst.getData());
     memData += offset;
-    setSubnormalsToZero(memData, dst.getSize() / sizeof(float));
+    setSubnormalsToZeroAndbf16Saturation(memData, dst.getSize() / sizeof(float), ftz, bf16saturation);
 }
 
 }  // namespace
@@ -125,11 +151,11 @@ void Memory::create(MemoryDescPtr desc, const void* data, bool pads_zeroing) {
     }
 }
 
-void Memory::load(const IMemory& src, bool ftz) const {
+void Memory::load(const IMemory& src, bool ftz, bool bf16saturation) const {
     if (src.getDesc().getPrecision() == element::string) {
         OPENVINO_THROW("[CPU] Memory object cannot load string data.");
     }
-    transferData(src, *this, ftz);
+    transferData(src, *this, ftz, bf16saturation);
 }
 
 void Memory::nullify() {
@@ -273,12 +299,12 @@ StringMemory::StringMemory(dnnl::engine engine, MemoryDescPtr desc, const void*
     }
 }
 
-void StringMemory::load(const IMemory& src, bool ftz) const {
+void StringMemory::load(const IMemory& src, bool ftz, bool bf16saturation) const {
     if (src.getDesc().getPrecision() != element::string) {
         OPENVINO_THROW("[CPU] String memory cannot load a non-string object.");
     }
 
-    transferData(src, *this, false);
+    transferData(src, *this, false, false);
 }
 
 void* StringMemory::getData() const {
@@ -472,11 +498,11 @@ void StaticMemory::redefineDesc(MemoryDescPtr desc) {
     OPENVINO_THROW("Unexpected: Memory descriptor may not be modified in StaticMemory object");
 }
 
-void StaticMemory::load(const IMemory& src, bool ftz) const {
+void StaticMemory::load(const IMemory& src, bool ftz, bool bf16saturation) const {
     if (src.getDesc().getPrecision() == element::string) {
         OPENVINO_THROW("[CPU] StaticMemory cannot load string data.");
     }
-    transferData(src, *this, ftz);
+    transferData(src, *this, ftz, bf16saturation);
 }
 
 MemoryBlockPtr StaticMemory::getMemoryBlock() const {
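
Note: setSubnormalsToZeroAndbf16Saturation fuses two scalar transforms over the f32 buffer — flush-to-zero, which zeroes any value whose exponent bits are all zero (subnormals and signed zeros, detected via (u32 & (0xFF << 23)) == 0), and bf16 saturation, which clamps finite values into the bf16 range so a later f32->bf16 conversion cannot overflow to +/-Inf. A standalone sketch of the same logic; the bf16_max constant stands in for std::numeric_limits<ov::bfloat16>::max() and is an assumption here:

// Illustration of the fused FTZ + bf16-saturation pass on three samples.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
    constexpr float bf16_max = 3.38953139e38f;   // largest finite bf16 value (assumed)
    float data[] = {1e-39f, 3.4e38f, 1.5f};      // subnormal, above bf16 range, normal
    for (float& x : data) {
        uint32_t u;
        std::memcpy(&u, &x, sizeof(u));
        if ((u & (0xFFu << 23)) == 0) {
            x = 0.0f;                            // FTZ: exponent bits are all zero
        } else if (!std::isnan(x) && !std::isinf(x)) {
            x = std::min(std::max(x, -bf16_max), bf16_max);  // bf16 saturation
        }
        std::cout << x << '\n';                  // prints 0, 3.38953e+38, 1.5
    }
}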

src/plugins/intel_cpu/src/cpu_memory.h (+4 -4)

@@ -188,7 +188,7 @@ class IMemory {
     // Caution!!! This action invalidates the previous data layout. The old data may become unreachable.
     virtual void redefineDesc(MemoryDescPtr desc) = 0;
 
-    virtual void load(const IMemory& src, bool ftz) const = 0;
+    virtual void load(const IMemory& src, bool ftz, bool bf16saturation) const = 0;
 
     virtual MemoryBlockPtr getMemoryBlock() const = 0;
 
@@ -260,7 +260,7 @@ class StaticMemory final : public IMemory {
     // Always throws since a static memory descriptor should not be modified
     void redefineDesc(MemoryDescPtr desc) override;
 
-    void load(const IMemory& src, bool ftz) const override;
+    void load(const IMemory& src, bool ftz, bool bf16saturation) const override;
 
     MemoryBlockPtr getMemoryBlock() const override;
 
@@ -315,7 +315,7 @@ class Memory : public IMemory {
 
     void redefineDesc(MemoryDescPtr desc) override;
 
-    void load(const IMemory& src, bool ftz) const override;
+    void load(const IMemory& src, bool ftz, bool bf16saturation) const override;
     void nullify() override;
 
     dnnl::engine getEngine() const {
@@ -421,7 +421,7 @@ class StringMemory : public IMemory {
 
     void redefineDesc(MemoryDescPtr desc) override;
 
-    void load(const IMemory& src, bool ftz) const override;
+    void load(const IMemory& src, bool ftz, bool bf16saturation) const override;
 
    MemoryBlockPtr getMemoryBlock() const override;
 
src/plugins/intel_cpu/src/dnnl_postops_composer.cpp (+1 -1)

@@ -659,7 +659,7 @@ static MemoryPtr prepackDecompressionParams(const MemoryCPtr& paramsPtr,
                                             srcFormat);
     auto srcMem = std::make_shared<Memory>(engine, srcMemoryDesc, paramsPtr->getData());
 
-    dstMem->load(*srcMem, true);
+    dstMem->load(*srcMem, true, false);
     return dstMem;
 }

src/plugins/intel_cpu/src/emitters/plugin/x64/jit_bf16_emitters.hpp (+6 -1)

@@ -17,7 +17,12 @@ class jit_uni_vcvtneps2bf16 : public jit_emitter {
                           conversion_mode mode = conversion_mode::default_mode)
         : jit_emitter(host, host_isa, exec_prc),
           mode_(mode) {
-        prepare_table();
+        // the table is only needed in saturation_mode or on platforms without avx512_core_bf16/avx2_vnni_2
+        if ((!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16) &&
+             !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2_vnni_2)) ||
+            mode_ == conversion_mode::saturation_mode) {
+            prepare_table();
+        }
     }
 
     size_t get_inputs_num() const override {
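
Note: on avx512_core_bf16 and avx2_vnni_2 the native vcvtneps2bf16 instruction performs the rounding itself, so the emitter's constant table is only needed for the software-emulation path or for saturation_mode clamping — hence the narrowed prepare_table() call. A plausible scalar equivalent of the emulated round-to-nearest-even conversion, written for illustration only (NaN handling omitted):

// Classic bit trick for fp32 -> bf16 with round-to-nearest-even.
#include <cstdint>
#include <cstring>

uint16_t f32_to_bf16_rne(float f) {
    uint32_t x;
    std::memcpy(&x, &f, sizeof(x));
    x += 0x7FFFu + ((x >> 16) & 1u);  // rounding bias; ties round to even
    return static_cast<uint16_t>(x >> 16);
}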

src/plugins/intel_cpu/src/graph.cpp (+4 -4)

@@ -1201,10 +1201,10 @@ void Graph::PushInputData(const std::size_t& index, const ov::SoPtr<ITensor>& in
 
     if (actualDesc->getPrecision() == element::string) {
         StringMemory ext_mem(getEngine(), ext_tensor_desc, ext_data_ptr);
-        edgeMemory->load(ext_mem, false);
+        edgeMemory->load(ext_mem, false, false);
     } else if (!actualDesc->isCompatible(*ext_tensor_desc)) {
         Memory ext_mem(getEngine(), ext_tensor_desc, ext_data_ptr, false);
-        edgeMemory->load(ext_mem, false);
+        edgeMemory->load(ext_mem, false, false);
     } else {
         size_t size_to_copy = ext_tensor_desc->getCurrentMemSize();
         cpu_parallel_memcpy(inter_data_ptr, ext_data_ptr, size_to_copy);
@@ -1311,10 +1311,10 @@ void Graph::PullOutputData(std::unordered_map<std::size_t, ov::SoPtr<ITensor>>&
 
     if (actualDesc->getPrecision() == element::string) {
         StringMemory outBloMem(getEngine(), expected_desc_ptr, ext_blob_ptr);
-        outBloMem.load(intr_blob, false);
+        outBloMem.load(intr_blob, false, false);
     } else if (!actualDesc->isCompatible(*expected_desc_ptr) && !isScalarOutput) {
         Memory outBloMem(getEngine(), expected_desc_ptr, ext_blob_ptr, false);
-        outBloMem.load(intr_blob, false);
+        outBloMem.load(intr_blob, false, false);
     } else {
         OPENVINO_ASSERT(srcPrec == dstPrec,
                         "The precision of the CPU output tensor index",

src/plugins/intel_cpu/src/memory_state.cpp (+3 -3)

@@ -57,7 +57,7 @@ void VariableStateBase::set_state_impl(const ov::SoPtr<ov::ITensor>& state) {
     auto src = state->data();
 
     Memory mem(get_engine(), state_desc, src);
-    input_mem()->load(mem, true);
+    input_mem()->load(mem, true, false);
     reset_state_flag = false;
 }
 
@@ -96,7 +96,7 @@ ov::SoPtr<ov::ITensor> VariableStateBase::get_state() const {
 
     // reorder
     auto mem = std::make_shared<Memory>(get_engine(), current_ext_desc);
-    mem->load(*(internal_state_mem()), true);
+    mem->load(*(internal_state_mem()), true, false);
     return std::make_shared<Tensor>(mem);
 }
 
@@ -312,7 +312,7 @@ void VariableStateKVcache::set_state_impl(const ov::SoPtr<ov::ITensor>& state) {
                       m_scale_zp.at<float>({m, b, h, size_t{1}}));
         });
     } else {
-        m_internal_mem->load(external_mem, true);
+        m_internal_mem->load(external_mem, true, false);
     }
 
     // 2. Reset the beam search table

src/plugins/intel_cpu/src/nodes/conv.cpp (+1 -1)

@@ -1687,7 +1687,7 @@ void Convolution::executeDynamicImpl(const dnnl::stream& strm) {
         const auto& outMem = out->getParentEdgeAt(0)->getMemory();
         auto convOutMem = getDstMemoryAtPort(0);
         Node::redefineOutputMemory({outMem.getStaticDims()});
-        convOutMem->load(outMem, true);
+        convOutMem->load(outMem, true, false);
     }
 }

src/plugins/intel_cpu/src/nodes/eltwise.cpp (+2 -25)

@@ -360,7 +360,6 @@ struct EltwiseKey {
     ov::element::Type outPrc;
     dnnl::post_ops postOps;
     EltwiseImplType implType;
-    bool doOutputSaturation;
 
     size_t hash() const {
         using namespace dnnl::impl;
@@ -396,10 +395,6 @@ struct EltwiseKey {
         seed = hash_combine(seed, outPrc.hash());
         seed = get_post_op_hash(seed, *postOps.get());
         seed = hash_combine(seed, implType);
-
-        if (outPrc == ov::element::bf16) {
-            seed = hash_combine(seed, doOutputSaturation);
-        }
         return seed;
     }
 
@@ -427,9 +422,6 @@ struct EltwiseKey {
                 result = result && (inpDims[i] == rhs.inpDims[i]);
             }
         }
-        if (doOutputSaturation != rhs.doOutputSaturation) {
-            return false;
-        }
     }
 
     return result;
@@ -462,8 +454,7 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor {
                        const std::vector<ov::element::Type>& inpPrc,
                        const ov::element::Type& outPrc,
                        const dnnl::post_ops& post_ops,
-                       bool useRuntimePtrs,
-                       bool doOutputSaturation) {
+                       bool useRuntimePtrs) {
         auto collapseLastDims = [](std::vector<size_t>& dims, int dimsToCollapse) {
             for (size_t i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) {
                 dims[dims.size() - 1] *= dims[i];
@@ -657,7 +648,6 @@ class EltwiseJitExecutor : public Eltwise::IEltwiseExecutor {
         jep.dst_prc = outPrc;
         jep.work_amount = jep.dst_size = jep.dims.back();
         jep.oc_size = oc_size;
-        jep.do_output_saturation = doOutputSaturation;
 
         std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(), [](size_t& offset) {
             return offset * sizeof(float);
@@ -1189,8 +1179,7 @@ static Eltwise::executorPtr buildExecutor(const EltwiseKey& key) {
                        key.inpPrc,
                        key.outPrc,
                        key.postOps,
-                       key.implType == EltwiseImplType::optimizedShapeAgnostic,
-                       key.doOutputSaturation);
+                       key.implType == EltwiseImplType::optimizedShapeAgnostic);
 }
 
 bool Eltwise::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept {
@@ -1906,18 +1895,6 @@ void Eltwise::prepareParams() {
         }
     }
 
-    // FP32 constant inputs may contain values out of BF16 representable range. In case output precision is BF16 we
-    // choose "saturation" mode for fp32->bf16 conversion procedure to prevent getting -Inf/+Inf values in the
-    // outputs. Since "saturation" conversion is more time consuming, better solution would be to clamp constants on
-    // compilation stage (ticket: 159589).
-    key.doOutputSaturation = false;
-    for (size_t i = 0; i < getParentEdges().size(); i++) {
-        if (getParentEdgeAt(i)->getParent()->isConstant()) {
-            key.doOutputSaturation = true;
-            break;
-        }
-    }
-
     auto cache = context->getParamsCache();
     auto result = cache->getOrCreate(key, buildExecutor);
     execPtr = result.first;
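
Note: the deleted block (and its comment referencing ticket 159589) existed because an fp32 constant can sit above the bf16 range, and a plain round-to-nearest f32->bf16 conversion then produces +/-Inf; with this commit the saturation decision no longer lives in the eltwise cache key. A standalone demonstration of that failure mode, not plugin code; the clamp value 0x7F7F is bf16 max:

// 3.4e38f is a valid fp32 value but rounds past bf16 max to exponent 0xFF,
// i.e. +Inf (0x7F80), unless the conversion saturates.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    float v = 3.4e38f;
    uint32_t x;
    std::memcpy(&x, &v, sizeof(x));
    x += 0x7FFFu + ((x >> 16) & 1u);                 // round to nearest even
    uint16_t bf16 = static_cast<uint16_t>(x >> 16);
    std::printf("rne:      0x%04X\n", bf16);         // 0x7F80 (+Inf)
    if (bf16 == 0x7F80u)
        bf16 = 0x7F7Fu;                              // saturating variant clamps
    std::printf("saturate: 0x%04X\n", bf16);         // 0x7F7F (bf16 max)
}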
