Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[intel-npu] Support new internal cached_model_buffer config for memory mapped cached blobs #27822

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8c54b3f
Squash commits from rebase
alexandruenache1111 Dec 16, 2024
a168ca9
Few important changes:
alexandruenache1111 Dec 18, 2024
160ce46
Added test for reading any invalid metadata version
alexandruenache1111 Dec 20, 2024
ebb78a4
Updated writeAndReadInvalidMetadataVersion test body
alexandruenache1111 Jan 8, 2025
45e3fef
Move version functions as static methods inside MetadataBase
alexandruenache1111 Jan 9, 2025
37c53b9
Nitpicks
alexandruenache1111 Jan 9, 2025
a1b89e6
Move version field to MetadataBase
alexandruenache1111 Jan 13, 2025
8aff5a6
Add `ov::internal::caching_with_mmap` property logic
MirceaDan99 Nov 14, 2024
0544093
Refactor compiler type selection
MirceaDan99 Nov 14, 2024
f09b447
Fix OV cache header not being removed from blob for memory mapped cac…
MirceaDan99 Nov 14, 2024
e8b6b72
Keep `shared_ptr` of blob in IGraph to fix `export_model` for import …
MirceaDan99 Nov 20, 2024
1ffb961
Refactor changes for CIP & Drop `parse` function from `ICompilerAdapt…
MirceaDan99 Nov 20, 2024
2793039
Update plugin API to import model with mmap buffer
olpipi Nov 19, 2024
27f8ec7
Use new `import_model` with `model_buffer` API
MirceaDan99 Nov 21, 2024
e53c3ea
New fix for adding offset to `model_buffer` relative to end position …
MirceaDan99 Nov 22, 2024
42ca42e
Fix `std::vector` being moved after accessing its `.data()` and `.size…
MirceaDan99 Nov 22, 2024
4932d30
Refactor `getGraphHandle` to drop dependency to `ov::AlignedBuffer`
MirceaDan99 Nov 25, 2024
aa44d2e
Refactor `import_model` new API to accept only either `std::istream` …
MirceaDan99 Nov 26, 2024
cf006c2
Re-add `DriverGraph::release_blob` method and adapt to `ov::AlignedBu…
MirceaDan99 Nov 26, 2024
67cd64c
Code clean-up
MirceaDan99 Nov 27, 2024
2f404c1
Revert changes in new `import_model` API, so `NPU` plugin will have `…
MirceaDan99 Nov 27, 2024
c8f4abf
Add `BlobContainer` class and derivatives for each `std::vector<uint8_t…
MirceaDan99 Nov 27, 2024
6badc9a
Fix clang formats
MirceaDan99 Dec 9, 2024
1077884
Use alternative from `PR #27981` instead for memory mapped buffers
MirceaDan99 Dec 11, 2024
0eea082
Add suggested changes
MirceaDan99 Dec 12, 2024
0f8caef
Prepare `BlobContainerAlignedBuffer` for `OV versioning metadata`
MirceaDan99 Dec 12, 2024
2d290ef
Fix broken stream processed by NPUW
MirceaDan99 Jan 14, 2025
49e880b
Fix offsets mismatch for HETERO plugin blob headers
MirceaDan99 Jan 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

#include "openvino/runtime/shared_buffer.hpp"

namespace intel_npu {

/**
 * @brief Abstract interface over the storage that holds a compiled blob.
 *
 * Every member is pure virtual; concrete containers decide how the bytes are
 * stored and whether they can be dropped on request.
 */
class BlobContainer {
public:
/**
 * @brief Returns a pointer to the first byte of the blob held by the container.
 */
virtual void* get_ptr() = 0;

/**
 * @brief Returns the size of the blob, in bytes.
 */
virtual size_t size() const = 0;

/**
 * @brief Asks the container to drop the blob storage it holds.
 * @return true if the container released its storage, false if it did not
 *         (callers treat false as "blob is still resident").
 */
virtual bool release_from_memory() = 0;

// Virtual so that containers can be destroyed through a BlobContainer pointer.
virtual ~BlobContainer() = default;
};

/**
 * @brief BlobContainer implementation that owns the blob bytes in a std::vector<uint8_t>.
 *
 * The vector passed to the constructor is moved into the container, so the
 * container is the sole owner of the bytes afterwards.
 */
class BlobContainerVector : public BlobContainer {
public:
    /**
     * @brief Takes ownership of the given blob bytes.
     * @param blob Blob content; taken by value and moved into the container.
     */
    // NOTE(review): single-argument constructor is not `explicit`; left as-is to keep the
    // existing interface, but consider making it explicit if no caller relies on the conversion.
    BlobContainerVector(std::vector<uint8_t> blob) : _ownershipBlob(std::move(blob)) {}

    /**
     * @brief Returns a pointer to the first byte of the owned vector.
     *
     * After release_from_memory() the vector is empty and the returned pointer
     * must not be dereferenced.
     */
    void* get_ptr() override {
        // uint8_t* converts implicitly to void*; no reinterpret_cast is required.
        return _ownershipBlob.data();
    }

    /**
     * @brief Returns the number of bytes currently held (0 after release_from_memory()).
     */
    size_t size() const override {
        return _ownershipBlob.size();
    }

    /**
     * @brief Frees the owned bytes.
     * @return Always true.
     */
    bool release_from_memory() override {
        // shrink_to_fit() is only a non-binding request, so clear() + shrink_to_fit() does not
        // guarantee that the allocation is returned. Swapping with an empty temporary hands the
        // storage to the temporary, whose destructor frees it unconditionally.
        std::vector<uint8_t>().swap(_ownershipBlob);
        return true;
    }

private:
    std::vector<uint8_t> _ownershipBlob;
};

class BlobContainerAlignedBuffer : public BlobContainer {
public:
BlobContainerAlignedBuffer(const std::shared_ptr<ov::AlignedBuffer>& blobSO,
size_t ovHeaderOffset,
uint64_t blobSize)
: _blobSize(blobSize),
_ovHeaderOffset(ovHeaderOffset),
_ownershipBlob(blobSO) {}

void* get_ptr() override {
return _ownershipBlob->get_ptr(_ovHeaderOffset);
}

size_t size() const override {
return _blobSize;
}

bool release_from_memory() override {
return false;
}

private:
uint64_t _blobSize;
size_t _ovHeaderOffset;
std::shared_ptr<ov::AlignedBuffer> _ownershipBlob;
};

} // namespace intel_npu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class ICompilerAdapter {
public:
virtual std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model,
const Config& config) const = 0;
virtual std::shared_ptr<IGraph> parse(std::vector<uint8_t> network, const Config& config) const = 0;
virtual std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const = 0;
virtual ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const = 0;
virtual uint32_t get_version() const = 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <mutex>
#include <vector>

#include "intel_npu/common/blob_container.hpp"
#include "intel_npu/network_metadata.hpp"
#include "intel_npu/utils/zero/zero_init.hpp"
#include "intel_npu/utils/zero/zero_utils.hpp"
Expand All @@ -21,7 +22,7 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
IGraph(ze_graph_handle_t handle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob);
std::unique_ptr<BlobContainer> blobPtr);

virtual size_t export_blob(std::ostream& stream) const = 0;

Expand Down Expand Up @@ -89,7 +90,7 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
// first inference starts running
std::mutex _mutex;

std::vector<uint8_t> _blob;
std::unique_ptr<BlobContainer> _blobPtr;

uint32_t _unique_id = 0;
uint32_t _last_submitted_id;
Expand Down
9 changes: 3 additions & 6 deletions src/plugins/intel_npu/src/common/src/igraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,11 @@ namespace intel_npu {
IGraph::IGraph(ze_graph_handle_t handle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob)
std::unique_ptr<BlobContainer> blobPtr)
: _handle(handle),
_metadata(std::move(metadata)),
_logger("IGraph", config.get<LOG_LEVEL>()) {
if (blob.has_value()) {
_blob = std::move(*blob);
}
}
_blobPtr(std::move(blobPtr)),
_logger("IGraph", config.get<LOG_LEVEL>()) {}

const NetworkMetadata& IGraph::get_metadata() const {
return _metadata;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class DriverCompilerAdapter final : public ICompilerAdapter {

std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

std::shared_ptr<IGraph> parse(std::vector<uint8_t> network, const Config& config) const override;
std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const override;

ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class DriverGraph final : public IGraph {
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob);
std::unique_ptr<BlobContainer> blobPtr);

size_t export_blob(std::ostream& stream) const override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class PluginCompilerAdapter final : public ICompilerAdapter {

std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

std::shared_ptr<IGraph> parse(std::vector<uint8_t> network, const Config& config) const override;
std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const override;

ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class PluginGraph final : public IGraph {
const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct,
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
std::vector<uint8_t> blob,
std::unique_ptr<BlobContainer> blobPtr,
const Config& config);

size_t export_blob(std::ostream& stream) const override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class ZeGraphExtWrappers {
const std::string& buildFlags,
const uint32_t& flags) const;

ze_graph_handle_t getGraphHandle(const std::vector<uint8_t>& network) const;
ze_graph_handle_t getGraphHandle(const uint8_t& data, size_t size) const;

NetworkMetadata getNetworkMeta(ze_graph_handle_t graphHandle) const;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,16 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::compile(const std::shared_ptr<con
graphHandle,
std::move(networkMeta),
config,
std::nullopt);
nullptr);
}

std::shared_ptr<IGraph> DriverCompilerAdapter::parse(std::vector<uint8_t> network, const Config& config) const {
std::shared_ptr<IGraph> DriverCompilerAdapter::parse(std::unique_ptr<BlobContainer> blobPtr,
const Config& config) const {
OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "DriverCompilerAdapter", "parse");

_logger.debug("parse start");
ze_graph_handle_t graphHandle = _zeGraphExt->getGraphHandle(network);
ze_graph_handle_t graphHandle =
_zeGraphExt->getGraphHandle(*reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()), blobPtr->size());
_logger.debug("parse end");

OV_ITT_TASK_NEXT(PARSE_BLOB, "getNetworkMeta");
Expand All @@ -218,7 +220,7 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::parse(std::vector<uint8_t> networ
graphHandle,
std::move(networkMeta),
config,
std::optional<std::vector<uint8_t>>(std::move(network)));
std::move(blobPtr));
}

ov::SupportedOpsMap DriverCompilerAdapter::query(const std::shared_ptr<const ov::Model>& model,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ DriverGraph::DriverGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob)
: IGraph(graphHandle, std::move(metadata), config, std::move(blob)),
std::unique_ptr<BlobContainer> blobPtr)
: IGraph(graphHandle, std::move(metadata), config, std::move(blobPtr)),
_zeGraphExt(zeGraphExt),
_zeroInitStruct(zeroInitStruct),
_logger("DriverGraph", config.get<LOG_LEVEL>()) {
Expand All @@ -34,7 +34,7 @@ DriverGraph::DriverGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,

size_t DriverGraph::export_blob(std::ostream& stream) const {
const uint8_t* blobPtr = nullptr;
size_t blobSize;
size_t blobSize = -1;
std::vector<uint8_t> blob;

if (_blobIsReleased) {
Expand Down Expand Up @@ -140,7 +140,7 @@ void DriverGraph::initialize(const Config& config) {
}

bool DriverGraph::release_blob(const Config& config) {
if (_blob.empty() || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 ||
if (_blobPtr == nullptr || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 ||
config.get<PERF_COUNT>()) {
return false;
}
Expand All @@ -153,8 +153,9 @@ bool DriverGraph::release_blob(const Config& config) {
return false;
}

_blob.clear();
_blob.shrink_to_fit();
if (!_blobPtr->release_from_memory()) {
return false;
}

_logger.debug("Blob is released");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,16 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::compile(const std::shared_ptr<con

_logger.debug("compile start");
auto networkDesc = _compiler->compile(model, config);
auto blobPtr = std::make_unique<BlobContainerVector>(std::move(networkDesc.compiledNetwork));
_logger.debug("compile end");

ze_graph_handle_t graphHandle = nullptr;

if (_zeGraphExt) {
// Depending on the config, we may get an error when trying to get the graph handle from the compiled network
try {
graphHandle = _zeGraphExt->getGraphHandle(networkDesc.compiledNetwork);
graphHandle =
_zeGraphExt->getGraphHandle(*reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()), blobPtr->size());
} catch (...) {
_logger.info("Failed to obtain the level zero graph handle. Inference requests for this model are not "
"allowed. Only exports are available");
Expand All @@ -99,29 +101,36 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::compile(const std::shared_ptr<con
_zeroInitStruct,
graphHandle,
std::move(networkDesc.metadata),
std::move(networkDesc.compiledNetwork),
std::move(blobPtr),
config);
}

std::shared_ptr<IGraph> PluginCompilerAdapter::parse(std::vector<uint8_t> network, const Config& config) const {
std::shared_ptr<IGraph> PluginCompilerAdapter::parse(std::unique_ptr<BlobContainer> blobPtr,
const Config& config) const {
OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "PluginCompilerAdapter", "parse");

_logger.debug("parse start");
std::vector<uint8_t> network(blobPtr->size());
network.assign(reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()),
reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()) + blobPtr->size());
auto networkMeta = _compiler->parse(network, config);
network.clear();
network.shrink_to_fit();
_logger.debug("parse end");

ze_graph_handle_t graphHandle = nullptr;

if (_zeGraphExt) {
graphHandle = _zeGraphExt->getGraphHandle(network);
graphHandle =
_zeGraphExt->getGraphHandle(*reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()), blobPtr->size());
}

return std::make_shared<PluginGraph>(_zeGraphExt,
_compiler,
_zeroInitStruct,
graphHandle,
std::move(networkMeta),
std::move(network),
std::move(blobPtr),
config);
}

Expand Down
19 changes: 12 additions & 7 deletions src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ PluginGraph::PluginGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,
const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct,
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
std::vector<uint8_t> blob,
std::unique_ptr<BlobContainer> blobPtr,
const Config& config)
: IGraph(graphHandle, std::move(metadata), config, std::optional<std::vector<uint8_t>>(std::move(blob))),
: IGraph(graphHandle, std::move(metadata), config, std::move(blobPtr)),
_zeGraphExt(zeGraphExt),
_zeroInitStruct(zeroInitStruct),
_compiler(compiler),
Expand All @@ -31,7 +31,7 @@ PluginGraph::PluginGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,
}

size_t PluginGraph::export_blob(std::ostream& stream) const {
stream.write(reinterpret_cast<const char*>(_blob.data()), _blob.size());
stream.write(reinterpret_cast<const char*>(_blobPtr->get_ptr()), _blobPtr->size());

if (!stream) {
_logger.error("Write blob to stream failed. Blob is broken!");
Expand All @@ -40,21 +40,26 @@ size_t PluginGraph::export_blob(std::ostream& stream) const {

if (_logger.level() >= ov::log::Level::INFO) {
std::uint32_t result = 1171117u;
for (const uint8_t* it = _blob.data(); it != _blob.data() + _blob.size(); ++it) {
for (const uint8_t* it = reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr());
it != reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr()) + _blobPtr->size();
++it) {
result = ((result << 7) + result) + static_cast<uint32_t>(*it);
}

std::stringstream str;
str << "Blob size: " << _blob.size() << ", hash: " << std::hex << result;
str << "Blob size: " << _blobPtr->size() << ", hash: " << std::hex << result;
_logger.info(str.str().c_str());
}
_logger.info("Write blob to stream successfully.");
return _blob.size();
return _blobPtr->size();
}

std::vector<ov::ProfilingInfo> PluginGraph::process_profiling_output(const std::vector<uint8_t>& profData,
const Config& config) const {
return _compiler->process_profiling_output(profData, _blob, config);
std::vector<uint8_t> blob(_blobPtr->size());
blob.assign(reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr()),
reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr()) + _blobPtr->size());
return _compiler->process_profiling_output(profData, blob, config);
}

void PluginGraph::set_argument_value(uint32_t argi, const void* argv) const {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -363,19 +363,15 @@ ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(std::pair<size_t, std::shar
return graphHandle;
}

ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const std::vector<uint8_t>& network) const {
ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const uint8_t& blobData, size_t blobSize) const {
ze_graph_handle_t graphHandle;

if (network.empty()) {
if (blobSize == 0) {
OPENVINO_THROW("Empty blob");
}

ze_graph_desc_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES,
nullptr,
ZE_GRAPH_FORMAT_NATIVE,
network.size(),
network.data(),
nullptr};
ze_graph_desc_t desc =
{ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, nullptr, ZE_GRAPH_FORMAT_NATIVE, blobSize, &blobData, nullptr};

_logger.debug("getGraphHandle - perform pfnCreate");
auto result = _zeroInitStruct->getGraphDdiTable().pfnCreate(_zeroInitStruct->getContext(),
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_npu/src/plugin/include/metrics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ class Metrics final {
ov::intel_npu::batch_mode.name(),
ov::hint::execution_mode.name()};

const std::vector<ov::PropertyName> _internalSupportedProperties = {ov::internal::caching_properties.name()};
const std::vector<ov::PropertyName> _internalSupportedProperties = {ov::internal::caching_properties.name(),
ov::internal::caching_with_mmap.name()};

// Metric to provide a hint for a range for number of async infer requests. (bottom bound, upper bound, step)
const std::tuple<uint32_t, uint32_t, uint32_t> _rangeForAsyncInferRequests{1u, 10u, 1u};
Expand Down
Loading
Loading