Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPUW] Serialization fixes #28442

Merged
merged 17 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,19 @@ std::shared_ptr<ov::npuw::ICompiledModel> ov::npuw::ICompiledModel::create(
auto use_llm_key = ov::intel_npu::npuw::llm::enabled.name();
if (properties.count(use_llm_key) && properties.at(use_llm_key).as<bool>() == true) {
LOG_INFO("ov::npuw::LLMCompiledModel will be created.");
compiled_model = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, properties);
// Drop CACHE_DIR from the config
// If it's present we will be utilizing LLMCompiledModel's import
// and not the underlying models and submodels
auto config = properties;
config.erase(ov::cache_dir.name());
compiled_model = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, config);
dmatveev marked this conversation as resolved.
Show resolved Hide resolved
} else {
LOG_INFO("ov::npuw::CompiledModel will be created.");
// CACHE_DIR isn't supported with NPU_USE_NPUW
if (properties.count(ov::cache_dir.name())) {
OPENVINO_THROW(
"Option 'CACHE_DIR' is not supported with configuration: NPU_USE_NPUW : YES, NPUW_LLM : NO");
}
pre_load_transform(model, properties);
compiled_model = std::make_shared<ov::npuw::CompiledModel>(model, plugin, properties);
}
Expand Down Expand Up @@ -611,6 +621,12 @@ void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {

// Write config
write(stream, m_cfg);
// FIXME: utilize overload instead
write(stream, m_non_npuw_props.size());
for (const auto& p : m_non_npuw_props) {
write(stream, p.first);
write_any(stream, p.second);
}

// Serialize compiled submodels
write(stream, m_compiled_submodels.size());
Expand Down Expand Up @@ -671,6 +687,18 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(

// Deserialize config
read(stream, compiled->m_cfg);
compiled->m_cfg.parseEnvVars();
// FIXME: utilize overload instead
std::size_t props_size;
read(stream, props_size);
for (std::size_t i = 0; i < props_size; ++i) {
std::string key;
read(stream, key);
ov::Any val;
read_any(stream, val);
compiled->m_non_npuw_props[key] = val;
}
compiled->implement_properties();

// Deserialize compiled submodels
std::size_t subm_size = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -611,21 +611,21 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial
if (vmajor != OPENVINO_VERSION_MAJOR || vminor != OPENVINO_VERSION_MINOR || vpatch != OPENVINO_VERSION_PATCH ||
s11n_version != std::string(NPUW_SERIALIZATION_VERSION)) {
OPENVINO_THROW("This blobs was serialized with different OV version!",
" Serialized by OV ",
"\nSerialized by OV ",
vmajor,
'.',
vminor,
'.',
vpatch,
" Current OV version ",
"\nCurrent OV version ",
OPENVINO_VERSION_MAJOR,
'.',
OPENVINO_VERSION_MINOR,
'.',
OPENVINO_VERSION_PATCH,
" NPUW serialized by version ",
"\nNPUW serialized by version ",
s11n_version,
" NPUW current serialization version ",
"\nNPUW current serialization version ",
NPUW_SERIALIZATION_VERSION);
}

Expand Down Expand Up @@ -653,6 +653,7 @@ std::shared_ptr<ov::npuw::LLMCompiledModel> ov::npuw::LLMCompiledModel::deserial

// Deserialize config
read(stream, compiled->m_cfg);
compiled->implement_properties();

// Deserialize CompiledModels
compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin);
Expand Down
92 changes: 92 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ void ov::npuw::s11n::write(std::ostream& stream, const bool& var) {
stream.write(reinterpret_cast<const char*>(&var), sizeof var);
}

void ov::npuw::s11n::write(std::ostream& stream, const float& var) {
stream.write(reinterpret_cast<const char*>(&var), sizeof var);
}

void ov::npuw::s11n::write(std::ostream& stream, const ov::npuw::compiled::Spatial& var) {
using ov::npuw::s11n::write;

Expand Down Expand Up @@ -74,6 +78,43 @@ void ov::npuw::s11n::write(std::ostream& stream, const ov::Output<const ov::Node
write(stream, var.get_names());
}

enum class AnyType : int { STRING = 0, CHARS, INT, UINT32, INT64, UINT64, SIZET, FLOAT, BOOL };

void ov::npuw::s11n::write_any(std::ostream& stream, const ov::Any& var) {
// FIXME: figure out a proper way to serialize Any (for config)
AsyaPronina marked this conversation as resolved.
Show resolved Hide resolved
if (var.is<std::string>()) {
write(stream, static_cast<int>(AnyType::STRING));
write(stream, var.as<std::string>());
} else if (var.is<const char*>()) {
// FIXME: handle properly
write(stream, static_cast<int>(AnyType::CHARS));
write(stream, std::string(var.as<const char*>()));
} else if (var.is<std::size_t>()) {
write(stream, static_cast<int>(AnyType::SIZET));
write(stream, var.as<std::size_t>());
} else if (var.is<int>()) {
write(stream, static_cast<int>(AnyType::INT));
write(stream, var.as<int>());
} else if (var.is<int64_t>()) {
write(stream, static_cast<int>(AnyType::INT64));
write(stream, var.as<int64_t>());
} else if (var.is<uint32_t>()) {
write(stream, static_cast<int>(AnyType::UINT32));
write(stream, var.as<uint32_t>());
} else if (var.is<uint64_t>()) {
write(stream, static_cast<int>(AnyType::UINT64));
write(stream, var.as<uint64_t>());
} else if (var.is<float>()) {
write(stream, static_cast<int>(AnyType::FLOAT));
write(stream, var.as<float>());
} else if (var.is<bool>()) {
write(stream, static_cast<int>(AnyType::BOOL));
write(stream, var.as<bool>());
} else {
NPUW_ASSERT(false && "Unsupported type");
}
}

void ov::npuw::s11n::read(std::istream& stream, std::streampos& var) {
stream.read(reinterpret_cast<char*>(&var), sizeof var);
}
Expand All @@ -89,6 +130,10 @@ void ov::npuw::s11n::read(std::istream& stream, bool& var) {
stream.read(reinterpret_cast<char*>(&var), sizeof var);
}

void ov::npuw::s11n::read(std::istream& stream, float& var) {
stream.read(reinterpret_cast<char*>(&var), sizeof var);
}

void ov::npuw::s11n::read(std::istream& stream, ov::npuw::compiled::Spatial& var) {
using ov::npuw::s11n::read;

Expand Down Expand Up @@ -169,3 +214,50 @@ void ov::npuw::s11n::read(std::istream& stream, std::shared_ptr<ov::Node>& var)
var->output(0).set_tensor_ptr(tensor_dummy);
var->set_friendly_name(*names.begin()); // any_name ?
}

void ov::npuw::s11n::read_any(std::istream& stream, ov::Any& var) {
// FIXME: ugly, but cannot use .read(stream) here due to its usage of operator>>()
int type_int;
read(stream, type_int);
AnyType type = static_cast<AnyType>(type_int);
if (type == AnyType::STRING) {
std::string val;
read(stream, val);
var = std::move(val);
} else if (type == AnyType::CHARS) {
// FIXME: handle properly
std::string val;
AsyaPronina marked this conversation as resolved.
Show resolved Hide resolved
read(stream, val);
var = std::move(val);
} else if (type == AnyType::SIZET) {
std::size_t val;
read(stream, val);
var = val;
} else if (type == AnyType::INT) {
int val;
read(stream, val);
var = val;
} else if (type == AnyType::INT64) {
int64_t val;
read(stream, val);
var = val;
} else if (type == AnyType::UINT32) {
uint32_t val;
read(stream, val);
var = val;
} else if (type == AnyType::UINT64) {
uint64_t val;
read(stream, val);
var = val;
} else if (type == AnyType::FLOAT) {
float val;
read(stream, val);
var = val;
} else if (type == AnyType::BOOL) {
bool val;
read(stream, val);
var = val;
} else {
NPUW_ASSERT(false && "Unsupported type");
}
}
5 changes: 5 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/serialization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class Config;
namespace ov {

// Forward declaration
class Any;
class Node;
class Tensor;
template <class>
Expand All @@ -52,19 +53,23 @@ namespace s11n {
void write(std::ostream& stream, const std::streampos& var);
void write(std::ostream& stream, const std::string& var);
void write(std::ostream& stream, const bool& var);
void write(std::ostream& stream, const float& var);
void write(std::ostream& stream, const ov::npuw::compiled::Spatial& var);
void write(std::ostream& stream, const ov::Tensor& var);
void write(std::ostream& stream, const ::intel_npu::Config& var);
void write(std::ostream& stream, const ov::Output<const ov::Node>& var);
void write_any(std::ostream& stream, const ov::Any& var);

void read(std::istream& stream, std::streampos& var);
void read(std::istream& stream, std::string& var);
void read(std::istream& stream, bool& var);
void read(std::istream& stream, float& var);
void read(std::istream& stream, ov::npuw::compiled::Spatial& var);
void read(std::istream& stream, ov::Tensor& var);
void read(std::istream& stream, ::intel_npu::Config& var);
void read(std::istream& stream, std::shared_ptr<ov::op::v0::Parameter>& var);
void read(std::istream& stream, std::shared_ptr<ov::Node>& var);
void read_any(std::istream& stream, ov::Any& var);

// Forward declaration
template <typename T1, typename T2>
Expand Down
7 changes: 6 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,8 @@ void Bank::read_and_add_tensor(std::istream& stream, int64_t uid, const std::str
auto iter_device = device_bank.storage.find(uid);

if (iter_device != device_bank.storage.end()) {
// Already allocated
// Shouldn't be possible
NPUW_ASSERT(false);
return;
}
Comment on lines 221 to 225
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a change in the contract here, what was OK previously is now not OK, can you please explain more here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It occurs during bank deserialization. Our storage is an unordered_map with unique UIDs which all have been evaluated and serialized at this point. Thus it shouldn't be possible to encounter a duplicate UIDs during deserialization. Just a sanity check - this if can be removed altogether with no effect


Expand All @@ -234,6 +235,10 @@ void Bank::read_and_add_tensor(std::istream& stream, int64_t uid, const std::str
ov::Tensor allocated_tensor;

// FIXME: reading not via a dedicated function
bool is_intialized = false;
read(stream, is_intialized);
NPUW_ASSERT(is_intialized);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this too. Haven't you wrote empty tensors before?
if is_initialized must always be true, why write it to the stream at all?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed Tensor serialization in some of the previous PRs. It occurred when I was serializing empty scales/zerops. This change is just to align with the previous change

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't get it. Do you serialize empty tensors now or not?
If you don't, you don't need this bool in the stream either.

If you do, this code breaks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do but not for the bank

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this code doesn't break

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did I read this right you have a common routine to write a tensor to disk but you read it here in some specific way?
it doesn't sound good tbh.

you'd need to use the same function to read tensors as you write those. If that routine checks for false and does nothing, but you still need to raise an assert, you can make it in the caller code (= HERE) if the resulting tensor is still empty after read.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed on call, this change is caused by optimization (reading from stream directly to the L0 buffer)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed locally


std::string type_str;
read(stream, type_str);
ov::element::Type type(type_str);
Expand Down
4 changes: 0 additions & 4 deletions src/plugins/intel_npu/src/plugin/src/plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -622,10 +622,6 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
ov::AnyMap localProperties = properties;
if (localProperties.count(useNpuwKey)) {
if (localProperties.at(useNpuwKey).as<bool>() == true) {
// CACHE_DIR isn't supported with NPU_USE_NPUW
if (localProperties.count(ov::cache_dir.name()) || !_globalConfig.get<CACHE_DIR>().empty()) {
OPENVINO_THROW("Option 'CACHE_DIR' is not supported with NPU_USE_NPUW!");
}
return ov::npuw::ICompiledModel::create(model->clone(), shared_from_this(), localProperties);
} else {
// NPUW is disabled, remove the key from the properties
Expand Down
Loading