
Commit 3ff0943

NPUW: Function memory management (openvinotoolkit#27043)
### Details:
- Optimize out temporary results (activations) when possible

### Tickets:
- *ticket-id*
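In short, the change first simulates the submodel schedule to count how many times each function output will still be read, and then assigns each output either a freshly allocated tensor or one recycled from a producer whose readers have all run. Below is a minimal, self-contained C++ sketch of that reuse heuristic; the toy graph, names, and types are illustrative only, not the plugin's API, and unlike the real code it does not count the global model's outputs as extra reads (which is what keeps final results from being recycled).

```cpp
// Sketch: plan reads first, then reuse a storage slot once its producer
// has no reads left. Illustrative only; not the plugin's real types.
#include <cstddef>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

using LinkFrom = std::pair<std::size_t, std::size_t>;  // {subgraph, output port}

int main() {
    // Toy schedule: subgraph -> list of producers it reads (hypothetical).
    const std::vector<std::vector<LinkFrom>> reads = {
        {},         // Subgraph 0 reads nothing
        {{0, 0}},   // Subgraph 1 reads 0/0
        {{1, 0}},   // Subgraph 2 reads 1/0
        {{2, 0}},   // Subgraph 3 reads 2/0
    };

    // Pass 1 (simulation): count planned reads per produced tensor.
    std::map<LinkFrom, std::size_t> remaining_reads;
    for (const auto& list : reads) {
        for (const auto& from : list) {
            remaining_reads[from]++;
        }
    }

    // Pass 2 (assignment): reuse a slot once its owner is fully read.
    struct Slot { std::size_t id; LinkFrom owner; };
    std::vector<Slot> slots;
    for (std::size_t idx = 0; idx < reads.size(); idx++) {
        const LinkFrom out{idx, 0};
        std::size_t assigned = 0;
        bool reused = false;
        for (auto& slot : slots) {
            if (remaining_reads[slot.owner] == 0) {  // all planned reads happened
                slot.owner = out;
                assigned = slot.id;
                reused = true;
                break;
            }
        }
        if (!reused) {
            assigned = slots.size();
            slots.push_back(Slot{assigned, out});
        }
        std::cout << "Subgraph " << idx << " output -> slot " << assigned << "\n";

        // "Execute" the subgraph: its reads are now done.
        for (const auto& from : reads[idx]) {
            remaining_reads[from]--;
        }
    }
    std::cout << "Peak residency: " << slots.size() << " tensor(s)\n";
    return 0;
}
```

On this toy chain the peak residency is two tensors instead of four, which is the effect the diff's "maximum memory residency" log line reports per function output.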
1 parent 296ab9a commit 3ff0943

4 files changed: +238 -44 lines changed


src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

-5

@@ -516,11 +516,6 @@ std::string ov::npuw::CompiledModel::global_mem_device() const {
 }

 std::string ov::npuw::CompiledModel::funcall_mem_device(const std::size_t idx) const {
-    // FIXME: currently we allocate intermediate tensors for EVERY submodel.
-    // It's not feasible to allocate them in L0 due to high memory consumption.
-    // Until we make such memory reusable, hard-coding those tensors to CPU.
-    return "CPU";
-
     // Force globally set device if set
     const std::string device_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
     if (!device_alloc.empty()) {

src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp

+2

@@ -46,6 +46,8 @@ class CompiledModel : public ov::ICompiledModel {
     // FIXME: This class has many friends..
     friend class IBaseInferRequest;
     friend class JustInferRequest;
+    friend class MemAccessSim;
+    friend class FuncMemMgr;

     bool compile_for_success(std::size_t id);
     bool compile_for_device(std::size_t id, const std::string& device_to_try);

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp

+182 -29

@@ -20,8 +20,173 @@
 #include "util.hpp"
 #include "weights_bank.hpp"

+ov::npuw::MemAccessSim::MemAccessSim(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model) {
+    LOG_VERB("Running memory access simulation...");
+    LOG_BLOCK();
+
+    // Initialize the read list
+    m_read_list.resize(compiled_model->m_compiled_submodels.size());
+
+    // Initialize read counters for tensors in the graph:
+    // 1. Interconnect
+    for (const auto& kvp : compiled_model->m_submodels_input_to_prev_output) {
+        const auto& read_to = kvp.first;     // who reads
+        const auto& read_from = kvp.second;  // reads what
+
+        if (read_to == CompiledModel::NO_LINK || read_from == CompiledModel::NO_LINK) {
+            continue;
+        }
+
+        // Record # of reads for this particular Source
+        m_remaining_reads[read_from]++;
+
+        // Record a read request for this particular Subgraph (who reads the Source)
+        m_read_list[read_to.first].push_back(read_from);
+    }
+    // 2. Global model's outputs
+    for (auto&& read_from : compiled_model->m_outputs_to_submodels_outputs) {
+        m_remaining_reads[read_from]++;
+    }
+
+    LOG_VERB("Done");
+}
+
+const ov::npuw::MemAccessSim::ReadList& ov::npuw::MemAccessSim::read_list(std::size_t idx) const {
+    return m_read_list.at(idx);
+}
+
+std::size_t ov::npuw::MemAccessSim::remaining_reads(const LinkFrom& from) {
+    return m_remaining_reads.at(from);
+}
+
+void ov::npuw::MemAccessSim::register_read(const LinkFrom& from) {
+    m_remaining_reads.at(from)--;
+}
+
+ov::npuw::FuncMemMgr::FuncMemMgr(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
+    : m_sim(compiled_model),
+      m_model(compiled_model) {}
+
+void ov::npuw::FuncMemMgr::set_alloc(AllocFcn&& fcn) {
+    m_alloc = std::move(fcn);
+}
+
+void ov::npuw::FuncMemMgr::assign_memory() {
+    LOG_VERB("Assigning function memory...");
+    LOG_BLOCK();
+
+    const auto num_submodels = m_model->m_compiled_submodels.size();
+
+    // Walk over the subgraphs, pre-allocate and pre-assign tensors to the subgraphs'
+    // outputs.
+    for (std::size_t idx = 0u; idx < num_submodels; idx++) {
+        LOG_VERB("Process Subgraph[" << idx << "]");
+        LOG_BLOCK();
+        const auto& comp_model_desc = m_model->m_compiled_submodels[idx];
+        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+            // no model & no funcall - optimized out, do nothing
+            continue;
+        }
+
+        // Simulate subgraph execution: poll its input list first
+        const auto& read_list = m_sim.read_list(idx);
+
+        // Now, get the outputs for the subgraph. If it is "regular", there's
+        // nothing to do - this subgraph owns its outputs on its own.
+        // If it is a function, though - look up in the function's memory storage.
+        if (comp_model_desc.replaced_by) {
+            const auto real_idx = comp_model_desc.replaced_by.value();
+            const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx];
+
+            const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
+            for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
+                const LinkFrom this_out = LinkFrom{idx, out_idx};
+                assign(this_out);
+            }
+        }
+
+        // Here happens the imaginary execution... Hocus pocus, done - that's a
+        // simulation after all
+        // After the execution, mark that the read_list was read.
+        for (auto&& from : read_list) {
+            m_sim.register_read(from);
+        }
+        LOG_VERB("Done");
+    }
+
+    // Report memory residency
+    for (auto&& m : m_memory) {
+        LOG_VERB("Function " << m.first.first << "/out port " << m.first.second << " : maximum memory residency "
+                             << m.second.size() << " tensor(s)");
+    }
+
+    LOG_VERB("Done");
+}
+
+void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) {
+    // This method is the center of the function memory management.
+    // The logic is simple:
+    // - Look for an output tensor to reuse
+    // - If there's one, assign it to this allocation
+    // - If there's none, allocate a new tensor
+    // - How a tensor to reuse is picked:
+    //   1. It should exist
+    //   2. Its "remaining reads" count should be 0 (all planned reads
+    //      happened at this point).
+    // The tensor storage is organized like this:
+    // - Function: Here we use .replaced_by as a function identifier; taken from `from`
+    // - Output index: taken from `from`
+    // - A vector of resident tensors
+
+    LOG_VERB("Assigning tensor for Subgraph[" << from.first << "]/" << from.second << "...");
+    LOG_BLOCK();
+
+    const auto& comp_model_desc = m_model->m_compiled_submodels[from.first];
+    NPUW_ASSERT(comp_model_desc.replaced_by.has_value());
+
+    const auto real_idx = comp_model_desc.replaced_by.value();
+
+    FO func_output = {real_idx, from.second};
+    auto& assigned_memory = m_memory[func_output];
+    auto asgn_iter = std::find_if(assigned_memory.begin(), assigned_memory.end(), [&](Assignment& a) {
+        return m_sim.remaining_reads(a.from) == 0u;
+    });
+    if (asgn_iter != assigned_memory.end()) {
+        // Reassign this memory slot to the new "from"
+        asgn_iter->from = from;
+        m_table[from] = asgn_iter->ptr;
+    } else {
+        // No free space at this point - allocate a new tensor
+        const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx];
+        const auto& proto_comp_model = proto_comp_model_desc.compiled_model;

+        const auto& oport = proto_comp_model->outputs()[from.second];
+        ov::Shape oshape = oport.get_shape();
+
+        if (proto_comp_model_desc.spatial) {
+            oshape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
+        }
+        const auto& device = m_model->funcall_mem_device(real_idx);
+        TensorPtr new_tensor = m_alloc(oport.get_element_type(), oshape, device);
+        NPUW_ASSERT(new_tensor);
+
+        assigned_memory.push_back(Assignment{new_tensor, from});
+        m_table[from] = new_tensor;
+    }
+    LOG_VERB("Done");
+}
+
+ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
+    return m_table.at(from);
+}
+
 ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
-    : IBaseInferRequest(compiled_model) {
+    : IBaseInferRequest(compiled_model),
+      m_func_mem_mgr(compiled_model) {
+    using namespace std::placeholders;
+    m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3));
+    m_func_mem_mgr.assign_memory();
+
     m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
     if (m_use_function_pipelining) {
         LOG_WARN("Function call pipelining is enabled for " << m_npuw_model->m_name

@@ -67,27 +232,20 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
                 for (auto&& p : proto_comp_model_desc.spatial->params) {
                     const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
                     m_spatial_io[real_idx].input_tails[p.idx] =
-                        allocTensor(iport, m_npuw_model->funcall_mem_device(real_idx));
+                        allocOut(iport, m_npuw_model->funcall_mem_device(real_idx));
                 }
                 const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
                 for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
                     const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
                     m_spatial_io[real_idx].output_tails[out_idx] =
-                        allocTensor(oport, m_npuw_model->funcall_mem_device(real_idx));
+                        allocOut(oport, m_npuw_model->funcall_mem_device(real_idx));
                 }
             }
        }  // if(spatial)

        for (size_t out_idx = 0; out_idx < num_outputs; out_idx++) {
-            const auto& port = proto_comp_model->outputs()[out_idx];
-            ov::Shape shape = port.get_shape();
-
-            // If the subgraph is spatial, promote the output size to the full vector size
-            if (proto_comp_model_desc.spatial) {
-                shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
-            }
-            m_funcall_result[LinkFrom{i, out_idx}] =
-                allocTensor(port.get_element_type(), shape, m_npuw_model->funcall_mem_device(real_idx));
+            const auto from = LinkFrom{i, out_idx};
+            m_funcall_result[from] = m_func_mem_mgr.get_tensor(from);
        }
        if (real_idx != i) {
            // If this function call is NOT the function body, do nothing here - the original

@@ -152,7 +310,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
    LOG_INFO("Preallocating input tensors...");
    for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
        const auto& port = m_npuw_model->inputs()[i];
-        ov::SoPtr<ov::ITensor> allocated = allocTensor(port, m_npuw_model->global_mem_device());
+        ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
        m_input_tensors.push_back(allocated);
        m_input_allocated.insert(allocated->data());
        m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};

@@ -174,7 +332,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
        const auto& tensor =
            funcall_result_iter != m_funcall_result.end()
                ? funcall_result_iter->second  // Function calls have their tensors allocated, so just use one
-                : allocTensor(port, m_npuw_model->global_mem_device());
+                : allocOut(port, m_npuw_model->global_mem_device());

        m_output_tensors.push_back(tensor);
        m_port_to_tensor[port] = TensorStorage{tensor, true};

@@ -920,27 +1078,22 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool
    }  // if (replaced_by)
 }

-ov::SoPtr<ov::ITensor> ov::npuw::JustInferRequest::allocTensor(const ov::element::Type type,
-                                                               const ov::Shape& shape,
-                                                               const std::string& device) {
+ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type,
+                                                         const ov::Shape& shape,
+                                                         const std::string& device) {
    if (device == "CPU" || ov::shape_size(shape) == 0) {
        return ov::get_tensor_impl(ov::Tensor(type, shape));
    }

-    ov::SoPtr<ov::ITensor> remote_tensor;
-    ov::Tensor allocated_tensor;
-    {
-        std::lock_guard<std::mutex> guard(m_alloc_mutex);
-        m_remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
-        remote_tensor = m_remote_ctx->create_host_tensor(type, shape);
-        allocated_tensor = ov::make_tensor(remote_tensor);
-    }
-    return ov::get_tensor_impl(allocated_tensor);
+    std::lock_guard<std::mutex> guard(m_alloc_mutex);
+    auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
+    auto remote_tensor = remote_ctx->create_host_tensor(type, shape);
+    return ov::get_tensor_impl(ov::make_tensor(remote_tensor));
 }

-ov::SoPtr<ov::ITensor> ov::npuw::JustInferRequest::allocTensor(const ov::Output<const ov::Node>& node,
-                                                               const std::string& device) {
-    return allocTensor(node.get_element_type(), node.get_shape(), device);
+ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output<const ov::Node>& node,
+                                                         const std::string& device) {
+    return allocMem(node.get_element_type(), node.get_shape(), device);
 }

 void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) {
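One detail worth noting from the hunk above: FuncMemMgr plans the assignments, but the actual allocation is delegated back to JustInferRequest through a callback installed with set_alloc() before assign_memory() runs. The following is a small, self-contained sketch of that injection pattern with simplified stand-in types, not the plugin's real signatures.

```cpp
// Sketch of the allocator-injection pattern: the manager decides *what* to
// allocate, while the owner decides *how*. Stand-in types for illustration.
#include <cstddef>
#include <functional>
#include <iostream>
#include <string>

class Manager {
public:
    using AllocFcn = std::function<std::string(const std::string& /*type*/,
                                               std::size_t /*elements*/,
                                               const std::string& /*device*/)>;
    void set_alloc(AllocFcn&& fcn) { m_alloc = std::move(fcn); }
    void assign_memory() {
        // Planning happens here; allocation is deferred to the injected callback.
        std::cout << m_alloc("f32", 1024, "NPU") << "\n";
    }

private:
    AllocFcn m_alloc;
};

class Request {
public:
    Request() {
        using namespace std::placeholders;
        // Bind the member allocator, then run the assignment pass.
        m_mgr.set_alloc(std::bind(&Request::alloc, this, _1, _2, _3));
        m_mgr.assign_memory();
    }

private:
    std::string alloc(const std::string& type, std::size_t n, const std::string& device) {
        return "allocated " + std::to_string(n) + " x " + type + " on " + device;
    }
    Manager m_mgr;
};

int main() {
    Request r;  // prints: allocated 1024 x f32 on NPU
    return 0;
}
```

The same ordering constraint holds in the real constructor: the allocator must be bound before assign_memory() is called, since assignment may allocate on the target device.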

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp

+54 -10

@@ -22,6 +22,56 @@ namespace npuw {
 class CompiledModel;
 class AsyncInferRequest;

+using LinkFrom = std::pair<std::size_t /* Subrequest index */
+                           ,
+                           std::size_t /* Subrequest output index */
+                           >;  // FIXME: This is a third, if not fourth, definition of such structure
+
+using TensorPtr = ov::SoPtr<ov::ITensor>;
+
+class MemAccessSim {
+public:
+    explicit MemAccessSim(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
+
+    using ReadList = std::list<LinkFrom>;
+    const ReadList& read_list(std::size_t idx) const;
+
+    std::size_t remaining_reads(const LinkFrom& from);
+    void register_read(const LinkFrom& from);
+
+private:
+    std::map<LinkFrom, std::size_t> m_remaining_reads;
+    std::vector<ReadList> m_read_list;
+};
+
+class FuncMemMgr {
+    MemAccessSim m_sim;
+    std::shared_ptr<ov::npuw::CompiledModel> m_model;
+
+    void assign(const LinkFrom& from);
+
+    // Function ID -> Output port number
+    using FO = std::pair<std::size_t, std::size_t>;
+    struct Assignment {
+        TensorPtr ptr;
+        LinkFrom from;
+    };
+    std::map<FO, std::vector<Assignment>> m_memory;  // Dynamic assignment table
+    std::map<LinkFrom, TensorPtr> m_table;           // Static allocation/assignment table
+
+public:
+    explicit FuncMemMgr(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);
+
+    using AllocFcn = std::function<TensorPtr(const ov::element::Type&, const ov::Shape&, const std::string&)>;
+    void set_alloc(AllocFcn&& fcn);
+    void assign_memory();
+
+    TensorPtr get_tensor(const LinkFrom& from);
+
+private:
+    AllocFcn m_alloc;
+};
+
 class JustInferRequest final : public IBaseInferRequest {
 public:
     explicit JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model);

@@ -64,15 +114,11 @@ class JustInferRequest final : public IBaseInferRequest {
     void connect_subrequests();
     void recreate_subrequests(std::size_t idx);

-    ov::SoPtr<ov::ITensor> allocTensor(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
-    ov::SoPtr<ov::ITensor> allocTensor(const ov::Output<const ov::Node>& node, const std::string& device);
+    TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
+    TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device);

-    using LinkFrom = std::pair<std::size_t /* Subrequest index */
-                               ,
-                               std::size_t /* Subrequest output index */
-                               >;  // FIXME: This is a third, if not fourth, definition of such structure
-    using TensorPtr = ov::SoPtr<ov::ITensor>;
-    std::map<LinkFrom, TensorPtr> m_funcall_result;
+    FuncMemMgr m_func_mem_mgr;                       // Owns memory
+    std::map<LinkFrom, TensorPtr> m_funcall_result;  // Provides a convenient link

     bool is_pipelined(std::size_t idx) const;
     bool m_use_function_pipelining = false;

@@ -103,8 +149,6 @@ class JustInferRequest final : public IBaseInferRequest {
     std::vector<GlobalIO> m_subrequests_gio;

     std::mutex m_alloc_mutex;
-    std::shared_ptr<ov::IRemoteContext> m_remote_ctx = nullptr;
-
     std::unordered_set<void*> m_input_allocated;
 };
