#include "util.hpp"
#include "weights_bank.hpp"

+ov::npuw::MemAccessSim::MemAccessSim(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model) {
+    LOG_VERB("Running memory access simulation...");
+    LOG_BLOCK();
+
+    // Initialize the read list
+    m_read_list.resize(compiled_model->m_compiled_submodels.size());
+
+    // Initialize read counters for tensors in the graph:
+    // 1. Interconnect
+    for (const auto& kvp : compiled_model->m_submodels_input_to_prev_output) {
+        const auto& read_to = kvp.first;     // who reads
+        const auto& read_from = kvp.second;  // reads what
+
+        if (read_to == CompiledModel::NO_LINK || read_from == CompiledModel::NO_LINK) {
+            continue;
+        }
+
+        // Record # of reads for this particular Source
+        m_remaining_reads[read_from]++;
+
+        // Record a read request for this particular Subgraph (who reads the Source)
+        m_read_list[read_to.first].push_back(read_from);
+    }
+    // 2. Global model's outputs
+    for (auto&& read_from : compiled_model->m_outputs_to_submodels_outputs) {
+        m_remaining_reads[read_from]++;
+    }
+
+    LOG_VERB("Done");
+}
+
+const ov::npuw::MemAccessSim::ReadList& ov::npuw::MemAccessSim::read_list(std::size_t idx) const {
+    return m_read_list.at(idx);
+}
+
+std::size_t ov::npuw::MemAccessSim::remaining_reads(const LinkFrom& from) {
+    return m_remaining_reads.at(from);
+}
+
+void ov::npuw::MemAccessSim::register_read(const LinkFrom& from) {
+    m_remaining_reads.at(from)--;
+}
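
A minimal sketch of the bookkeeping this simulation performs, using plain standard containers (the names and the three-subgraph example are illustrative, not taken from the patch): every producer output gets a planned-read counter, each consumer records which producers it reads, and replaying the reads later drives the counters down; a counter at zero means the producer's buffer is no longer needed.

#include <cstddef>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

// Producer handle: (subgraph index, output port), mirroring the LinkFrom idea.
using Producer = std::pair<std::size_t, std::size_t>;

int main() {
    std::map<Producer, std::size_t> remaining_reads;  // planned reads per producer
    std::vector<std::vector<Producer>> read_list(3);  // per-consumer read lists

    // Subgraphs 1 and 2 both read output 0 of subgraph 0.
    remaining_reads[{0, 0}] += 2;
    read_list[1].push_back({0, 0});
    read_list[2].push_back({0, 0});

    // "Execute" the consumers in order and register their reads.
    for (std::size_t idx = 1; idx < read_list.size(); idx++) {
        for (const auto& from : read_list[idx]) {
            remaining_reads.at(from)--;
        }
        std::cout << "after subgraph " << idx << ": " << remaining_reads.at({0, 0})
                  << " pending read(s) of {0,0}\n";
    }
    // The counter reaching zero marks the point where the buffer behind {0,0}
    // may be reused.
    return 0;
}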
+
+ov::npuw::FuncMemMgr::FuncMemMgr(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
+    : m_sim(compiled_model),
+      m_model(compiled_model) {}
+
+void ov::npuw::FuncMemMgr::set_alloc(AllocFcn&& fcn) {
+    m_alloc = std::move(fcn);
+}
+
+void ov::npuw::FuncMemMgr::assign_memory() {
+    LOG_VERB("Assigning function memory...");
+    LOG_BLOCK();
+
+    const auto num_submodels = m_model->m_compiled_submodels.size();
+
+    // Walk over the subgraphs, pre-allocate and pre-assign tensors to the subgraphs'
+    // outputs.
+    for (std::size_t idx = 0u; idx < num_submodels; idx++) {
+        LOG_VERB("Process Subgraph[" << idx << "]");
+        LOG_BLOCK();
+        const auto& comp_model_desc = m_model->m_compiled_submodels[idx];
+        if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
+            // no model & no funcall - optimized out, do nothing
+            continue;
+        }
+
+        // Simulate subgraph execution: poll its input list first
+        const auto& read_list = m_sim.read_list(idx);
+
+        // Now, get the outputs for the subgraph. If it is "regular", there's
+        // nothing to do - this subgraph owns its outputs on its own.
+        // If it is a function, though, look them up in the function's memory storage.
+        if (comp_model_desc.replaced_by) {
+            const auto real_idx = comp_model_desc.replaced_by.value();
+            const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx];
+
+            const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
+            for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
+                const LinkFrom this_out = LinkFrom{idx, out_idx};
+                assign(this_out);
+            }
+        }
+
+        // Here happens the imaginary execution... Hocus pocus, done - that's a
+        // simulation after all.
+        // After the execution, mark that the read_list was read.
+        for (auto&& from : read_list) {
+            m_sim.register_read(from);
+        }
+        LOG_VERB("Done");
+    }
+
+    // Report memory residency
+    for (auto&& m : m_memory) {
+        LOG_VERB("Function " << m.first.first << "/out port " << m.first.second << ": maximum memory residency "
+                             << m.second.size() << " tensor(s)");
+    }
+
+    LOG_VERB("Done");
+}
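
The residency counter reported above is a high-water mark: a function's tensor vector only grows when no already-allocated tensor is free at assignment time, so its final size equals the peak number of that output's buffers that were ever alive at once. A self-contained toy of that counting scheme (the ToyPool type and the event sequence are invented for illustration, not part of the patch):

#include <cstddef>
#include <iostream>
#include <vector>

// Toy pool: reuse a free slot when one exists, otherwise grow.
// The final size() is the high-water mark of simultaneously live buffers.
struct ToyPool {
    std::vector<bool> in_use;  // one flag per allocated slot

    std::size_t acquire() {
        for (std::size_t i = 0; i < in_use.size(); i++) {
            if (!in_use[i]) {  // a slot whose planned reads are all done
                in_use[i] = true;
                return i;
            }
        }
        in_use.push_back(true);  // no free slot: allocate a new one
        return in_use.size() - 1;
    }
    void release(std::size_t slot) { in_use[slot] = false; }
};

int main() {
    ToyPool pool;
    auto a = pool.acquire();  // 1 buffer live
    auto b = pool.acquire();  // 2 buffers live
    pool.release(a);
    auto c = pool.acquire();  // reuses a's slot, still 2 buffers live
    pool.release(b);
    pool.release(c);
    std::cout << "maximum memory residency: " << pool.in_use.size() << " tensor(s)\n";  // prints 2
    return 0;
}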
+
+void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) {
+    // This method is the center of the function memory management.
+    // The logic is simple:
+    // - Look for an output tensor to reuse
+    // - If there's one, assign it to this allocation
+    // - If there's none, allocate a new tensor
+    // - How a tensor to reuse is picked:
+    //   1. It should exist
+    //   2. Its "remaining reads" count should be 0 (all planned reads
+    //      happened at this point).
+    // The tensor storage is organized like this:
+    // - Function: Here we use .replaced_by as a function identifier; taken from `from`
+    // - Output index: taken from `from`
+    // - A vector of resident tensors
+
+    LOG_VERB("Assigning tensor for Subgraph[" << from.first << "]/" << from.second << "...");
+    LOG_BLOCK();
+
+    const auto& comp_model_desc = m_model->m_compiled_submodels[from.first];
+    NPUW_ASSERT(comp_model_desc.replaced_by.has_value());
+
+    const auto real_idx = comp_model_desc.replaced_by.value();
+
+    FO func_output = {real_idx, from.second};
+    auto& assigned_memory = m_memory[func_output];
+    auto asgn_iter = std::find_if(assigned_memory.begin(), assigned_memory.end(), [&](Assignment& a) {
+        return m_sim.remaining_reads(a.from) == 0u;
+    });
+    if (asgn_iter != assigned_memory.end()) {
+        // Reassign this memory slot to the new "from"
+        asgn_iter->from = from;
+        m_table[from] = asgn_iter->ptr;
+    } else {
+        // No free space at this point - allocate a new tensor
+        const auto& proto_comp_model_desc = m_model->m_compiled_submodels[real_idx];
+        const auto& proto_comp_model = proto_comp_model_desc.compiled_model;
+
+        const auto& oport = proto_comp_model->outputs()[from.second];
+        ov::Shape oshape = oport.get_shape();
+
+        if (proto_comp_model_desc.spatial) {
+            oshape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
+        }
+        const auto& device = m_model->funcall_mem_device(real_idx);
+        TensorPtr new_tensor = m_alloc(oport.get_element_type(), oshape, device);
+        NPUW_ASSERT(new_tensor);
+
+        assigned_memory.push_back(Assignment{new_tensor, from});
+        m_table[from] = new_tensor;
+    }
+    LOG_VERB("Done");
+}
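
Condensing the branch above: reuse is gated purely on the simulated read counter of a slot's current owner, and the from-to-tensor table is what get_tensor() later serves. A stripped-down, self-contained sketch of that decision with standard containers (Slot, Buffer and assign_slot are stand-in names for this sketch only, not the patch's types):

#include <algorithm>
#include <cstddef>
#include <map>
#include <memory>
#include <utility>
#include <vector>

using From = std::pair<std::size_t, std::size_t>;  // (subgraph, output port)
using Buffer = std::shared_ptr<int>;               // stand-in for a tensor

struct Slot {
    Buffer ptr;  // the allocated buffer
    From owner;  // which output currently lives in it
};

// Pick a slot whose current owner has no pending reads; otherwise allocate a new one.
// Either way, record the result in the lookup table keyed by `from`.
Buffer assign_slot(std::vector<Slot>& slots,
                   std::map<From, std::size_t>& remaining_reads,
                   std::map<From, Buffer>& table,
                   const From& from) {
    auto it = std::find_if(slots.begin(), slots.end(), [&](const Slot& s) {
        return remaining_reads[s.owner] == 0u;  // a missing entry counts as "no pending reads" here
    });
    if (it != slots.end()) {
        it->owner = from;  // reassign the free slot to the new output
        table[from] = it->ptr;
        return it->ptr;
    }
    slots.push_back(Slot{std::make_shared<int>(0), from});  // no free slot: grow the pool
    table[from] = slots.back().ptr;
    return slots.back().ptr;
}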
+
+ov::npuw::TensorPtr ov::npuw::FuncMemMgr::get_tensor(const LinkFrom& from) {
+    return m_table.at(from);
+}
+
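
In the constructor below, the allocator is wired up with std::bind(&JustInferRequest::allocMem, this, _1, _2, _3). For readers less used to placeholders, here is a self-contained illustration of that binding pattern and its equivalent lambda form; Widget and its alloc() signature are toy stand-ins, not the plugin's types:

#include <functional>
#include <iostream>
#include <string>

// Toy stand-in for the allocator signature: (type, size, device) -> handle.
struct Widget {
    int alloc(int type, int size, const std::string& device) {
        std::cout << "alloc(" << type << ", " << size << ", " << device << ")\n";
        return size;
    }
};

int main() {
    using namespace std::placeholders;
    Widget w;

    // std::bind form, as in the patch: bind the member function to the object
    // and forward the three call arguments through _1.._3.
    std::function<int(int, int, const std::string&)> alloc_bound =
        std::bind(&Widget::alloc, &w, _1, _2, _3);

    // Equivalent lambda form.
    std::function<int(int, int, const std::string&)> alloc_lambda =
        [&w](int type, int size, const std::string& device) {
            return w.alloc(type, size, device);
        };

    alloc_bound(1, 16, "NPU");  // both calls end up in Widget::alloc
    alloc_lambda(1, 16, "NPU");
    return 0;
}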
ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
-    : IBaseInferRequest(compiled_model) {
+    : IBaseInferRequest(compiled_model),
+      m_func_mem_mgr(compiled_model) {
+    using namespace std::placeholders;
+    m_func_mem_mgr.set_alloc(std::bind(&JustInferRequest::allocMem, this, _1, _2, _3));
+    m_func_mem_mgr.assign_memory();
+
    m_use_function_pipelining = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
    if (m_use_function_pipelining) {
        LOG_WARN("Function call pipelining is enabled for " << m_npuw_model->m_name
@@ -67,27 +232,20 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
                for (auto&& p : proto_comp_model_desc.spatial->params) {
                    const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx];
                    m_spatial_io[real_idx].input_tails[p.idx] =
-                        allocTensor(iport, m_npuw_model->funcall_mem_device(real_idx));
+                        allocOut(iport, m_npuw_model->funcall_mem_device(real_idx));
                }
                const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size();
                for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) {
                    const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx];
                    m_spatial_io[real_idx].output_tails[out_idx] =
-                        allocTensor(oport, m_npuw_model->funcall_mem_device(real_idx));
+                        allocOut(oport, m_npuw_model->funcall_mem_device(real_idx));
                }
            }
        }  // if(spatial)

        for (size_t out_idx = 0; out_idx < num_outputs; out_idx++) {
-            const auto& port = proto_comp_model->outputs()[out_idx];
-            ov::Shape shape = port.get_shape();
-
-            // If the subgraph is spatial, promote the output size to the full vector size
-            if (proto_comp_model_desc.spatial) {
-                shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range;
-            }
-            m_funcall_result[LinkFrom{i, out_idx}] =
-                allocTensor(port.get_element_type(), shape, m_npuw_model->funcall_mem_device(real_idx));
+            const auto from = LinkFrom{i, out_idx};
+            m_funcall_result[from] = m_func_mem_mgr.get_tensor(from);
        }
        if (real_idx != i) {
            // If this function call is NOT the function body, do nothing here - the original
@@ -152,7 +310,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
    LOG_INFO("Preallocating input tensors...");
    for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
        const auto& port = m_npuw_model->inputs()[i];
-        ov::SoPtr<ov::ITensor> allocated = allocTensor(port, m_npuw_model->global_mem_device());
+        ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
        m_input_tensors.push_back(allocated);
        m_input_allocated.insert(allocated->data());
        m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
@@ -174,7 +332,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
        const auto& tensor =
            funcall_result_iter != m_funcall_result.end()
                ? funcall_result_iter->second  // Function calls have their tensors allocated, so just use one
-                : allocTensor(port, m_npuw_model->global_mem_device());
+                : allocOut(port, m_npuw_model->global_mem_device());

        m_output_tensors.push_back(tensor);
        m_port_to_tensor[port] = TensorStorage{tensor, true};
@@ -920,27 +1078,22 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool
    }  // if (replaced_by)
}

-ov::SoPtr<ov::ITensor> ov::npuw::JustInferRequest::allocTensor(const ov::element::Type type,
-                                                               const ov::Shape& shape,
-                                                               const std::string& device) {
+ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type type,
+                                                         const ov::Shape& shape,
+                                                         const std::string& device) {
    if (device == "CPU" || ov::shape_size(shape) == 0) {
        return ov::get_tensor_impl(ov::Tensor(type, shape));
    }

-    ov::SoPtr<ov::ITensor> remote_tensor;
-    ov::Tensor allocated_tensor;
-    {
-        std::lock_guard<std::mutex> guard(m_alloc_mutex);
-        m_remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
-        remote_tensor = m_remote_ctx->create_host_tensor(type, shape);
-        allocated_tensor = ov::make_tensor(remote_tensor);
-    }
-    return ov::get_tensor_impl(allocated_tensor);
+    std::lock_guard<std::mutex> guard(m_alloc_mutex);
+    auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
+    auto remote_tensor = remote_ctx->create_host_tensor(type, shape);
+    return ov::get_tensor_impl(ov::make_tensor(remote_tensor));
}

-ov::SoPtr<ov::ITensor> ov::npuw::JustInferRequest::allocTensor(const ov::Output<const ov::Node>& node,
-                                                               const std::string& device) {
-    return allocTensor(node.get_element_type(), node.get_shape(), device);
+ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocOut(const ov::Output<const ov::Node>& node,
+                                                         const std::string& device) {
+    return allocMem(node.get_element_type(), node.get_shape(), device);
}

void ov::npuw::JustInferRequest::subscribe_subrequest(std::size_t idx, Completed cb) {