Skip to content

Commit 212be8e

Browse files
authored
NPUW: Dynamic Spatial (#27104)
### Details:
- Introduce a new SPATIAL pipeline which is a shortcut to PIPELINE:REG + ISOLATE:COMPUTE + SPATIAL:ON
- Refactor some code regarding spatial regions handling in models and requests
- Finally, introduce a dynamic dispatch over the spatial range
  - Based on runtime-detected features
  - Can be disabled to measure full-range performance

### Tickets:
- E-143572
1 parent 8822480 commit 212be8e

14 files changed

+270
-55
lines changed

src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
4545
DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime);
4646
DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
4747
DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime);
48-
DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 64, npuw::partitioning::spatial_nway, CompileTime);
48+
DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, CompileTime);
49+
DEFINE_OPT(NPUW_SPATIAL_DYN, bool, true, npuw::partitioning::spatial_dyn, CompileTime);
4950
DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);
5051
DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime);
5152
DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);

src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp

+10-2
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ namespace online {
6767
* @brief
6868
* Type: std::string.
6969
* Specify which partitioning pipeline to run.
70-
* Possible values: "NONE", "INIT", "JUST", "REP", "REG", "COMPUTE".
70+
* Possible values: "NONE", "INIT", "JUST", "REP", "REG", "COMPUTE", "SPATIAL".
7171
* Default value: "REG".
7272
*/
7373
static constexpr ov::Property<std::string> pipeline{"NPUW_ONLINE_PIPELINE"};
@@ -206,10 +206,18 @@ static constexpr ov::Property<bool> spatial{"NPUW_SPATIAL"};
206206
* @brief
207207
* Type: std::size_t.
208208
* Submission size for the spatial execution.
209-
* Default value: 64
209+
* Default value: 128
210210
*/
211211
static constexpr ov::Property<std::size_t> spatial_nway{"NPUW_SPATIAL_NWAY"};
212212

213+
/**
214+
* @brief
215+
* Type: boolean.
216+
* Enable dynamic submission for spatial subgraphs. Requires SPATIAL pipeline to be selected.
217+
* Default value: true
218+
*/
219+
static constexpr ov::Property<bool> spatial_dyn{"NPUW_SPATIAL_DYN"};
220+
213221
/**
214222
* @brief
215223
* Type: boolean

src/plugins/intel_npu/src/al/src/config/npuw.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
3030
desc.add<NPUW_PMM>();
3131
desc.add<NPUW_SPATIAL>();
3232
desc.add<NPUW_SPATIAL_NWAY>();
33+
desc.add<NPUW_SPATIAL_DYN>();
3334
desc.add<NPUW_HOST_GATHER>();
3435
desc.add<NPUW_DCOFF_TYPE>();
3536
desc.add<NPUW_DCOFF_SCALE>();

src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

+4-13
Original file line numberDiff line numberDiff line change
@@ -283,18 +283,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
283283

284284
// Fill in the spatial information, if it is present
285285
if (fcn_template._spatial) {
286-
using S = CompiledModelDesc::Spatial;
287-
S s;
288-
s.range = fcn_template._spatial->_range;
289-
s.nway = fcn_template._spatial->_slice;
290-
s.out_dim = fcn_template._spatial->_out_dim;
291-
s.nway_iters = s.range / s.nway;
292-
s.tail_size = s.range % s.nway;
293-
for (auto&& input : fcn_template._spatial->_inputs) {
294-
std::size_t p_idx = fcn_template._model->get_parameter_index(input.param);
295-
s.params.push_back(S::Param{p_idx, input.dim});
296-
}
297-
m_compiled_submodels[id].spatial = std::move(s);
286+
m_compiled_submodels[id].spatial =
287+
compiled::Spatial(fcn_template._spatial.value(), fcn_template._model);
298288
}
299289
LOG_INFO("Subgraph[" << id << "] is a function body for " << subgraph._funcall);
300290
} else {
@@ -918,7 +908,8 @@ void ov::npuw::CompiledModel::implement_properties() {
918908
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
919909
BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
920910
BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
921-
BIND(npuw::partitioning::spatial, NPUW_SPATIAL_NWAY),
911+
BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
912+
BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
922913
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
923914
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
924915
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),

src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp

+3-15
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (C) 2023 Intel Corporation
1+
// Copyright (C) 2023-2024 Intel Corporation
22
// SPDX-License-Identifier: Apache-2.0
33
//
44

@@ -13,6 +13,7 @@
1313
#include "openvino/runtime/icompiled_model.hpp"
1414
#include "openvino/runtime/so_ptr.hpp"
1515
#include "partitioning/partitioning.hpp"
16+
#include "spatial.hpp"
1617
#include "weights_bank.hpp"
1718

1819
namespace intel_npu {
@@ -123,20 +124,7 @@ class CompiledModel : public ov::ICompiledModel {
123124
std::optional<std::size_t> replaced_by;
124125

125126
Subgraph::Gather host_gather;
126-
struct Spatial {
127-
struct Param {
128-
std::size_t idx;
129-
std::size_t dim;
130-
};
131-
std::vector<Param> params;
132-
std::size_t range = 0u;
133-
std::size_t nway = 0u;
134-
std::size_t out_dim = 0u;
135-
136-
std::size_t nway_iters = 0u;
137-
std::size_t tail_size = 0u;
138-
};
139-
std::optional<Spatial> spatial;
127+
std::optional<ov::npuw::compiled::Spatial> spatial;
140128

141129
// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
142130
// w.r.t. function calls

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp

+33-2
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
199199
// Create infer requests
200200
// Preallocate funcall tensors & substitute function call requests
201201
bool failover_happened = false;
202+
bool has_spatial = false;
202203
for (size_t i = 0; i < m_num_submodels; i++) {
203204
LOG_INFO("Creating infer request for Subgraph[" << i << "]...");
204205
LOG_BLOCK();
@@ -221,6 +222,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
221222

222223
// Initialize the spatial IO placeholders, if required
223224
if (proto_comp_model_desc.spatial) {
225+
has_spatial = true;
226+
224227
m_spatial_io[real_idx].inputs.resize(proto_comp_model_desc.param_base);
225228
m_spatial_io[real_idx].input_tails.resize(proto_comp_model_desc.param_base);
226229
m_spatial_io[real_idx].outputs.resize(num_outputs);
@@ -399,6 +402,24 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
399402
} // for(closure)
400403
LOG_VERB("DONE");
401404
}
405+
406+
// Handle spatial dynamic submission
407+
if (has_spatial) {
408+
if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_SPATIAL_DYN>()) {
409+
LOG_VERB("Finding spatial features...");
410+
LOG_BLOCK();
411+
m_spatial_selector = runtime::spatial::AttentionMask::find(*this);
412+
if (!m_spatial_selector) {
413+
LOG_WARN("Spatial capability is enabled, but no run-time features were found.");
414+
// Fallback selector to ALL
415+
m_spatial_selector.reset(new runtime::spatial::All());
416+
}
417+
} else {
418+
// Just force selector to ALL
419+
m_spatial_selector.reset(new runtime::spatial::All());
420+
}
421+
LOG_VERB("Done");
422+
}
402423
}
403424

404425
void ov::npuw::JustInferRequest::connect_subrequests() {
@@ -506,6 +527,11 @@ void ov::npuw::JustInferRequest::prepare_for_infer() {
506527
LOG_DEBUG("Pre-initializing weights for subgraph[" << id << "]");
507528
unpack_closure(id, m_subrequests[id]);
508529
}
530+
531+
// Adjust spatial input range, if supported
532+
if (m_spatial_selector) {
533+
m_spatial_selector->prepare();
534+
}
509535
LOG_DEBUG("Done");
510536
}
511537

@@ -915,6 +941,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) {
915941
// must be prepared in the m_spatial_io at this point
916942
const auto& spatial = comp_model_desc.spatial.value();
917943
const auto num_outputs = comp_model_desc.compiled_model->outputs().size();
944+
NPUW_ASSERT(m_spatial_selector);
918945

919946
// Create a sparse vector with full input sizes.
920947
// For the access simplicity, its size is aligned with function's
@@ -940,6 +967,10 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) {
940967

941968
std::size_t offset = 0u;
942969
for (std::size_t i = 0u; i < spatial.nway_iters; i++, offset += spatial.nway) {
970+
if (!m_spatial_selector->need_submit(offset, spatial.nway)) {
971+
continue;
972+
}
973+
943974
// Collect spatial inputs for this offset
944975
for (auto&& param : spatial.params) {
945976
const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx];
@@ -963,7 +994,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) {
963994
} // for(full_nway_times)
964995

965996
// Now process the tail, if required
966-
if (spatial.tail_size) {
997+
if (spatial.tail_size && m_spatial_selector->need_submit(offset, spatial.tail_size)) {
967998
// Copy the sub-ranges to spatial inputs
968999
// NOTE: tails buffers are read from/written to at 0th offset!
9691000
for (auto&& param : spatial.params) {
@@ -1085,7 +1116,7 @@ ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type
10851116
return ov::get_tensor_impl(ov::Tensor(type, shape));
10861117
}
10871118

1088-
std::lock_guard<std::mutex> guard(m_alloc_mutex);
1119+
// Protect access to shared context(s) - at least among infer requests
10891120
auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
10901121
auto remote_tensor = remote_ctx->create_host_tensor(type, shape);
10911122
return ov::get_tensor_impl(ov::make_tensor(remote_tensor));

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "openvino/runtime/iremote_context.hpp"
1616
#include "openvino/runtime/make_tensor.hpp"
1717
#include "openvino/runtime/tensor.hpp"
18+
#include "spatial.hpp"
1819

1920
namespace ov {
2021
namespace npuw {
@@ -148,8 +149,10 @@ class JustInferRequest final : public IBaseInferRequest {
148149
};
149150
std::vector<GlobalIO> m_subrequests_gio;
150151

151-
std::mutex m_alloc_mutex;
152152
std::unordered_set<void*> m_input_allocated;
153+
154+
// Represents spatial run-time info
155+
runtime::spatial::Selector::Ptr m_spatial_selector;
153156
};
154157

155158
} // namespace npuw

src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp

+19-6
Original file line numberDiff line numberDiff line change
@@ -267,12 +267,13 @@ void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) {
267267
// Interface to get online partitioning from the model
268268
class Compiler {
269269
enum class Pipeline {
270-
NONE, // Partitioning will consist of a single group with all the Ops
271-
INIT, // Initialize only. The hardest mode, every group has just 1 layer inside
272-
JUST, // "justParitioning" - combination of LHF + Remnants
273-
REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants
274-
REG, // Regularized repeated blocks pipeline -same as REP, but with some strong hints first
275-
COMPUTE // Separates non-foldable compute subgraphs from the model based on predefined rules + REP
270+
NONE, // Partitioning will consist of a single group with all the Ops
271+
INIT, // Initialize only. The hardest mode, every group has just 1 layer inside
272+
JUST, // "justParitioning" - combination of LHF + Remnants
273+
REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants
274+
REG, // Regularized repeated blocks pipeline - same as REP, but with some strong hints first
275+
COMPUTE, // Separates non-foldable compute subgraphs from the model based on predefined rules + REP
276+
SPATIAL // Similar to COMPUTE but allows folding
276277
};
277278

278279
template <class C>
@@ -299,6 +300,8 @@ class Compiler {
299300
return Pipeline::REG;
300301
} else if (pipeline_opt == "COMPUTE") {
301302
return Pipeline::COMPUTE;
303+
} else if (pipeline_opt == "SPATIAL") {
304+
return Pipeline::SPATIAL;
302305
} else {
303306
LOG_WARN("Unknown partitioning compiler pipeline " << pipeline_opt << ", switching to REP");
304307
return Pipeline::REP;
@@ -428,6 +431,16 @@ class Compiler {
428431
m_snapshot->setCtx(ctx);
429432
rep();
430433
break;
434+
case Pipeline::SPATIAL:
435+
warn_unused<::intel_npu::NPUW_ONLINE_ISOLATE>();
436+
m_cfg.update(::intel_npu::Config::ConfigMap{{std::string(::intel_npu::NPUW_SPATIAL::key()), "YES"}});
437+
438+
// Manually set predefined isolates and nofolds then do rep() pipeline
439+
// FIXME: initialize via a dedicated function instead of parsing
440+
ctx.isolates = detail::getIsolates(detail::ISOL_PRESETS.at("COMPUTE"));
441+
m_snapshot->setCtx(ctx);
442+
rep();
443+
break;
431444
}
432445

433446
LOG_DEBUG("Online partitioning: group sizes after compilation:");

src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1605,7 +1605,7 @@ void Partitioner::identifySpatialRange(ov::npuw::Function& f) {
16051605
const auto& f_params = f._model->get_parameters();
16061606
NPUW_ASSERT(f_params.size() > 0);
16071607

1608-
using S = ov::npuw::Function::Spatial;
1608+
using S = ov::npuw::function::Spatial;
16091609
S spatial;
16101610
spatial._range = f_result_0_shape[1];
16111611
spatial._out_dim = 1; // the only case we're looking into now

src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp

+2-14
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <vector>
1111

1212
#include "../lazy_tensor.hpp"
13+
#include "../spatial.hpp"
1314
#include "intel_npu/al/config/config.hpp"
1415
#include "openvino/openvino.hpp"
1516

@@ -70,20 +71,7 @@ struct Function {
7071
// NOTE: it seems it is required only for `matchRepeatedSubgraphs()'
7172
std::map<std::pair<std::string, std::size_t>, std::size_t> _param_mapping;
7273

73-
// Spatial information. So far assume spatial execution in 1 dimension only
74-
struct Spatial {
75-
using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
76-
struct Param {
77-
PPtr param;
78-
std::size_t dim;
79-
};
80-
std::size_t _range = 0u; // Range over which spatial execution is organized, e.g. 1024
81-
std::size_t _slice = 0u; // A submission size for a single execution, e.g. 128
82-
std::size_t _out_dim = 0u; // Assume it is the same dim for all Results
83-
std::vector<Param> _inputs;
84-
};
85-
using SpatialOpt = std::optional<Spatial>;
86-
SpatialOpt _spatial;
74+
std::optional<ov::npuw::function::Spatial> _spatial;
8775
};
8876

8977
struct Group {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Copyright (C) 2023-2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "spatial.hpp"
6+
7+
#include "util.hpp"
8+
9+
ov::npuw::runtime::spatial::AttentionMask::AttentionMask(std::size_t param_idx, const ov::ISyncInferRequest& rq)
    : m_attn_mask_param_idx(param_idx),
      m_rq(rq) {
    // Nothing else to do here - the valid range is computed later, in prepare()
}
12+
13+
ov::npuw::runtime::spatial::Selector::Ptr ov::npuw::runtime::spatial::AttentionMask::find(
    const ov::ISyncInferRequest& rq) {
    // Walk over the request's inputs and pick the parameter which is named
    // "attention_mask" and is shaped either [N] or [1, N].
    const auto& inputs = rq.get_inputs();
    for (std::size_t idx = 0u; idx < inputs.size(); idx++) {
        const auto& port = inputs[idx];
        const auto shape = port.get_shape();
        const bool mask_like_shape = (shape.size() == 1) || (shape.size() == 2 && shape[0] == 1);
        if (port.get_node()->get_friendly_name() == "attention_mask" && mask_like_shape) {
            return Selector::Ptr{new AttentionMask(idx, rq)};
        }
    }
    // No suitable input found - attention-mask-driven dispatch is not possible
    return Selector::Ptr{};
}
29+
30+
void ov::npuw::runtime::spatial::AttentionMask::prepare() {
31+
// Find the current valid range for this attention mask
32+
// Here we have the following (very strong) assumption:
33+
// The attention mask is dense (that is, has zero or one continuous interest region)
34+
const auto& iport = m_rq.get_compiled_model()->inputs()[m_attn_mask_param_idx];
35+
std::tie(m_valid_range_begin, m_valid_range_end) = ov::npuw::util::validMaskRange(m_rq.get_tensor(iport));
36+
}
37+
38+
bool ov::npuw::runtime::spatial::AttentionMask::need_submit(std::size_t offset, std::size_t len) const {
39+
// We don't submit this request if
40+
// - it is completely below the valid range
41+
// - it is completely above the valid range
42+
// in all other cases, we do
43+
return !(offset + len < m_valid_range_begin || offset >= m_valid_range_end);
44+
}

0 commit comments

Comments (0)