Skip to content

Commit 212be8e

Browse files
authored
NPUW: Dynamic Spatial (#27104)
### Details:
- Introduce a new SPATIAL pipeline which is a shortcut to PIPELINE:REG + ISOLATE:COMPUTE + SPATIAL:ON
- Refactor some code regarding spatial regions handling in models and requests
- Finally, introduce a dynamic dispatch over the spatial range
  - Based on runtime-detected features
  - Can be disabled to measure full-range performance

### Tickets:
- E-143572
1 parent 8822480 commit 212be8e

14 files changed

+270
-55
lines changed

src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
4545
DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime);
4646
DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
4747
DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime);
48-
DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 64, npuw::partitioning::spatial_nway, CompileTime);
48+
DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 128, npuw::partitioning::spatial_nway, CompileTime);
49+
DEFINE_OPT(NPUW_SPATIAL_DYN, bool, true, npuw::partitioning::spatial_dyn, CompileTime);
4950
DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);
5051
DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime);
5152
DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);

src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp

+10-2
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ namespace online {
6767
* @brief
6868
* Type: std::string.
6969
* Specify which partitioning pipeline to run.
70-
* Possible values: "NONE", "INIT", "JUST", "REP", "REG", "COMPUTE".
70+
* Possible values: "NONE", "INIT", "JUST", "REP", "REG", "COMPUTE", "SPATIAL".
7171
* Default value: "REG".
7272
*/
7373
static constexpr ov::Property<std::string> pipeline{"NPUW_ONLINE_PIPELINE"};
@@ -206,10 +206,18 @@ static constexpr ov::Property<bool> spatial{"NPUW_SPATIAL"};
206206
* @brief
207207
* Type: std::size_t.
208208
* Submission size for the spatial execution.
209-
* Default value: 64
209+
* Default value: 128
210210
*/
211211
static constexpr ov::Property<std::size_t> spatial_nway{"NPUW_SPATIAL_NWAY"};
212212

213+
/**
214+
* @brief
215+
* Type: boolean.
216+
* Enable dynamic submission for spatial subgraphs. Requires SPATIAL pipeline to be selected.
217+
* Default value: true
218+
*/
219+
static constexpr ov::Property<bool> spatial_dyn{"NPUW_SPATIAL_DYN"};
220+
213221
/**
214222
* @brief
215223
* Type: boolean

src/plugins/intel_npu/src/al/src/config/npuw.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
3030
desc.add<NPUW_PMM>();
3131
desc.add<NPUW_SPATIAL>();
3232
desc.add<NPUW_SPATIAL_NWAY>();
33+
desc.add<NPUW_SPATIAL_DYN>();
3334
desc.add<NPUW_HOST_GATHER>();
3435
desc.add<NPUW_DCOFF_TYPE>();
3536
desc.add<NPUW_DCOFF_SCALE>();

src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

+4-13
Original file line numberDiff line numberDiff line change
@@ -283,18 +283,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
283283

284284
// Fill in the spatial information, if it is present
285285
if (fcn_template._spatial) {
286-
using S = CompiledModelDesc::Spatial;
287-
S s;
288-
s.range = fcn_template._spatial->_range;
289-
s.nway = fcn_template._spatial->_slice;
290-
s.out_dim = fcn_template._spatial->_out_dim;
291-
s.nway_iters = s.range / s.nway;
292-
s.tail_size = s.range % s.nway;
293-
for (auto&& input : fcn_template._spatial->_inputs) {
294-
std::size_t p_idx = fcn_template._model->get_parameter_index(input.param);
295-
s.params.push_back(S::Param{p_idx, input.dim});
296-
}
297-
m_compiled_submodels[id].spatial = std::move(s);
286+
m_compiled_submodels[id].spatial =
287+
compiled::Spatial(fcn_template._spatial.value(), fcn_template._model);
298288
}
299289
LOG_INFO("Subgraph[" << id << "] is a function body for " << subgraph._funcall);
300290
} else {
@@ -918,7 +908,8 @@ void ov::npuw::CompiledModel::implement_properties() {
918908
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
919909
BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
920910
BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
921-
BIND(npuw::partitioning::spatial, NPUW_SPATIAL_NWAY),
911+
BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
912+
BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
922913
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
923914
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
924915
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),

src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp

+3-15
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (C) 2023 Intel Corporation
1+
// Copyright (C) 2023-2024 Intel Corporation
22
// SPDX-License-Identifier: Apache-2.0
33
//
44

@@ -13,6 +13,7 @@
1313
#include "openvino/runtime/icompiled_model.hpp"
1414
#include "openvino/runtime/so_ptr.hpp"
1515
#include "partitioning/partitioning.hpp"
16+
#include "spatial.hpp"
1617
#include "weights_bank.hpp"
1718

1819
namespace intel_npu {
@@ -123,20 +124,7 @@ class CompiledModel : public ov::ICompiledModel {
123124
std::optional<std::size_t> replaced_by;
124125

125126
Subgraph::Gather host_gather;
126-
struct Spatial {
127-
struct Param {
128-
std::size_t idx;
129-
std::size_t dim;
130-
};
131-
std::vector<Param> params;
132-
std::size_t range = 0u;
133-
std::size_t nway = 0u;
134-
std::size_t out_dim = 0u;
135-
136-
std::size_t nway_iters = 0u;
137-
std::size_t tail_size = 0u;
138-
};
139-
std::optional<Spatial> spatial;
127+
std::optional<ov::npuw::compiled::Spatial> spatial;
140128

141129
// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
142130
// w.r.t. function calls

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp

+33-2
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
199199
// Create infer requests
200200
// Preallocate funcall tensors & substitute function call requests
201201
bool failover_happened = false;
202+
bool has_spatial = false;
202203
for (size_t i = 0; i < m_num_submodels; i++) {
203204
LOG_INFO("Creating infer request for Subgraph[" << i << "]...");
204205
LOG_BLOCK();
@@ -221,6 +222,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
221222

222223
// Initialize the spatial IO placeholders, if required
223224
if (proto_comp_model_desc.spatial) {
225+
has_spatial = true;
226+
224227
m_spatial_io[real_idx].inputs.resize(proto_comp_model_desc.param_base);
225228
m_spatial_io[real_idx].input_tails.resize(proto_comp_model_desc.param_base);
226229
m_spatial_io[real_idx].outputs.resize(num_outputs);
@@ -399,6 +402,24 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptr<ov::npuw::Com
399402
} // for(closure)
400403
LOG_VERB("DONE");
401404
}
405+
406+
// Handle spatial dynamic submission
407+
if (has_spatial) {
408+
if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_SPATIAL_DYN>()) {
409+
LOG_VERB("Finding spatial features...");
410+
LOG_BLOCK();
411+
m_spatial_selector = runtime::spatial::AttentionMask::find(*this);
412+
if (!m_spatial_selector) {
413+
LOG_WARN("Spatial capability is enabled, but no run-time features were found.");
414+
// Fallback selector to ALL
415+
m_spatial_selector.reset(new runtime::spatial::All());
416+
}
417+
} else {
418+
// Just force selector to ALL
419+
m_spatial_selector.reset(new runtime::spatial::All());
420+
}
421+
LOG_VERB("Done");
422+
}
402423
}
403424

404425
void ov::npuw::JustInferRequest::connect_subrequests() {
@@ -506,6 +527,11 @@ void ov::npuw::JustInferRequest::prepare_for_infer() {
506527
LOG_DEBUG("Pre-initializing weights for subgraph[" << id << "]");
507528
unpack_closure(id, m_subrequests[id]);
508529
}
530+
531+
// Adjust spatial input range, if supported
532+
if (m_spatial_selector) {
533+
m_spatial_selector->prepare();
534+
}
509535
LOG_DEBUG("Done");
510536
}
511537

@@ -915,6 +941,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) {
915941
// must be prepared in the m_spatial_io at this point
916942
const auto& spatial = comp_model_desc.spatial.value();
917943
const auto num_outputs = comp_model_desc.compiled_model->outputs().size();
944+
NPUW_ASSERT(m_spatial_selector);
918945

919946
// Create a sparse vector with full input sizes.
920947
// For the access simplicity, its size is aligned with function's
@@ -940,6 +967,10 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) {
940967

941968
std::size_t offset = 0u;
942969
for (std::size_t i = 0u; i < spatial.nway_iters; i++, offset += spatial.nway) {
970+
if (!m_spatial_selector->need_submit(offset, spatial.nway)) {
971+
continue;
972+
}
973+
943974
// Collect spatial inputs for this offset
944975
for (auto&& param : spatial.params) {
945976
const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx];
@@ -963,7 +994,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) {
963994
} // for(full_nway_times)
964995

965996
// Now process the tail, if required
966-
if (spatial.tail_size) {
997+
if (spatial.tail_size && m_spatial_selector->need_submit(offset, spatial.tail_size)) {
967998
// Copy the sub-ranges to spatial inputs
968999
// NOTE: tails buffers are read from/written to at 0th offset!
9691000
for (auto&& param : spatial.params) {
@@ -1085,7 +1116,7 @@ ov::npuw::TensorPtr ov::npuw::JustInferRequest::allocMem(const ov::element::Type
10851116
return ov::get_tensor_impl(ov::Tensor(type, shape));
10861117
}
10871118

1088-
std::lock_guard<std::mutex> guard(m_alloc_mutex);
1119+
// Protect access to shared context(s) - at least among infer requests
10891120
auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
10901121
auto remote_tensor = remote_ctx->create_host_tensor(type, shape);
10911122
return ov::get_tensor_impl(ov::make_tensor(remote_tensor));

src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "openvino/runtime/iremote_context.hpp"
1616
#include "openvino/runtime/make_tensor.hpp"
1717
#include "openvino/runtime/tensor.hpp"
18+
#include "spatial.hpp"
1819

1920
namespace ov {
2021
namespace npuw {
@@ -148,8 +149,10 @@ class JustInferRequest final : public IBaseInferRequest {
148149
};
149150
std::vector<GlobalIO> m_subrequests_gio;
150151

151-
std::mutex m_alloc_mutex;
152152
std::unordered_set<void*> m_input_allocated;
153+
154+
// Represents spatial run-time info
155+
runtime::spatial::Selector::Ptr m_spatial_selector;
153156
};
154157

155158
} // namespace npuw

src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp

+19-6
Original file line numberDiff line numberDiff line change
@@ -267,12 +267,13 @@ void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) {
267267
// Interface to get online partitioning from the model
268268
class Compiler {
269269
enum class Pipeline {
270-
NONE, // Partitioning will consist of a single group with all the Ops
271-
INIT, // Initialize only. The hardest mode, every group has just 1 layer inside
272-
JUST, // "justParitioning" - combination of LHF + Remnants
273-
REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants
274-
REG, // Regularized repeated blocks pipeline -same as REP, but with some strong hints first
275-
COMPUTE // Separates non-foldable compute subgraphs from the model based on predefined rules + REP
270+
NONE, // Partitioning will consist of a single group with all the Ops
271+
INIT, // Initialize only. The hardest mode, every group has just 1 layer inside
272+
JUST, // "justParitioning" - combination of LHF + Remnants
273+
REP, // Repeated blocks pipeline - combination of repeatedBlocks and Remnants
274+
REG, // Regularized repeated blocks pipeline - same as REP, but with some strong hints first
275+
COMPUTE, // Separates non-foldable compute subgraphs from the model based on predefined rules + REP
276+
SPATIAL // Similar to COMPUTE but allows folding
276277
};
277278

278279
template <class C>
@@ -299,6 +300,8 @@ class Compiler {
299300
return Pipeline::REG;
300301
} else if (pipeline_opt == "COMPUTE") {
301302
return Pipeline::COMPUTE;
303+
} else if (pipeline_opt == "SPATIAL") {
304+
return Pipeline::SPATIAL;
302305
} else {
303306
LOG_WARN("Unknown partitioning compiler pipeline " << pipeline_opt << ", switching to REP");
304307
return Pipeline::REP;
@@ -428,6 +431,16 @@ class Compiler {
428431
m_snapshot->setCtx(ctx);
429432
rep();
430433
break;
434+
case Pipeline::SPATIAL:
435+
warn_unused<::intel_npu::NPUW_ONLINE_ISOLATE>();
436+
m_cfg.update(::intel_npu::Config::ConfigMap{{std::string(::intel_npu::NPUW_SPATIAL::key()), "YES"}});
437+
438+
// Manually set predefined isolates and nofolds then do rep() pipeline
439+
// FIXME: initialize via a dedicated function instead of parsing
440+
ctx.isolates = detail::getIsolates(detail::ISOL_PRESETS.at("COMPUTE"));
441+
m_snapshot->setCtx(ctx);
442+
rep();
443+
break;
431444
}
432445

433446
LOG_DEBUG("Online partitioning: group sizes after compilation:");

src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1605,7 +1605,7 @@ void Partitioner::identifySpatialRange(ov::npuw::Function& f) {
16051605
const auto& f_params = f._model->get_parameters();
16061606
NPUW_ASSERT(f_params.size() > 0);
16071607

1608-
using S = ov::npuw::Function::Spatial;
1608+
using S = ov::npuw::function::Spatial;
16091609
S spatial;
16101610
spatial._range = f_result_0_shape[1];
16111611
spatial._out_dim = 1; // the only case we're looking into now

src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp

+2-14
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <vector>
1111

1212
#include "../lazy_tensor.hpp"
13+
#include "../spatial.hpp"
1314
#include "intel_npu/al/config/config.hpp"
1415
#include "openvino/openvino.hpp"
1516

@@ -70,20 +71,7 @@ struct Function {
7071
// NOTE: it seems it is required only for `matchRepeatedSubgraphs()'
7172
std::map<std::pair<std::string, std::size_t>, std::size_t> _param_mapping;
7273

73-
// Spatial information. So far assume spatial execution in 1 dimension only
74-
struct Spatial {
75-
using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
76-
struct Param {
77-
PPtr param;
78-
std::size_t dim;
79-
};
80-
std::size_t _range = 0u; // Range over which spatial execution is organized, e.g. 1024
81-
std::size_t _slice = 0u; // A submission size for a single execution, e.g. 128
82-
std::size_t _out_dim = 0u; // Assume it is the same dim for all Results
83-
std::vector<Param> _inputs;
84-
};
85-
using SpatialOpt = std::optional<Spatial>;
86-
SpatialOpt _spatial;
74+
std::optional<ov::npuw::function::Spatial> _spatial;
8775
};
8876

8977
struct Group {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Copyright (C) 2023-2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "spatial.hpp"
6+
7+
#include "util.hpp"
8+
9+
ov::npuw::runtime::spatial::AttentionMask::AttentionMask(std::size_t param_idx, const ov::ISyncInferRequest& rq)
    : m_attn_mask_param_idx(param_idx),
      m_rq(rq) {
    // Nothing else to do here - the valid range is computed later, in prepare()
}
12+
13+
ov::npuw::runtime::spatial::Selector::Ptr ov::npuw::runtime::spatial::AttentionMask::find(
    const ov::ISyncInferRequest& rq) {
    // Walk over the request's inputs and pick the parameter which is named
    // "attention_mask" and is shaped either [N] or [1, N].
    const auto& inputs = rq.get_inputs();
    for (std::size_t idx = 0u; idx < inputs.size(); idx++) {
        const auto& port = inputs[idx];
        const auto shape = port.get_shape();
        const bool mask_like_shape = (shape.size() == 1) || (shape.size() == 2 && shape[0] == 1);
        if (port.get_node()->get_friendly_name() == "attention_mask" && mask_like_shape) {
            return Selector::Ptr{new AttentionMask(idx, rq)};
        }
    }
    // No suitable input found - attention-mask-driven dispatch is not possible
    return Selector::Ptr{};
}
29+
30+
void ov::npuw::runtime::spatial::AttentionMask::prepare() {
31+
// Find the current valid range for this attention mask
32+
// Here we have the following (very strong) assumption:
33+
// The attention mask is dense (that is, has zero or one continuous interest region)
34+
const auto& iport = m_rq.get_compiled_model()->inputs()[m_attn_mask_param_idx];
35+
std::tie(m_valid_range_begin, m_valid_range_end) = ov::npuw::util::validMaskRange(m_rq.get_tensor(iport));
36+
}
37+
38+
bool ov::npuw::runtime::spatial::AttentionMask::need_submit(std::size_t offset, std::size_t len) const {
39+
// We don't submit this request if
40+
// - it is completely below the valid range
41+
// - it is completely above the valid range
42+
// in all other cases, we do
43+
return !(offset + len < m_valid_range_begin || offset >= m_valid_range_end);
44+
}

0 commit comments

Comments (0)