Skip to content

Commit

Permalink
Reserving CPU resource in CPU inference (#27321)
Browse files Browse the repository at this point in the history
### Details:
- Add property `ov::hint::enable_cpu_reservation` to reserve CPU
resource in CPU inference
- `ov::hint::enable_cpu_reservation` defaults to false, user can
explicitly set it to true to enable CPU reservation.
 - update proc_type_table before stream scheduling in compile_model()

### Tickets:
 - *CVS-155268*
 - *#27083*

---------

Co-authored-by: Shen, Wanglei <wanglei.shen@intel.com>
Co-authored-by: Chen Peter <peter.chen@intel.com>
  • Loading branch information
3 people authored Jan 16, 2025
1 parent 513dcc5 commit c849f72
Show file tree
Hide file tree
Showing 36 changed files with 2,048 additions and 501 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ All parameters must be set before calling ``ov::Core::compile_model()`` in order
- ``ov::hint::num_request``
- ``ov::hint::scheduling_core_type``
- ``ov::hint::enable_hyper_threading``
- ``ov::hint::enable_cpu_reservation``
- ``ov::hint::enable_cpu_pinning``
- ``ov::num_streams``
- ``ov::inference_num_threads``
Expand Down
7 changes: 7 additions & 0 deletions src/inference/dev_api/openvino/runtime/system_conf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,13 @@ OPENVINO_RUNTIME_API std::vector<std::vector<int>> get_proc_type_table();
*/
OPENVINO_RUNTIME_API int get_current_socket_id();

/**
* @brief Returns the NUMA node ID (as recorded in the CPU mapping table) of the
*        CPU on which the calling thread is currently running.
* @ingroup ov_dev_api_system_conf
* @return NUMA node ID from the CPU mapping table
*/
OPENVINO_RUNTIME_API int get_current_numa_node_id();

/**
* @brief Returns a table of original number of processor types without filtering other plugins occupying CPU
* resources. The difference from get_proc_type_table: This is used to get the configuration of current machine. For
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ class OPENVINO_RUNTIME_API CPUStreamsExecutor : public IStreamsExecutor {

std::vector<int> get_rank() override;

void cpu_reset() override;

private:
struct Impl;
std::unique_ptr<Impl> _impl;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <memory>
#include <string>
#include <vector>
#include <mutex>

#include "openvino/runtime/common.hpp"
#include "openvino/runtime/properties.hpp"
Expand Down Expand Up @@ -89,14 +90,15 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor {
ov::hint::SchedulingCoreType::ANY_CORE; //!< PCORE_ONLY and ECORE_ONLY are valid in hybrid core machine,
//!< ANY_CORE is valid in all machines. Core type priority:
//!< physical PCore, ECore, logical PCore
bool _cpu_reservation = false; //!< Whether to reserve current cores which will not be used by other plugin.
//!< If it is true, cpu_pinning defaults to true.
bool _cpu_reservation = false; //!< Whether to reserve current cores which will not be used by other plugin or
//!< compiled model. If it is true, cpu_pinning defaults to true.
bool _cpu_pinning = false; //!< Whether to bind threads to cores.
bool _cores_limit = true; //!< Whether to limit the number of streams and threads by the number of cpu cores
std::vector<std::vector<int>> _streams_info_table = {};
std::vector<std::vector<int>> _stream_processor_ids;
int _sub_streams = 0;
std::vector<int> _rank = {};
bool _add_lock = true;

/**
* @brief Get and reserve cpu ids based on configuration and hardware information,
Expand All @@ -109,6 +111,8 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor {
*/
void update_executor_config();

void update_executor_config(bool lock);

/**
* @brief Set _streams_info_table and _cpu_reservation in cpu streams executor config when nstreams = 0,
* that is, only create one thread with TBB
Expand Down Expand Up @@ -136,7 +140,8 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor {
bool cpu_pinning = false,
bool cores_limit = true,
std::vector<std::vector<int>> streams_info_table = {},
std::vector<int> rank = {})
std::vector<int> rank = {},
bool add_lock = true)
: _name{std::move(name)},
_streams{streams},
_threads_per_stream{threads_per_stream},
Expand All @@ -145,8 +150,9 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor {
_cpu_pinning{cpu_pinning},
_cores_limit{cores_limit},
_streams_info_table{std::move(streams_info_table)},
_rank{rank} {
update_executor_config();
_rank{rank},
_add_lock(add_lock) {
update_executor_config(_add_lock);
}

// These APIs which includes set_property and get_property can not be removed until they will never be called by
Expand Down Expand Up @@ -266,12 +272,19 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor {
*/
virtual std::vector<int> get_rank() = 0;

/**
* @brief Release the CPUs reserved by this executor back to the CPU map table.
*        Effective only when the user sets enable_cpu_reservation = true;
*        otherwise no CPUs were reserved and this is a no-op.
*/
virtual void cpu_reset() = 0;

/**
* @brief Execute the task in the current thread using streams executor configuration and constraints
* @param task A task to start
*/
virtual void execute(Task task) = 0;
};

static std::mutex _streams_executor_mutex;

} // namespace threading
} // namespace ov
17 changes: 17 additions & 0 deletions src/inference/include/openvino/runtime/properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,23 @@ static constexpr Property<std::set<ModelDistributionPolicy>> model_distribution_
*/
static constexpr Property<bool> enable_cpu_pinning{"ENABLE_CPU_PINNING"};

/**
 * @brief This property allows CPU reservation during inference.
 * @ingroup ov_runtime_cpp_prop_api
 *
 * CPU reservation means reserving CPUs that will not be used by any other plugin or
 * compiled model. Developers can use this property to enable or disable CPU reservation
 * during inference on Windows and Linux. macOS does not support CPU reservation, so on
 * that platform the property is always disabled. This property defaults to false.
 *
 * The following code is an example of how to use this property:
 *
 * @code
 * ie.set_property(ov::hint::enable_cpu_reservation(true));
 * ie.set_property(ov::hint::enable_cpu_reservation(false));
 * @endcode
 */
static constexpr Property<bool> enable_cpu_reservation{"ENABLE_CPU_RESERVATION"};

/**
* @brief This property define if using hyper threading during inference.
* @ingroup ov_runtime_cpp_prop_api
Expand Down
26 changes: 18 additions & 8 deletions src/inference/src/dev/threading/cpu_streams_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,6 @@ struct CPUStreamsExecutor::Impl {
_impl->_streamIdQueue.push(_streamId);
}
#if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
if (_impl->_config.get_name().find("StreamsExecutor") == std::string::npos) {
try {
set_cpu_used(_cpu_ids, NOT_USED);
} catch (const ov::Exception&) {
// Destructor should not throw - catch needed for static analysis.
// CPU::CPU() won't throw here as cpu_info() is called from Stream constructor.
}
}
if (nullptr != _observer) {
_observer->observe(false);
}
Expand Down Expand Up @@ -345,6 +337,7 @@ struct CPUStreamsExecutor::Impl {
_exectorMgr = executor_manager();
auto numaNodes = get_available_numa_nodes();
int streams_num = _config.get_streams();
auto processor_ids = _config.get_stream_processor_ids();
if (streams_num != 0) {
std::copy_n(std::begin(numaNodes),
std::min<std::size_t>(streams_num, numaNodes.size()),
Expand All @@ -353,6 +346,10 @@ struct CPUStreamsExecutor::Impl {
_usedNumaNodes = std::move(numaNodes);
}
for (auto streamId = 0; streamId < streams_num; ++streamId) {
if (_config.get_cpu_reservation()) {
std::lock_guard<std::mutex> lock(_cpu_ids_mutex);
_cpu_ids_all.insert(_cpu_ids_all.end(), processor_ids[streamId].begin(), processor_ids[streamId].end());
}
_threads.emplace_back([this, streamId] {
openvino::itt::threadName(_config.get_name() + "_" + std::to_string(streamId));
for (bool stopped = false; !stopped;) {
Expand Down Expand Up @@ -457,6 +454,8 @@ struct CPUStreamsExecutor::Impl {
CustomThreadLocal _streams;
std::shared_ptr<ExecutorManager> _exectorMgr;
bool _isExit = false;
std::vector<int> _cpu_ids_all;
std::mutex _cpu_ids_mutex;
};

int CPUStreamsExecutor::get_stream_id() {
Expand Down Expand Up @@ -492,9 +491,20 @@ std::vector<int> CPUStreamsExecutor::get_rank() {
return stream->_rank;
}

// Release the CPUs this executor reserved (enable_cpu_reservation = true) back to the
// CPU map table. Called from the destructor and potentially from other threads, so all
// access to _cpu_ids_all must be synchronized.
void CPUStreamsExecutor::cpu_reset() {
    // Original code checked empty() and passed _cpu_ids_all to set_cpu_used() WITHOUT
    // holding _cpu_ids_mutex (only clear() was locked), racing with the stream-creation
    // path that appends to _cpu_ids_all under the same mutex. Take ownership of the id
    // list under the lock, then do the (potentially slow) release outside it.
    std::vector<int> reserved_ids;
    {
        std::lock_guard<std::mutex> lock(_impl->_cpu_ids_mutex);
        reserved_ids.swap(_impl->_cpu_ids_all);
    }
    if (!reserved_ids.empty()) {
        // NOTE(review): set_cpu_used may throw and cpu_reset() is invoked from the
        // destructor — confirm callers are comfortable with that (the removed Stream
        // destructor code wrapped a similar call in try/catch).
        set_cpu_used(reserved_ids, NOT_USED);
    }
}

CPUStreamsExecutor::CPUStreamsExecutor(const IStreamsExecutor::Config& config) : _impl{new Impl{config}} {}

CPUStreamsExecutor::~CPUStreamsExecutor() {
cpu_reset();
{
std::lock_guard<std::mutex> lock(_impl->_mutex);
_impl->_isStopped = true;
Expand Down
37 changes: 27 additions & 10 deletions src/inference/src/dev/threading/istreams_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,27 +159,32 @@ void IStreamsExecutor::Config::update_executor_config() {
const auto proc_type_table = get_proc_type_table();
bool streams_info_available = false;

if (proc_type_table.empty()) {
return;
}

if (_cpu_reservation && !_cpu_pinning) {
_cpu_pinning = true;
if (proc_type_table.empty() || proc_type_table[0][ALL_PROC] == 0) {
if (_cpu_reservation) {
OPENVINO_THROW("[ Config ] proc_type_table is empty. No CPU resources available!");
} else {
return;
}
}

if (!_streams_info_table.empty()) {
streams_info_available = true;
std::vector<int> threads_proc_type(HYPER_THREADING_PROC + 1, 0);
int threads_all = 0;
for (size_t i = 0; i < _streams_info_table.size(); i++) {
if (_streams_info_table[i][NUMBER_OF_STREAMS] > 0) {
threads_proc_type[_streams_info_table[i][PROC_TYPE]] +=
int num_threads =
_streams_info_table[i][THREADS_PER_STREAM] * _streams_info_table[i][NUMBER_OF_STREAMS];
threads_proc_type[_streams_info_table[i][PROC_TYPE]] += num_threads;
threads_all += num_threads;
}
}
if (threads_all == 0) {
OPENVINO_THROW("streams_info_table is invalid!");
}
for (size_t i = ALL_PROC; i < threads_proc_type.size(); i++) {
if (threads_proc_type[i] > proc_type_table[0][i]) {
streams_info_available = false;
break;
OPENVINO_THROW("Not enough CPU resources!");
}
}
}
Expand Down Expand Up @@ -269,7 +274,7 @@ void IStreamsExecutor::Config::update_executor_config() {
}
}

if (_cpu_pinning) {
if (_cpu_pinning || _cpu_reservation) {
reserve_available_cpus(_streams_info_table, _stream_processor_ids, _cpu_reservation ? CPU_USED : NOT_USED);
}

Expand Down Expand Up @@ -319,6 +324,17 @@ void IStreamsExecutor::Config::update_executor_config() {
#endif
}

// Overload that optionally serializes config updates across executors.
// @param lock When true, take the global _streams_executor_mutex so concurrent
//             compile_model() calls do not race on the shared CPU map tables.
void IStreamsExecutor::Config::update_executor_config(bool lock) {
    if (lock) {
        // Renamed the guard: the original shadowed the `lock` parameter with a
        // std::lock_guard also named `lock`, and wrapped it in a redundant scope.
        std::lock_guard<std::mutex> guard{_streams_executor_mutex};
        update_executor_config();
    } else {
        update_executor_config();
    }
}

void IStreamsExecutor::Config::set_config_zero_stream() {
std::vector<std::vector<int>> proc_type_table = get_proc_type_table();
int core_type = MAIN_CORE_PROC;
Expand All @@ -333,6 +349,7 @@ void IStreamsExecutor::Config::set_config_zero_stream() {
socket_id = std::max(0, proc_type_table[0][PROC_SOCKET_ID]);
}
_streams_info_table.push_back({1, core_type, 1, numa_id, socket_id});
_cpu_reservation = false;
_cpu_pinning = false;
}

Expand Down
32 changes: 0 additions & 32 deletions src/inference/src/os/cpu_map_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,38 +54,6 @@ class CPU {
std::mutex _cpu_mutex;
int _socket_idx = 0;

private:
/**
* @brief Sort proc_type_table by CPU ID on which application is running. The numa node containing this CPU ID
* will move to first row.
* @param[in] _processor_id CPU ID on which application is running.
* @param[in] _proc_type_table summary table of number of processors per type
* @param[in] _cpu_mapping_table CPU mapping table for each processor
* @return
*/
void sort_table_by_cpu_id(const int _processor_id,
std::vector<std::vector<int>>& _proc_type_table,
const std::vector<std::vector<int>>& _cpu_mapping_table) {
int current_numa_node = 0;
int current_socket = 0;

for (auto& row : _cpu_mapping_table) {
if (_processor_id == row[CPU_MAP_PROCESSOR_ID]) {
current_numa_node = row[CPU_MAP_NUMA_NODE_ID];
current_socket = row[CPU_MAP_SOCKET_ID];
break;
}
}
for (size_t i = 1; i < _proc_type_table.size(); i++) {
if ((current_numa_node == _proc_type_table[i][PROC_NUMA_NODE_ID]) &&
(current_socket == _proc_type_table[i][PROC_SOCKET_ID])) {
std::rotate(_proc_type_table.begin() + 1, _proc_type_table.begin() + i, _proc_type_table.end());
break;
}
}
};

friend class LinuxSortProcTableTests;
};

CPU& cpu_info();
Expand Down
5 changes: 0 additions & 5 deletions src/inference/src/os/lin/lin_system_conf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,11 +320,6 @@ CPU::CPU() {
OPENVINO_THROW("CPU affinity check failed. No CPU is eligible to run inference.");
};

if (_proc_type_table.size() > 1) {
int cur_processor_id = sched_getcpu();
sort_table_by_cpu_id(cur_processor_id, _proc_type_table, _cpu_mapping_table);
}

_org_proc_type_table = _proc_type_table;

cpu_debug();
Expand Down
5 changes: 0 additions & 5 deletions src/inference/src/os/win/win_system_conf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,6 @@ CPU::CPU() {
}
}

if (_proc_type_table.size() > 1) {
int cur_processor_id = GetCurrentProcessorNumber();
sort_table_by_cpu_id(cur_processor_id, _proc_type_table, _cpu_mapping_table);
}

cpu_debug();
}

Expand Down
Loading

0 comments on commit c849f72

Please sign in to comment.