
Commit e2c6ae9

sunxiaoxia2022, wangleis, azhai219, v-Golubev
authored Apr 1, 2024
Enable fullyconnect parallel for per node core allocation (openvinotoolkit#23593)
### Details:
- integrated [PR19801](openvinotoolkit#19801), [PR23007](openvinotoolkit#23007) and [PR23127](openvinotoolkit#23127)
- enable sub streams for per node core allocation
- update class ModelDistributionPolicy and class SubStreamsMode
- refactor get_model_prefer_threads() with class ModelDistributionPolicy
- remove get_default_latency_streams() since it is always 1 now
- add sub streams to executor for per node core allocation
- improve the performance of the Fully Connected layer on 2-socket Xeon systems

### Tickets:
- 123078, 129972, 132954

---------

Co-authored-by: Shen, Wanglei <wanglei.shen@intel.com>
Co-authored-by: Xiuchuan Zhai <xiuchuan.zhai@intel.com>
Co-authored-by: Vladislav Golubev <vladislav.golubev@intel.com>
1 parent d384662 commit e2c6ae9

38 files changed, +2134 −470 lines
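Note: the feature is driven from the public API through the model distribution policy hint mentioned in the description. The following is only a minimal usage sketch under the assumption that `ov::hint::model_distribution_policy` and `ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL` are the property and value introduced by the linked PRs; the model path and exact property spelling are illustrative, not confirmed by this commit.

```cpp
// Hypothetical usage sketch: compile a model on CPU with the tensor-parallel
// model distribution policy, which is what triggers sub streams for the
// per-socket FullyConnected split on 2-socket Xeon systems.
#include <set>
#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder model path

    ov::AnyMap config{
        {ov::hint::performance_mode.name(), ov::hint::PerformanceMode::LATENCY},
        // Assumed property/value names (from the linked PRs), not defined in this diff.
        {ov::hint::model_distribution_policy.name(),
         std::set<ov::hint::ModelDistributionPolicy>{ov::hint::ModelDistributionPolicy::TENSOR_PARALLEL}},
    };

    auto compiled = core.compile_model(model, "CPU", config);
    auto request = compiled.create_infer_request();
    request.infer();
    return 0;
}
```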
 

src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp

+2
@@ -53,6 +53,8 @@ class OPENVINO_RUNTIME_API CPUStreamsExecutor : public IStreamsExecutor {
 
     int get_socket_id() override;
 
+    void run_sub_stream(Task task, int id) override;
+
 private:
     struct Impl;
     std::unique_ptr<Impl> _impl;
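A minimal illustration (not part of this commit) of what the new override exposes to callers: given an already-constructed `CPUStreamsExecutor` whose config reserved sub streams, a task can be pushed to a specific sub stream much like `run()` pushes to the regular stream pool. The executor construction is assumed and omitted.

```cpp
// Sketch: dispatch one task per sub stream; each sub stream is pinned to one
// socket's cores by the streams_info_table, so task i runs on that socket.
#include <memory>
#include <vector>
#include "openvino/runtime/threading/cpu_streams_executor.hpp"

void dispatch_per_socket(const std::shared_ptr<ov::threading::CPUStreamsExecutor>& executor,
                         const std::vector<ov::threading::Task>& socket_tasks) {
    for (size_t i = 0; i < socket_tasks.size(); ++i) {
        executor->run_sub_stream(socket_tasks[i], static_cast<int>(i));
    }
}
```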

src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp

+42
@@ -64,6 +64,17 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor {
                          //  (for large #streams)
     };
 
+    /**
+     * @enum StreamsMode
+     * @brief This enum contains definition of each sub streams mode, indicating the main stream situation.
+     */
+    enum class StreamsMode {
+        SUB_STREAMS_NULL,        //!< Do not create sub streams
+        SUB_STREAMS_FOR_SOCKET,  //!< Create sub streams for multiple sockets in main stream
+        LATENCY,                 //!< latency mode
+        THROUGHPUT,              //!< throughput mode
+    };
+
 private:
     std::string _name;  //!< Used by `ITT` to name executor threads
     int _streams = 1;   //!< Number of streams.
@@ -82,6 +93,7 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor {
        std::vector<std::vector<int>> _streams_info_table = {};
        std::vector<std::vector<int>> _stream_processor_ids;
        bool _cpu_reservation = false;
+       int _sub_streams = 0;
 
        /**
         * @brief Get and reserve cpu ids based on configuration and hardware information,
@@ -190,6 +202,15 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor {
        int get_thread_binding_offset() const {
            return _threadBindingOffset;
        }
+       int get_sub_streams() const {
+           return _sub_streams;
+       }
+       StreamsMode get_sub_stream_mode() const {
+           const auto proc_type_table = get_proc_type_table();
+           int sockets = proc_type_table.size() > 1 ? static_cast<int>(proc_type_table.size()) - 1 : 1;
+           return _sub_streams > 0 ? StreamsMode::SUB_STREAMS_FOR_SOCKET
+                                   : (_streams <= sockets ? StreamsMode::LATENCY : StreamsMode::THROUGHPUT);
+       }
        bool operator==(const Config& config) {
            if (_name == config._name && _streams == config._streams &&
                _threads_per_stream == config._threads_per_stream && _threadBindingType == config._threadBindingType &&
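A small self-contained restatement of the selection logic above with made-up numbers: the first row of `proc_type_table` describes the whole machine and each remaining row one socket, so a three-row table means two sockets. The enum and logic mirror the header; the concrete values are invented for the example.

```cpp
// Standalone re-statement of get_sub_stream_mode() with sample inputs.
#include <iostream>
#include <vector>

enum class StreamsMode { SUB_STREAMS_NULL, SUB_STREAMS_FOR_SOCKET, LATENCY, THROUGHPUT };

StreamsMode pick_mode(const std::vector<std::vector<int>>& proc_type_table, int streams, int sub_streams) {
    int sockets = proc_type_table.size() > 1 ? static_cast<int>(proc_type_table.size()) - 1 : 1;
    return sub_streams > 0 ? StreamsMode::SUB_STREAMS_FOR_SOCKET
                           : (streams <= sockets ? StreamsMode::LATENCY : StreamsMode::THROUGHPUT);
}

int main() {
    // 2-socket machine: row 0 = whole machine, rows 1..2 = per-socket totals (illustrative values).
    std::vector<std::vector<int>> proc_type_table = {{96, 96, 0, 0}, {48, 48, 0, 0}, {48, 48, 0, 0}};

    std::cout << (pick_mode(proc_type_table, 1, 0) == StreamsMode::LATENCY) << "\n";                 // 1 stream  -> LATENCY
    std::cout << (pick_mode(proc_type_table, 8, 0) == StreamsMode::THROUGHPUT) << "\n";              // 8 streams -> THROUGHPUT
    std::cout << (pick_mode(proc_type_table, 1, 2) == StreamsMode::SUB_STREAMS_FOR_SOCKET) << "\n";  // sub streams take priority
    return 0;
}
```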
@@ -250,6 +271,27 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor {
     * @param task A task to start
     */
    virtual void execute(Task task) = 0;
+
+   /**
+    * @brief Execute ov::Task inside sub stream of task executor context
+    * @param task A task to start
+    * @param id Sub stream id
+    */
+   virtual void run_sub_stream(Task task, int id) = 0;
+
+   /**
+    * @brief Execute all of the tasks and wait for their completion.
+    *        Default run_sub_stream_and_wait() method implementation uses run_sub_stream() pure virtual method
+    *        and higher level synchronization primitives from STL.
+    *        The task is wrapped into std::packaged_task which returns std::future.
+    *        std::packaged_task will call the task and signal to std::future that the task is finished
+    *        or the exception is thrown from task.
+    *        Then std::future is used to wait for task execution completion and
+    *        task exception extraction.
+    * @note run_sub_stream_and_wait() does not copy or capture tasks!
+    * @param tasks A vector of tasks to execute
+    */
+   void run_sub_stream_and_wait(const std::vector<Task>& tasks);
 };
 
 }  // namespace threading
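How a caller is expected to drive the two new entry points, sketched under the assumption that the executor's config reserved one sub stream per extra socket (the FullyConnected tensor-parallel split mentioned in the description). The shard lambdas are placeholders; the only behavior relied on is what the header declares: task `i` is dispatched to sub stream `i`, and the call blocks until every task finished.

```cpp
// Sketch: run per-socket shards of a FullyConnected-style computation on sub
// streams and block until all of them are done (placeholder shard bodies).
#include <memory>
#include <vector>
#include "openvino/runtime/threading/istreams_executor.hpp"

void run_fc_tensor_parallel(const std::shared_ptr<ov::threading::IStreamsExecutor>& executor,
                            size_t num_sub_streams) {
    std::vector<ov::threading::Task> shards;
    for (size_t s = 0; s < num_sub_streams; ++s) {
        shards.emplace_back([s] {
            // placeholder: compute the weight/output slice owned by socket `s`
            (void)s;
        });
    }
    // Dispatches shard i to sub stream i and waits for every shard; an exception
    // thrown inside a shard is rethrown here only after all shards completed.
    executor->run_sub_stream_and_wait(shards);
}
```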

src/inference/src/dev/threading/cpu_streams_executor.cpp

+94 −4
@@ -18,6 +18,7 @@
 #include "openvino/itt.hpp"
 #include "openvino/runtime/system_conf.hpp"
 #include "openvino/runtime/threading/cpu_streams_executor_internal.hpp"
+#include "openvino/runtime/threading/cpu_streams_info.hpp"
 #include "openvino/runtime/threading/executor_manager.hpp"
 #include "openvino/runtime/threading/thread_local.hpp"
 
@@ -58,6 +59,11 @@ struct CPUStreamsExecutor::Impl {
                 _streamId = _impl->_streamIdQueue.front();
                 _impl->_streamIdQueue.pop();
             }
+            if (!_impl->_subStreamIdQueue.empty() && _impl->_subStreamsNum < _impl->_config.get_sub_streams()) {
+                _sub_stream_id = _impl->_subStreamIdQueue.front();
+                _impl->_subStreamIdQueue.pop();
+                _impl->_subStreamsNum++;
+            }
         }
         _numaNodeId =
             _impl->_config.get_streams()
@@ -144,9 +150,8 @@ struct CPUStreamsExecutor::Impl {
                                                       .set_max_threads_per_core(max_threads_per_core)});
             } else {
                 _taskArena.reset(new custom::task_arena{concurrency});
-                _cpu_ids = static_cast<int>(stream_processors.size()) == _impl->_config.get_streams()
-                               ? stream_processors[stream_id]
-                               : _cpu_ids;
+                _cpu_ids =
+                    stream_id < static_cast<int>(stream_processors.size()) ? stream_processors[stream_id] : _cpu_ids;
                 if (_cpu_ids.size() > 0) {
                     CpuSet processMask;
                     int ncpus = 0;
@@ -166,7 +171,8 @@ struct CPUStreamsExecutor::Impl {
         StreamCreateType stream_type;
         const auto org_proc_type_table = get_org_proc_type_table();
         int streams_num = _impl->_config.get_streams();
-        const auto stream_id = streams_num == 0 ? 0 : _streamId % streams_num;
+        const auto stream_id =
+            streams_num == 0 ? 0 : (_sub_stream_id >= 0 ? streams_num + _sub_stream_id : _streamId % streams_num);
         get_cur_stream_info(stream_id,
                             _impl->_config.get_cpu_reservation(),
                             org_proc_type_table,
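The mapping above means sub streams are addressed as rows past the regular streams: with `streams_num` main streams, sub stream `k` resolves to row `streams_num + k` of the stream layout. A tiny illustration with invented numbers:

```cpp
// Illustration of the stream-id mapping above (values are invented):
// streams_num == 2, so main streams use rows 0..1 and sub streams continue at 2.
#include <cassert>

int map_stream_id(int streams_num, int stream_id, int sub_stream_id) {
    return streams_num == 0 ? 0 : (sub_stream_id >= 0 ? streams_num + sub_stream_id : stream_id % streams_num);
}

int main() {
    assert(map_stream_id(2, 5, -1) == 1);  // regular stream thread: round-robin over 2 streams
    assert(map_stream_id(2, 0, 0) == 2);   // sub stream 0 -> row 2 of the stream layout
    assert(map_stream_id(2, 0, 1) == 3);   // sub stream 1 -> row 3
    return 0;
}
```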
@@ -193,6 +199,7 @@ struct CPUStreamsExecutor::Impl {
     int _numaNodeId = 0;
     int _socketId = 0;
     bool _execute = false;
+    int _sub_stream_id = -1;
     std::queue<Task> _taskQueue;
 #if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
     std::unique_ptr<custom::task_arena> _taskArena;
@@ -314,13 +321,17 @@ struct CPUStreamsExecutor::Impl {
         _exectorMgr = executor_manager();
         auto numaNodes = get_available_numa_nodes();
         int streams_num = _config.get_streams();
+        int sub_streams_num = _config.get_sub_streams();
         if (streams_num != 0) {
             std::copy_n(std::begin(numaNodes),
                         std::min<std::size_t>(streams_num, numaNodes.size()),
                         std::back_inserter(_usedNumaNodes));
         } else {
             _usedNumaNodes = numaNodes;
         }
+        if (sub_streams_num > 0) {
+            _subTaskThread.assign(sub_streams_num, std::make_shared<SubQueue>());
+        }
         for (auto streamId = 0; streamId < streams_num; ++streamId) {
             _threads.emplace_back([this, streamId] {
                 openvino::itt::threadName(_config.get_name() + "_" + std::to_string(streamId));
@@ -343,6 +354,31 @@ struct CPUStreamsExecutor::Impl {
             });
         }
         _streams.set_thread_ids_map(_threads);
+
+        for (auto subId = 0; subId < sub_streams_num; ++subId) {
+            _subThreads.emplace_back([this, subId, sub_streams_num] {
+                openvino::itt::threadName(_config.get_name() + "_subthreads" + "_" + std::to_string(subId));
+                for (bool stopped = false; !stopped;) {
+                    Task task;
+                    { _subTaskThread[subId]->que_pop(task, stopped); }
+                    if (task) {
+                        {
+                            std::lock_guard<std::mutex> lock{_streamIdMutex};
+                            if (_subStreamsNum < sub_streams_num) {
+                                _subStreamIdQueue.push(subId);
+                            } else {
+                                std::queue<int> empty;
+                                std::swap(_subStreamIdQueue, empty);
+                            }
+                        }
+                        Execute(task, *(_streams.local()));
+                    }
+                }
+            });
+        }
+        if (_subThreads.size() > 0) {
+            _streams.set_thread_ids_map(_subThreads);
+        }
     }
 
     void Enqueue(Task task) {
@@ -353,6 +389,10 @@ struct CPUStreamsExecutor::Impl {
         _queueCondVar.notify_one();
     }
 
+    void Enqueue_sub(Task task, int id) {
+        _subTaskThread[id]->que_push(std::move(task));
+    }
+
     void Execute(const Task& task, Stream& stream) {
 #if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO
         auto& arena = stream._taskArena;
@@ -382,15 +422,49 @@ struct CPUStreamsExecutor::Impl {
         }
     }
 
+    struct SubQueue {
+        std::mutex _subMutex;
+        std::condition_variable _subQueueCondVar;
+        bool _isSubStopped = false;
+        std::queue<Task> _subTaskQueue;
+
+        SubQueue() {}
+
+        void que_push(Task task) {
+            {
+                std::lock_guard<std::mutex> lock(_subMutex);
+                _subTaskQueue.emplace(std::move(task));
+            }
+            _subQueueCondVar.notify_one();
+        }
+
+        void que_pop(Task& task, bool& stopped) {
+            std::unique_lock<std::mutex> lock(_subMutex);
+            _subQueueCondVar.wait(lock, [&] {
+                return !_subTaskQueue.empty() || (stopped = _isSubStopped);
+            });
+            if (!_subTaskQueue.empty()) {
+                task = std::move(_subTaskQueue.front());
+                _subTaskQueue.pop();
+            }
+        }
+
+        ~SubQueue() {}
+    };
+
     Config _config;
     std::mutex _streamIdMutex;
     int _streamId = 0;
     std::queue<int> _streamIdQueue;
+    std::queue<int> _subStreamIdQueue;
+    int _subStreamsNum = 0;
     std::vector<std::thread> _threads;
+    std::vector<std::thread> _subThreads;
     std::mutex _mutex;
     std::condition_variable _queueCondVar;
     std::queue<Task> _taskQueue;
     bool _isStopped = false;
+    std::vector<std::shared_ptr<SubQueue>> _subTaskThread;
     std::vector<int> _usedNumaNodes;
     CustomThreadLocal _streams;
     std::shared_ptr<ExecutorManager> _exectorMgr;
@@ -424,6 +498,18 @@ CPUStreamsExecutor::~CPUStreamsExecutor() {
             thread.join();
         }
     }
+    for (size_t i = 0; i < _impl->_subTaskThread.size(); i++) {
+        {
+            std::lock_guard<std::mutex> lock(_impl->_subTaskThread[i]->_subMutex);
+            _impl->_subTaskThread[i]->_isSubStopped = true;
+        }
+        _impl->_subTaskThread[i]->_subQueueCondVar.notify_all();
+    }
+    for (auto& thread : _impl->_subThreads) {
+        if (thread.joinable()) {
+            thread.join();
+        }
+    }
 }
 
 void CPUStreamsExecutor::execute(Task task) {
@@ -438,5 +524,9 @@ void CPUStreamsExecutor::run(Task task) {
     }
 }
 
+void CPUStreamsExecutor::run_sub_stream(Task task, int id) {
+    _impl->Enqueue_sub(std::move(task), id);
+}
+
 }  // namespace threading
 }  // namespace ov

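The per-sub-stream `SubQueue` above is a plain blocking task queue: `que_push` wakes the dedicated sub-stream worker, `que_pop` blocks until a task arrives or the queue is stopped (as done in the destructor). A stripped-down, standalone version of the same pattern, independent of the OpenVINO types, looks like this:

```cpp
// Minimal blocking task queue in the same shape as SubQueue.
#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

using Task = std::function<void()>;

struct BlockingTaskQueue {
    std::mutex mtx;
    std::condition_variable cv;
    bool stopped_flag = false;
    std::queue<Task> tasks;

    void push(Task t) {
        {
            std::lock_guard<std::mutex> lock(mtx);
            tasks.emplace(std::move(t));
        }
        cv.notify_one();
    }

    void pop(Task& t, bool& stopped) {
        std::unique_lock<std::mutex> lock(mtx);
        cv.wait(lock, [&] { return !tasks.empty() || (stopped = stopped_flag); });
        if (!tasks.empty()) {
            t = std::move(tasks.front());
            tasks.pop();
        }
    }

    void stop() {
        {
            std::lock_guard<std::mutex> lock(mtx);
            stopped_flag = true;
        }
        cv.notify_all();
    }
};

int main() {
    BlockingTaskQueue q;
    std::thread worker([&] {
        // Same loop shape as the sub-stream worker threads above.
        for (bool stopped = false; !stopped;) {
            Task t;
            q.pop(t, stopped);
            if (t) t();
        }
    });
    q.push([] { std::cout << "task ran on the worker thread\n"; });
    q.stop();
    worker.join();
    return 0;
}
```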
src/inference/src/dev/threading/cpu_streams_executor_internal.cpp

+8 −5
@@ -27,7 +27,7 @@ void get_cur_stream_info(const int stream_id,
     bool cpu_reserve = cpu_reservation;
     bool ecore_used = false;
     for (size_t i = 0; i < streams_info_table.size(); i++) {
-        stream_total += streams_info_table[i][NUMBER_OF_STREAMS];
+        stream_total += std::abs(streams_info_table[i][NUMBER_OF_STREAMS]);
         if (stream_id < stream_total) {
             stream_info_id = i;
             break;
@@ -93,10 +93,10 @@ void reserve_cpu_by_streams_info(const std::vector<std::vector<int>> _streams_in
     bool last_all_proc = false;
 
     for (size_t i = 0; i < _streams_info_table.size(); i++) {
-        if (_streams_info_table[i][NUMBER_OF_STREAMS] > 0) {
+        if (_streams_info_table[i][NUMBER_OF_STREAMS] != 0) {
             stream_pos.push_back(num_streams);
         }
-        num_streams += _streams_info_table[i][NUMBER_OF_STREAMS];
+        num_streams += std::abs(_streams_info_table[i][NUMBER_OF_STREAMS]);
     }
     num_conditions = static_cast<int>(stream_pos.size());
     _stream_processors.assign(num_streams, std::vector<int>());
@@ -107,10 +107,13 @@ void reserve_cpu_by_streams_info(const std::vector<std::vector<int>> _streams_in
         std::vector<std::string> proc_types;
         std::vector<std::string> numa_nodes;
         std::vector<std::string> sockets;
-        if (_streams_info_table[i][NUMBER_OF_STREAMS] > 0) {
+        if (_streams_info_table[i][NUMBER_OF_STREAMS] != 0) {
             streams_table.push_back(_streams_info_table[i]);
+            if (_streams_info_table[i][NUMBER_OF_STREAMS] < 0) {
+                streams_table[streams_table.size() - 1][NUMBER_OF_STREAMS] = 1;
+            }
         }
-        if (last_all_proc && _streams_info_table[i][NUMBER_OF_STREAMS] > 0) {
+        if (last_all_proc && _streams_info_table[i][NUMBER_OF_STREAMS] != 0) {
             last_all_proc = false;
             condition_idx++;
         }
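The sign convention these changes rely on: a streams-info row with `NUMBER_OF_STREAMS == -1` describes one sub stream rather than a regular stream, so totals are accumulated with `std::abs()` and reservation treats such a row as a single stream. A short sketch with invented values; the column index is assumed to be 0 here, the real constant comes from cpu_streams_info.hpp.

```cpp
// Counting reserved streams from a table that mixes regular rows and
// sub-stream rows (NUMBER_OF_STREAMS == -1). Values are illustrative.
#include <cstdlib>
#include <iostream>
#include <vector>

int main() {
    const int NUMBER_OF_STREAMS = 0;  // column index assumed for this sketch
    // one regular latency stream over the whole machine + one sub stream per socket
    std::vector<std::vector<int>> streams_info_table = {{1, 0, 96}, {-1, 0, 48}, {-1, 0, 48}};

    int reserved_streams = 0;
    for (const auto& row : streams_info_table) {
        // std::abs() so the -1 rows still reserve processors for one stream each
        reserved_streams += std::abs(row[NUMBER_OF_STREAMS]);
    }
    std::cout << reserved_streams << "\n";  // prints 3
    return 0;
}
```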

src/inference/src/dev/threading/istreams_executor.cpp

+31
@@ -5,6 +5,7 @@
 #include "openvino/runtime/threading/istreams_executor.hpp"
 
 #include <algorithm>
+#include <future>
 #include <string>
 #include <thread>
 #include <vector>
@@ -303,10 +304,13 @@ void IStreamsExecutor::Config::update_executor_config() {
     // Recaculate _streams, _threads and _threads_per_stream by _streams_info_table
     int num_streams = 0;
     _threads = 0;
+    _sub_streams = 0;
     for (size_t i = 0; i < _streams_info_table.size(); i++) {
         if (_streams_info_table[i][NUMBER_OF_STREAMS] > 0) {
             num_streams += _streams_info_table[i][NUMBER_OF_STREAMS];
             _threads += _streams_info_table[i][NUMBER_OF_STREAMS] * _streams_info_table[i][THREADS_PER_STREAM];
+        } else if (_streams_info_table[i][NUMBER_OF_STREAMS] == -1) {
+            _sub_streams += 1;
         }
     }
     _threads_per_stream = _streams_info_table[0][THREADS_PER_STREAM];
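Putting the loop above into numbers with an invented table (columns assumed here as `{NUMBER_OF_STREAMS, PROC_TYPE, THREADS_PER_STREAM}`): two regular streams with four threads each plus two sub-stream rows give 2 streams, 8 threads and 2 sub streams.

```cpp
// Recomputing stream/thread/sub-stream counts from a sample row set
// (illustrative values; same -1 convention as the diff above).
#include <iostream>
#include <vector>

int main() {
    const int NUMBER_OF_STREAMS = 0, THREADS_PER_STREAM = 2;  // column indices assumed for the sketch
    std::vector<std::vector<int>> table = {{2, 1, 4}, {-1, 1, 24}, {-1, 1, 24}};

    int num_streams = 0, threads = 0, sub_streams = 0;
    for (const auto& row : table) {
        if (row[NUMBER_OF_STREAMS] > 0) {
            num_streams += row[NUMBER_OF_STREAMS];
            threads += row[NUMBER_OF_STREAMS] * row[THREADS_PER_STREAM];
        } else if (row[NUMBER_OF_STREAMS] == -1) {
            sub_streams += 1;
        }
    }
    std::cout << num_streams << " " << threads << " " << sub_streams << "\n";  // 2 8 2
    return 0;
}
```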
@@ -346,5 +350,32 @@ void IStreamsExecutor::Config::set_config_zero_stream() {
     _cpu_reservation = false;
 }
 
+void IStreamsExecutor::run_sub_stream_and_wait(const std::vector<Task>& tasks) {
+    std::vector<std::packaged_task<void()>> packagedTasks;
+    std::vector<std::future<void>> futures;
+    for (std::size_t i = 0; i < tasks.size(); ++i) {
+        packagedTasks.emplace_back([&tasks, i] {
+            tasks[i]();
+        });
+        futures.emplace_back(packagedTasks.back().get_future());
+    }
+    for (std::size_t i = 0; i < tasks.size(); ++i) {
+        run_sub_stream(
+            [&packagedTasks, i] {
+                packagedTasks[i]();
+            },
+            static_cast<int>(i));
+    }
+    // std::future::get will rethrow exception from task.
+    // We should wait all tasks before any exception is thrown.
+    // So wait() and get() for each future moved to separate loops
+    for (auto&& future : futures) {
+        future.wait();
+    }
+    for (auto&& future : futures) {
+        future.get();
+    }
+}
+
 }  // namespace threading
 }  // namespace ov
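The two separate loops at the end exist so that a throwing shard cannot propagate its exception while other shards are still running: `get()` rethrows immediately, so every future is `wait()`-ed first. The same pattern in isolation, outside the executor:

```cpp
// Why wait() runs over all futures before any get(): get() rethrows a stored
// exception, so calling it first could unwind while other tasks still execute.
#include <future>
#include <iostream>
#include <stdexcept>
#include <thread>
#include <vector>

int main() {
    std::vector<std::packaged_task<void()>> tasks;
    tasks.emplace_back([] { throw std::runtime_error("shard 0 failed"); });
    tasks.emplace_back([] { /* long-running shard */ });

    std::vector<std::future<void>> futures;
    for (auto& t : tasks)
        futures.emplace_back(t.get_future());

    std::vector<std::thread> workers;
    for (auto& t : tasks)
        workers.emplace_back(std::move(t));  // run each packaged_task on its own thread

    for (auto& f : futures)
        f.wait();  // everything has finished past this point
    try {
        for (auto& f : futures)
            f.get();  // now it is safe to rethrow the first failure
    } catch (const std::exception& e) {
        std::cout << "caught: " << e.what() << "\n";
    }
    for (auto& w : workers)
        w.join();
    return 0;
}
```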

0 commit comments
