Skip to content

Commit 71c9359

Browse files
authored
[NPU] Optimize findCommandQueueGroupOrdinal method and get device properties only once (#29292)
### Details:
- *Optimize the findCommandQueueGroupOrdinal method and get device properties only once*

Signed-off-by: Bogdan Pereanu <bogdan.pereanu@intel.com>
1 parent d7e72a0 commit 71c9359

16 files changed

+72
-81
lines changed

src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ class ZeroHostTensor : public ov::ITensor {
1515
public:
1616
ZeroHostTensor(const std::shared_ptr<ov::IRemoteContext>& context,
1717
const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
18+
const ze_device_properties_t& device_properties,
1819
const ov::element::Type element_type,
1920
const ov::Shape& shape,
2021
const Config& config,

src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp

-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,6 @@ class ZeroInferRequest final : public SyncInferRequest {
8585
mutable std::vector<std::vector<std::shared_ptr<ov::ITensor>>> _levelZeroInputTensors;
8686
mutable std::vector<std::shared_ptr<ov::ITensor>> _levelZeroOutputTensors;
8787

88-
ze_device_properties_t _properties = {};
8988
std::shared_ptr<const zeroMemory::HostMemAllocator> _inputAllocator;
9089
std::shared_ptr<const zeroMemory::HostMemAllocator> _outputAllocator;
9190

src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp

+1-2
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@ struct Pipeline {
2222
zeroProfiling::ProfilingQuery& profiling_query,
2323
const std::shared_ptr<zeroProfiling::NpuInferProfiling>& npu_profiling,
2424
const std::vector<std::vector<std::shared_ptr<ov::ITensor>>>& input_tensors,
25-
const std::vector<std::shared_ptr<ov::ITensor>>& output_tensors,
26-
uint32_t group_ordinal);
25+
const std::vector<std::shared_ptr<ov::ITensor>>& output_tensors);
2726

2827
Pipeline(const Pipeline&) = delete;
2928
Pipeline& operator=(const Pipeline&) = delete;

src/plugins/intel_npu/src/backend/include/zero_remote_tensor.hpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class ZeroRemoteTensor final : public RemoteTensor {
1818
public:
1919
ZeroRemoteTensor(const std::shared_ptr<ov::IRemoteContext>& context,
2020
const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
21+
const ze_device_properties_t& device_properties,
2122
const ov::element::Type& element_type,
2223
const ov::Shape& shape,
2324
const Config& config,
@@ -41,7 +42,7 @@ class ZeroRemoteTensor final : public RemoteTensor {
4142

4243
std::shared_ptr<ZeroInitStructsHolder> _init_structs;
4344

44-
ze_device_properties_t _ze_properties = {};
45+
ze_device_properties_t _device_properties = {};
4546

4647
ov::intel_npu::TensorType _tensor_type;
4748
ov::intel_npu::MemType _mem_type;

src/plugins/intel_npu/src/backend/src/zero_device.cpp

+16-3
Original file line numberDiff line numberDiff line change
@@ -184,14 +184,27 @@ ov::SoPtr<ov::IRemoteTensor> ZeroDevice::createRemoteTensor(std::shared_ptr<ov::
184184
ov::intel_npu::TensorType tensor_type,
185185
ov::intel_npu::MemType mem_type,
186186
void* mem) {
187-
return {std::make_shared<
188-
ZeroRemoteTensor>(context, _initStructs, element_type, shape, config, tensor_type, mem_type, mem)};
187+
return {std::make_shared<ZeroRemoteTensor>(context,
188+
_initStructs,
189+
device_properties,
190+
element_type,
191+
shape,
192+
config,
193+
tensor_type,
194+
mem_type,
195+
mem)};
189196
};
190197

191198
ov::SoPtr<ov::ITensor> ZeroDevice::createHostTensor(std::shared_ptr<ov::IRemoteContext> context,
192199
const ov::element::Type& element_type,
193200
const ov::Shape& shape,
194201
const Config& config,
195202
ov::intel_npu::TensorType tensor_type) {
196-
return {std::make_shared<ZeroHostTensor>(context, _initStructs, element_type, shape, config, tensor_type)};
203+
return {std::make_shared<ZeroHostTensor>(context,
204+
_initStructs,
205+
device_properties,
206+
element_type,
207+
shape,
208+
config,
209+
tensor_type)};
197210
};

src/plugins/intel_npu/src/backend/src/zero_host_tensor.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ namespace intel_npu {
1010

1111
ZeroHostTensor::ZeroHostTensor(const std::shared_ptr<ov::IRemoteContext>& context,
1212
const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
13+
const ze_device_properties_t& device_properties,
1314
const ov::element::Type element_type,
1415
const ov::Shape& shape,
1516
const Config& config,
1617
ov::intel_npu::TensorType tensor_type)
1718
: _impl(std::make_shared<ZeroRemoteTensor>(context,
1819
init_structs,
20+
device_properties,
1921
element_type,
2022
shape,
2123
config,

src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp

+1-10
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,6 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
8989
_npuProfiling = std::make_shared<zeroProfiling::NpuInferProfiling>(_initStructs, _config.get<LOG_LEVEL>());
9090
}
9191

92-
_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
93-
THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
94-
zeDeviceGetProperties(_initStructs->getDevice(), &_properties));
95-
9692
_outputAllocator = std::make_shared<const zeroMemory::HostMemAllocator>(_initStructs);
9793
_inputAllocator =
9894
std::make_shared<const zeroMemory::HostMemAllocator>(_initStructs, ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED);
@@ -171,10 +167,6 @@ void ZeroInferRequest::create_pipeline() {
171167
*_outputAllocator,
172168
_graph->get_batch_size());
173169
}
174-
175-
// Find the corresponding command queue group.
176-
_logger.debug("ZeroInferRequest::create_pipeline - findGroupOrdinal");
177-
auto groupOrdinal = zeroUtils::findGroupOrdinal(_initStructs->getDevice(), _properties);
178170
_logger.debug("ZeroInferRequest::create_pipeline - init completed");
179171

180172
// Set new tensors and reset variable state flag if memory updated before creating the pipeline
@@ -210,8 +202,7 @@ void ZeroInferRequest::create_pipeline() {
210202
_profilingQuery,
211203
_npuProfiling,
212204
_levelZeroInputTensors,
213-
_levelZeroOutputTensors,
214-
groupOrdinal);
205+
_levelZeroOutputTensors);
215206

216207
_logger.debug("ZeroInferRequest::create_pipeline - SyncInferRequest completed");
217208
}

src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,7 @@ Pipeline::Pipeline(const Config& config,
2424
zeroProfiling::ProfilingQuery& profiling_query,
2525
const std::shared_ptr<zeroProfiling::NpuInferProfiling>& npu_profiling,
2626
const std::vector<std::vector<std::shared_ptr<ov::ITensor>>>& input_tensors,
27-
const std::vector<std::shared_ptr<ov::ITensor>>& output_tensors,
28-
uint32_t group_ordinal)
27+
const std::vector<std::shared_ptr<ov::ITensor>>& output_tensors)
2928
: _graph(graph),
3029
_config(config),
3130
_id(_graph->get_unique_id()),
@@ -60,7 +59,8 @@ Pipeline::Pipeline(const Config& config,
6059

6160
_command_lists.reserve(_number_of_command_lists);
6261
for (size_t i = 0; i < _number_of_command_lists; i++) {
63-
_command_lists.emplace_back(std::make_unique<CommandList>(init_structs, group_ordinal));
62+
_command_lists.emplace_back(
63+
std::make_unique<CommandList>(init_structs, _graph->get_command_queue_group_ordinal()));
6464
}
6565

6666
if (_sync_output_with_fences) {

src/plugins/intel_npu/src/backend/src/zero_remote_tensor.cpp

+3-5
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ namespace intel_npu {
2323

2424
ZeroRemoteTensor::ZeroRemoteTensor(const std::shared_ptr<ov::IRemoteContext>& context,
2525
const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
26+
const ze_device_properties_t& device_properties,
2627
const ov::element::Type& element_type,
2728
const ov::Shape& shape,
2829
const Config& config,
@@ -33,13 +34,10 @@ ZeroRemoteTensor::ZeroRemoteTensor(const std::shared_ptr<ov::IRemoteContext>& co
3334
_config(config),
3435
_logger("ZeroRemoteContext", _config.get<LOG_LEVEL>()),
3536
_init_structs(init_structs),
37+
_device_properties(device_properties),
3638
_tensor_type(tensor_type),
3739
_mem_type(mem_type),
3840
_mem(mem) {
39-
_ze_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
40-
THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
41-
zeDeviceGetProperties(_init_structs->getDevice(), &_ze_properties));
42-
4341
const auto byte_size = ov::element::get_memory_size(_element_type, shape_size(_shape));
4442

4543
ze_device_external_memory_properties_t desc = {};
@@ -99,7 +97,7 @@ void ZeroRemoteTensor::allocate(const size_t bytes) {
9997
size_t size = (bytes + STANDARD_PAGE_SIZE - 1) & ~(STANDARD_PAGE_SIZE - 1);
10098

10199
ze_host_mem_alloc_desc_t desc = {};
102-
if (_tensor_type == TensorType::INPUT && (_ze_properties.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED)) {
100+
if (_tensor_type == TensorType::INPUT && (_device_properties.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED)) {
103101
ze_host_mem_alloc_flag_t flag = ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED;
104102
desc = {ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, nullptr, static_cast<ze_host_mem_alloc_flags_t>(flag)};
105103
} else {

src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
4242
const std::vector<ArgumentDescriptor>& get_input_descriptors() const;
4343
const std::vector<ArgumentDescriptor>& get_output_descriptors() const;
4444
const std::shared_ptr<CommandQueue>& get_command_queue() const;
45+
uint32_t get_command_queue_group_ordinal() const;
4546

4647
void set_workload_type(const ov::WorkloadType workloadType) const;
4748

@@ -83,6 +84,7 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
8384
std::vector<ArgumentDescriptor> _output_descriptors;
8485

8586
std::shared_ptr<CommandQueue> _command_queue;
87+
uint32_t _command_queue_group_ordinal = 0;
8688
std::vector<std::shared_ptr<Event>> _last_submitted_event;
8789

8890
// Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when the

src/plugins/intel_npu/src/common/src/igraph.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ const std::shared_ptr<CommandQueue>& IGraph::get_command_queue() const {
4747
return _command_queue;
4848
}
4949

50+
uint32_t IGraph::get_command_queue_group_ordinal() const {
51+
return _command_queue_group_ordinal;
52+
}
53+
5054
void IGraph::set_workload_type(const ov::WorkloadType workloadType) const {
5155
if (_command_queue == nullptr) {
5256
return;

src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class ZeGraphExtWrappers {
4848

4949
void setGraphArgumentValue(ze_graph_handle_t graphHandle, uint32_t argi_, const void* argv) const;
5050

51-
void initializeGraph(ze_graph_handle_t graphHandle) const;
51+
void initializeGraph(ze_graph_handle_t graphHandle, uint32_t commandQueueGroupOrdinal) const;
5252

5353
private:
5454
std::unordered_set<std::string> getQueryResultFromSupportedLayers(
@@ -60,7 +60,7 @@ class ZeGraphExtWrappers {
6060
std::vector<IODescriptor>& inputs,
6161
std::vector<IODescriptor>& outputs) const;
6262

63-
void initialize_graph_through_command_list(ze_graph_handle_t graphHandle) const;
63+
void initialize_graph_through_command_list(ze_graph_handle_t graphHandle, uint32_t commandQueueGroupOrdinal) const;
6464

6565
std::shared_ptr<ZeroInitStructsHolder> _zeroInitStruct;
6666
uint32_t _graphExtVersion;

src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp

+5-7
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,9 @@ void DriverGraph::initialize(const Config& config) {
9999
_input_descriptors.shrink_to_fit();
100100
_output_descriptors.shrink_to_fit();
101101

102-
ze_device_properties_t deviceProperties = {};
103-
deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
104-
THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
105-
zeDeviceGetProperties(_zeroInitStruct->getDevice(), &deviceProperties));
106-
auto groupOrdinal = zeroUtils::findGroupOrdinal(_zeroInitStruct->getDevice(), deviceProperties);
102+
_command_queue_group_ordinal =
103+
zeroUtils::findCommandQueueGroupOrdinal(_zeroInitStruct->getDevice(),
104+
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE);
107105

108106
bool turbo = false;
109107
if (config.has<TURBO>()) {
@@ -112,14 +110,14 @@ void DriverGraph::initialize(const Config& config) {
112110

113111
_command_queue = std::make_shared<CommandQueue>(_zeroInitStruct,
114112
zeroUtils::toZeQueuePriority(config.get<MODEL_PRIORITY>()),
115-
groupOrdinal,
113+
_command_queue_group_ordinal,
116114
turbo);
117115

118116
if (config.has<WORKLOAD_TYPE>()) {
119117
set_workload_type(config.get<WORKLOAD_TYPE>());
120118
}
121119

122-
_zeGraphExt->initializeGraph(_handle);
120+
_zeGraphExt->initializeGraph(_handle, _command_queue_group_ordinal);
123121

124122
_logger.debug("Graph initialize finish");
125123

src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp

+5-7
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,9 @@ void PluginGraph::initialize(const Config& config) {
9999
_input_descriptors.shrink_to_fit();
100100
_output_descriptors.shrink_to_fit();
101101

102-
ze_device_properties_t deviceProperties = {};
103-
deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
104-
THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
105-
zeDeviceGetProperties(_zeroInitStruct->getDevice(), &deviceProperties));
106-
auto groupOrdinal = zeroUtils::findGroupOrdinal(_zeroInitStruct->getDevice(), deviceProperties);
102+
_command_queue_group_ordinal =
103+
zeroUtils::findCommandQueueGroupOrdinal(_zeroInitStruct->getDevice(),
104+
ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE);
107105

108106
bool turbo = false;
109107
if (config.has<TURBO>()) {
@@ -112,14 +110,14 @@ void PluginGraph::initialize(const Config& config) {
112110

113111
_command_queue = std::make_shared<CommandQueue>(_zeroInitStruct,
114112
zeroUtils::toZeQueuePriority(config.get<MODEL_PRIORITY>()),
115-
groupOrdinal,
113+
_command_queue_group_ordinal,
116114
turbo);
117115

118116
if (config.has<WORKLOAD_TYPE>()) {
119117
set_workload_type(config.get<WORKLOAD_TYPE>());
120118
}
121119

122-
_zeGraphExt->initializeGraph(_handle);
120+
_zeGraphExt->initializeGraph(_handle, _command_queue_group_ordinal);
123121

124122
if (config.get<BATCH_MODE>() != ov::intel_npu::BatchMode::COMPILER) {
125123
_batch_size = get_batch_size(_metadata);

src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp

+10-13
Original file line numberDiff line numberDiff line change
@@ -162,10 +162,10 @@ void ZeGraphExtWrappers::setGraphArgumentValue(ze_graph_handle_t graphHandle, ui
162162
THROW_ON_FAIL_FOR_LEVELZERO_EXT("zeGraphSetArgumentValue", result, _zeroInitStruct->getGraphDdiTable());
163163
}
164164

165-
void ZeGraphExtWrappers::initializeGraph(ze_graph_handle_t graphHandle) const {
165+
void ZeGraphExtWrappers::initializeGraph(ze_graph_handle_t graphHandle, uint32_t commandQueueGroupOrdinal) const {
166166
if (_zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8) {
167167
_logger.debug("Use initialize_graph_through_command_list for ext version smaller than 1.8");
168-
initialize_graph_through_command_list(graphHandle);
168+
initialize_graph_through_command_list(graphHandle, commandQueueGroupOrdinal);
169169
} else {
170170
_logger.debug("Initialize graph based on graph properties for ext version larger than 1.8");
171171
ze_graph_properties_2_t properties = {};
@@ -179,23 +179,20 @@ void ZeGraphExtWrappers::initializeGraph(ze_graph_handle_t graphHandle) const {
179179
}
180180

181181
if (properties.initStageRequired & ZE_GRAPH_STAGE_COMMAND_LIST_INITIALIZE) {
182-
initialize_graph_through_command_list(graphHandle);
182+
initialize_graph_through_command_list(graphHandle, commandQueueGroupOrdinal);
183183
}
184184
}
185185
}
186186

187-
void ZeGraphExtWrappers::initialize_graph_through_command_list(ze_graph_handle_t graphHandle) const {
188-
ze_device_properties_t deviceProperties = {};
189-
deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
190-
THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
191-
zeDeviceGetProperties(_zeroInitStruct->getDevice(), &deviceProperties));
192-
auto groupOrdinal = zeroUtils::findGroupOrdinal(_zeroInitStruct->getDevice(), deviceProperties);
193-
187+
void ZeGraphExtWrappers::initialize_graph_through_command_list(ze_graph_handle_t graphHandle,
188+
uint32_t commandQueueGroupOrdinal) const {
194189
_logger.debug("initialize_graph_through_command_list init start - create graph_command_list");
195-
CommandList graph_command_list(_zeroInitStruct, groupOrdinal);
190+
CommandList graph_command_list(_zeroInitStruct, commandQueueGroupOrdinal);
196191
_logger.debug("initialize_graph_through_command_list - create graph_command_queue");
197-
std::shared_ptr<CommandQueue> graph_command_queue =
198-
std::make_shared<CommandQueue>(_zeroInitStruct, ZE_COMMAND_QUEUE_PRIORITY_NORMAL, groupOrdinal, false);
192+
std::shared_ptr<CommandQueue> graph_command_queue = std::make_shared<CommandQueue>(_zeroInitStruct,
193+
ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
194+
commandQueueGroupOrdinal,
195+
false);
199196
_logger.debug("initialize_graph_through_command_list - create fence");
200197
Fence fence(graph_command_queue);
201198

0 commit comments

Comments
 (0)