Commit 4a30cb8

[NPU] Adding support for the set_tensors method (#26823)
### Details:
- *Adding support for the set_tensors method*

set_tensors works differently depending on whether the plugin or the compiler handles the batch:
- if the compiler handles batching, we need to create a contiguous L0 tensor and copy all the user tensors into that big tensor, even when the tensors are part of the same L0 context;
- if the plugin handles batching and the remote tensor feature is supported, no copy is performed when the tensors are part of the same L0 context.

### Tickets:
- *EISW-116494*
1 parent 4d3a534 commit 4a30cb8
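For reference, the user-facing entry point for this feature is the standard OpenVINO batched-input API. A minimal sketch of a caller (the model path, shapes, and batch size are placeholders, not taken from this commit):

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Hypothetical model: any network whose input layout has an N (batch) dimension.
    auto compiled = core.compile_model("model.xml", "NPU");
    ov::InferRequest request = compiled.create_infer_request();

    // One tensor per batch item, each holding exactly one item (batch dimension == 1).
    std::vector<ov::Tensor> items;
    for (size_t i = 0; i < 4; ++i) {
        items.emplace_back(ov::element::f32, ov::Shape{1, 3, 224, 224});
    }

    // Depending on who handles the batch, the plugin either copies these into one
    // contiguous Level Zero tensor (compiler-handled batching) or binds them directly
    // when they already live in the same Level Zero context (plugin-handled batching).
    request.set_input_tensors(0, items);
    request.infer();
    return 0;
}
```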

9 files changed: +872 -135 lines


src/plugins/intel_npu/src/al/include/sync_infer_request.hpp

+22-5
@@ -22,7 +22,7 @@ namespace intel_npu {
  */
 class SyncInferRequest : public ov::IInferRequest {
 public:
-    explicit SyncInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel);
+    explicit SyncInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel, const Config& config);

     /**
      * @brief Gets an input/output tensor for inference.
@@ -50,8 +50,8 @@ class SyncInferRequest : public ov::IInferRequest {
      * @brief Currently there is no support implemented for batches of tensors, thus this call is a simple redirection
      * to the "set_tensor" one.
      */
-    void set_tensors(const ov::Output<const ov::Node>& port,
-                     const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;
+    virtual void set_tensors(const ov::Output<const ov::Node>& port,
+                             const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;

     /**
      * @brief Gets inputs for infer request
@@ -126,6 +126,15 @@ class SyncInferRequest : public ov::IInferRequest {
      */
     void check_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) const;

+    /**
+     * @brief Basic checks for input tensors
+     *
+     * @param port Input port
+     * @param tensors Input tensors
+     */
+    void check_batched_tensors(const ov::Output<const ov::Node>& port,
+                               const std::vector<ov::SoPtr<ov::ITensor>>& tensors) const;
+
     /**
      * @brief Check that all tensors are valid. Throws an exception if it's not.
      */
@@ -153,14 +162,22 @@ class SyncInferRequest : public ov::IInferRequest {
                                                 const ov::Allocator& allocator = {},
                                                 const std::optional<std::size_t> batchSize = std::nullopt) const;

+    bool is_batched_input(size_t idx) const;
+
+    ov::SoPtr<ov::ITensor>& get_user_input(size_t index) const;
+    std::vector<ov::SoPtr<ov::ITensor>>& get_user_inputs(size_t index) const;
+
     // This is intel_npu::ICompiledModel pointer, but need to use OV base class because
     // ov::IInferRequest::get_compiled_model returns a refernce to shared_ptr!
     std::shared_ptr<const ov::ICompiledModel> _compiledModel;

     NetworkMetadata _metadata;

-    mutable std::vector<std::shared_ptr<ov::ITensor>> _userInputTensors;
-    mutable std::vector<std::shared_ptr<ov::ITensor>> _userOutputTensors;
+    Logger _logger;
+
+    // In case set_tensors is called, we receive a vector with N tensors otherwise only 1 tensor is needed
+    mutable std::vector<std::vector<ov::SoPtr<ov::ITensor>>> _userInputTensors;
+    mutable std::vector<ov::SoPtr<ov::ITensor>> _userOutputTensors;

     mutable std::vector<ov::SoPtr<ov::IVariableState>> _variableStates;
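A side note on the data layout introduced above: `_userInputTensors` is now a vector of vectors, with one inner vector per input port holding either a single tensor (the `set_tensor` path) or one tensor per batch item (the `set_tensors` path). A rough, self-contained illustration of that invariant (the names here are ours, not the plugin's):

```cpp
#include <cstddef>
#include <vector>

// Illustrative mirror of the _userInputTensors invariant; TensorPtr stands in for ov::SoPtr<ov::ITensor>.
template <typename TensorPtr>
struct UserInputStorage {
    // One inner vector per input port. Size 1 -> set_tensor() was used; size N -> set_tensors() was used.
    std::vector<std::vector<TensorPtr>> perPort;

    explicit UserInputStorage(std::size_t portCount)
        : perPort(portCount, std::vector<TensorPtr>(1)) {}

    bool is_batched(std::size_t port) const { return perPort.at(port).size() > 1; }
    TensorPtr& single(std::size_t port) { return perPort.at(port).at(0); }          // set_tensor path
    std::vector<TensorPtr>& batched(std::size_t port) { return perPort.at(port); }  // set_tensors path
};
```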

src/plugins/intel_npu/src/al/src/sync_infer_request.cpp

+111-14
@@ -19,11 +19,12 @@ constexpr size_t BATCH_AXIS = 0;

 namespace intel_npu {

-SyncInferRequest::SyncInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel)
+SyncInferRequest::SyncInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel, const Config& config)
     : _compiledModel(compiledModel),
       _metadata(compiledModel->get_network_metadata()),
-      _userInputTensors(_metadata.inputs.size(), nullptr),
-      _userOutputTensors(_metadata.outputs.size(), nullptr) {
+      _logger("SyncInferRequest", config.get<LOG_LEVEL>()),
+      _userInputTensors(_metadata.inputs.size(), std::vector<ov::SoPtr<ov::ITensor>>(1, {nullptr})),
+      _userOutputTensors(_metadata.outputs.size(), {nullptr}) {
     OPENVINO_ASSERT(_compiledModel);

     if (get_outputs().empty()) {
@@ -121,7 +122,7 @@ ov::SoPtr<ov::ITensor> SyncInferRequest::get_tensor(const ov::Output<const ov::N
     OPENVINO_ASSERT(foundPort.found(), "Cannot find tensor for port ", port);

     if (foundPort.is_input()) {
-        return _userInputTensors.at(foundPort.idx);
+        return get_user_input(foundPort.idx);
     }
     return _userOutputTensors.at(foundPort.idx);
 }
@@ -138,17 +139,22 @@ void SyncInferRequest::set_tensor(const ov::Output<const ov::Node>& port, const
     }

     if (foundPort.is_input()) {
-        _userInputTensors.at(foundPort.idx) = tensor._ptr;
+        get_user_input(foundPort.idx) = tensor;
     } else {
-        _userOutputTensors.at(foundPort.idx) = tensor._ptr;
+        _userOutputTensors.at(foundPort.idx) = tensor;
     }
 }

-std::vector<ov::SoPtr<ov::ITensor>> SyncInferRequest::get_tensors(const ov::Output<const ov::Node>& /*port*/) const {
+std::vector<ov::SoPtr<ov::ITensor>> SyncInferRequest::get_tensors(const ov::Output<const ov::Node>& port) const {
     OV_ITT_SCOPED_TASK(ov::itt::domains::Plugin, "get_tensors");

-    // Using batches of tensors is currently not supported by the NPU plugin. In this scenario, the OpenVINO API demands
-    // returning an empty vector.
+    auto foundPort = find_port(port);
+    OPENVINO_ASSERT(foundPort.found(), "Cannot find input tensors for port ", port);
+
+    if (foundPort.is_input() && is_batched_input(foundPort.idx)) {
+        return get_user_inputs(foundPort.idx);
+    }
+
     return {};
 }

@@ -192,11 +198,89 @@ void SyncInferRequest::check_tensor(const ov::Output<const ov::Node>& port,
                     "Tensor data equal nullptr!");
 }

+void SyncInferRequest::check_batched_tensors(const ov::Output<const ov::Node>& port,
+                                             const std::vector<ov::SoPtr<ov::ITensor>>& tensors) const {
+    OPENVINO_ASSERT(!tensors.empty(), "set_input_tensors/set_tensors can't be called with empty tensors");
+    OPENVINO_ASSERT(
+        tensors.size() != 1,
+        "Internal error (plugin): check_batched_tensors is not allowed to have only one tensor inside batch");
+
+    auto layout = ov::layout::get_layout(port);
+
+    int64_t batch_idx;
+
+    if (layout.empty()) {
+        _logger.warning("set_input_tensors/set_tensors layout is not set, assuming batch dimension is found on 0 axis");
+        batch_idx = BATCH_AXIS;
+    } else {
+        OPENVINO_ASSERT(ov::layout::has_batch(layout),
+                        "set_input_tensors/set_tensors can be used only for inputs with N(batch) dimension"
+                        " 'layout' defined. Current layout is ",
+                        layout.to_string());
+        batch_idx = ov::layout::batch_idx(layout);
+    }
+
+    if (batch_idx < 0) {
+        batch_idx += static_cast<int64_t>(tensors[BATCH_AXIS]->get_shape().size());
+    }
+    OPENVINO_ASSERT(batch_idx == BATCH_AXIS,
+                    "set_input_tensors/set_tensors is not currently supported for batch dimension index ",
+                    batch_idx,
+                    " != 0");
+    std::for_each(tensors.begin(), tensors.end(), [&batch_idx](const ov::SoPtr<ov::ITensor>& item) {
+        OPENVINO_ASSERT(item, "Unintialized tensor is provided!");
+        OPENVINO_ASSERT(item->get_shape()[batch_idx] == 1,
+                        "set_input_tensors/set_tensors. Tensors shall represent one item in a batch, ",
+                        item->get_shape()[batch_idx],
+                        " provided");
+    });
+    auto tensors_size = static_cast<int>(tensors.size());
+    if (port.get_partial_shape().rank().is_static()) {
+        OPENVINO_ASSERT(batch_idx >= 0 && batch_idx < port.get_partial_shape().rank().get_length(),
+                        "set_input_tensors/set_tensors error. Layout ",
+                        layout.to_string(),
+                        " is incorrect for operation with shape ",
+                        port.get_partial_shape());
+        auto batch = port.get_partial_shape()[batch_idx];
+
+        OPENVINO_ASSERT(batch.is_dynamic() || batch.get_length() == tensors_size,
+                        "set_input_tensors/set_tensors error. Input shape ",
+                        port.get_partial_shape(),
+                        "batch ",
+                        batch,
+                        "doesn't match with total blobs count: ",
+                        tensors_size);
+    }
+
+    auto batched_shape = tensors[BATCH_AXIS]->get_shape();
+    auto element_type = tensors[BATCH_AXIS]->get_element_type();
+    batched_shape[batch_idx] = tensors_size;
+    for (const auto& item : tensors) {
+        OPENVINO_ASSERT(item, "Unintialized tensor is provided!");
+        auto item_shape = item->get_shape();
+        item_shape[batch_idx] = batched_shape[batch_idx];
+        OPENVINO_ASSERT(item_shape == batched_shape && item->get_element_type() == element_type &&
+                        "set_input_tensors/set_tensors error. Tensor with element type ",
+                        item->get_element_type(),
+                        " and shape ",
+                        item_shape,
+                        " is not compatible with batched tensor with element type ",
+                        element_type,
+                        " and shape ",
+                        batched_shape);
+        OPENVINO_ASSERT(item->is_continuous(), "Strides for batched tensors should be default.");
+    }
+}
+
 void SyncInferRequest::check_tensors() const {
     const auto& inputs = _compiledModel->inputs();
     for (size_t i = 0; i < inputs.size(); i++) {
-        if (_userInputTensors.at(i)) {
-            check_tensor(inputs[i], _userInputTensors.at(i));
+        if (is_batched_input(i)) {
+            check_batched_tensors(inputs[i], get_user_inputs(i));
+            continue;
+        }
+        if (get_user_input(i)) {
+            check_tensor(inputs[i], get_user_input(i));
         }
     }

@@ -229,16 +313,16 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::allocate_tensor(const IODescripto
         OPENVINO_ASSERT(descriptor.relatedDescriptorIndex.has_value(),
                         "The link between state descriptors is missing, state name: ",
                         descriptor.nameFromCompiler);
-        tensor = _userInputTensors.at(*descriptor.relatedDescriptorIndex);
+        tensor = get_user_input(*descriptor.relatedDescriptorIndex)._ptr;
     } else if (allocator) {
         tensor = ov::make_tensor(descriptor.precision, allocatedTensorShape, allocator);
     } else {
        tensor = ov::make_tensor(descriptor.precision, allocatedTensorShape);
     }

     if (isInput) {
-        if (_userInputTensors.at(index) == nullptr) {
-            _userInputTensors.at(index) = tensor;
+        if (get_user_input(index) == nullptr) {
+            get_user_input(index) = tensor;
         }

         if (descriptor.isStateInput) {
@@ -250,4 +334,17 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::allocate_tensor(const IODescripto

     return tensor;
 }
+
+bool SyncInferRequest::is_batched_input(size_t idx) const {
+    return _userInputTensors.at(idx).size() > 1;
+}
+
+ov::SoPtr<ov::ITensor>& SyncInferRequest::get_user_input(size_t index) const {
+    return _userInputTensors.at(index).at(0);
+}
+
+std::vector<ov::SoPtr<ov::ITensor>>& SyncInferRequest::get_user_inputs(size_t index) const {
+    return _userInputTensors.at(index);
+}
+
 } // namespace intel_npu
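In short, the new check_batched_tensors resolves the batch axis from the port layout (warning and assuming axis 0 when no layout is set), rejects any axis other than 0, and then requires every tensor to carry exactly one batch item with a matching shape, element type, and default strides. A distilled, standalone restatement of the shape part of that check (simplified, not the plugin code):

```cpp
#include <cstdint>
#include <stdexcept>
#include <vector>

using Shape = std::vector<std::size_t>;

// Simplified restatement of the shape checks performed by check_batched_tensors() above.
void validate_batched_shapes(const std::vector<Shape>& shapes, std::int64_t batch_idx = 0) {
    if (shapes.size() < 2) {
        throw std::invalid_argument("a batched call needs at least two tensors");
    }
    if (batch_idx < 0) {
        batch_idx += static_cast<std::int64_t>(shapes.front().size());  // normalize a negative axis
    }
    if (batch_idx != 0) {
        throw std::invalid_argument("only batch dimension 0 is currently supported");
    }
    const Shape& reference = shapes.front();
    for (const Shape& shape : shapes) {
        if (shape.at(batch_idx) != 1) {
            throw std::invalid_argument("each tensor must represent exactly one item in the batch");
        }
        Shape normalized = shape;
        normalized[batch_idx] = reference[batch_idx];
        if (normalized != reference) {
            throw std::invalid_argument("all tensors in the batch must share the same shape");
        }
    }
}
```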

src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp

+11-3
@@ -27,6 +27,8 @@ class ZeroInferRequest final : public SyncInferRequest {

     ov::SoPtr<ov::ITensor> get_tensor(const ov::Output<const ov::Node>& port) const override;
     void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;
+    void set_tensors(const ov::Output<const ov::Node>& port,
+                     const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;

     void infer() override;
     void infer_async() override;
@@ -54,7 +56,7 @@ class ZeroInferRequest final : public SyncInferRequest {
      * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside
      * the plugin.
      */
-    std::optional<size_t> getBatchSize(const NetworkMetadata& metadata);
+    std::optional<size_t> get_batch_size(const NetworkMetadata& metadata);

     /**
      * @brief Check the received tensor and set the Level Zero tensor accordingly
@@ -75,6 +77,12 @@ class ZeroInferRequest final : public SyncInferRequest {
     void check_network_precision(const ov::element::Type_t precision) const override;
     void create_pipeline();

+    std::shared_ptr<ov::ITensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
+    std::vector<std::shared_ptr<ov::ITensor>>& get_level_zero_inputs(size_t index) const;
+
+    std::optional<TensorData>& get_input_tensor_data(size_t index, size_t tensorNo = 0) const;
+    std::vector<std::optional<TensorData>>& get_input_tensors_data(size_t index) const;
+
     const std::shared_ptr<ZeroInitStructsHolder> _initStructs;
     const std::shared_ptr<const IExecutor> _executorPtr;
     const ZeroExecutor* _executor;
@@ -83,10 +91,10 @@ class ZeroInferRequest final : public SyncInferRequest {

     // A copy of each tensor is needed to maintain the original L0 memory allocation in case the user provides another
     // memory area for the tensor.
-    mutable std::vector<std::shared_ptr<ov::ITensor>> _levelZeroInputTensors;
+    mutable std::vector<std::vector<std::shared_ptr<ov::ITensor>>> _levelZeroInputTensors;
     mutable std::vector<std::shared_ptr<ov::ITensor>> _levelZeroOutputTensors;

-    mutable std::vector<std::optional<TensorData>> _inputTensorsData;
+    mutable std::vector<std::vector<std::optional<TensorData>>> _inputTensorsData;
     mutable std::vector<std::optional<TensorData>> _outputTensorsData;

     ze_device_properties_t _properties = {};

src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp

+3-2
@@ -25,7 +25,7 @@ struct Pipeline {
              zeroProfiling::ProfilingPool& profiling_pool,
              zeroProfiling::ProfilingQuery& profiling_query,
              std::shared_ptr<zeroProfiling::NpuInferProfiling> npu_profiling,
-             const std::vector<std::optional<TensorData>>& inputTensorsData,
+             const std::vector<std::vector<std::optional<TensorData>>>& inputTensorsData,
              const std::vector<std::optional<TensorData>>& outputTensorsData,
              const size_t numberOfCommandLists);

@@ -37,7 +37,8 @@ struct Pipeline {
     void pull();
     void reset() const;

-    void updateCommandList(const TensorData& tensorsData, const uint32_t index);
+    void updateCommandList(const TensorData& tensorsData, uint32_t index);
+    void updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex);

 protected:
     const Config _config;
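The second updateCommandList overload, taking a commandListIndex, is presumably what lets the infer request patch each command list with its own batch item when a batched request is split across several command lists. A rough sketch of how a caller might drive the two overloads (the helper function and its signature are ours, purely illustrative, and it assumes the Pipeline and TensorData declarations from zero_pipeline.hpp above):

```cpp
// Illustrative helper, not part of the commit: update one pipeline argument either for all
// command lists at once (single tensor) or per command list (one tensor per batch item).
void update_pipeline_argument(Pipeline& pipeline,
                              const std::vector<TensorData>& tensorsData,
                              uint32_t argIndex) {
    if (tensorsData.size() == 1) {
        pipeline.updateCommandList(tensorsData.front(), argIndex);  // same data for every command list
        return;
    }
    for (size_t commandListIndex = 0; commandListIndex < tensorsData.size(); ++commandListIndex) {
        pipeline.updateCommandList(tensorsData[commandListIndex], argIndex, commandListIndex);
    }
}
```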
