Skip to content

Commit aaff252

Browse files
committed
Merge remote-tracking branch 'origin/master' into cpu/compile_mmap
2 parents 9dad4e8 + 0361714 commit aaff252

File tree

28 files changed

+729
-194
lines changed

28 files changed

+729
-194
lines changed

.github/workflows/linux_sanitizers.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ jobs:
204204

205205
CXX_Unit_Tests:
206206
name: C++ unit tests
207-
if: always()
207+
if: ${{ github.repository_owner == 'openvinotoolkit' }}
208208
needs: Build
209209
timeout-minutes: 100
210210
runs-on: 'aks-linux-16-cores-32gb'

src/frontends/pytorch/src/op/addmm.cpp

+2-11
Original file line numberDiff line numberDiff line change
@@ -73,17 +73,8 @@ OutputVector translate_conv1d_ext(const NodeContext& context) {
7373
auto bias = context.get_input(2);
7474
bias = context.mark_node(std::make_shared<ov::op::v1::ConvertLike>(bias, x));
7575

76-
auto neg_one = context.mark_node(v0::Constant::create(element::i32, Shape{1}, {-1}));
77-
auto zero = context.mark_node(v0::Constant::create(element::i32, Shape{1}, {0}));
78-
auto shape_x = context.mark_node(std::make_shared<v3::ShapeOf>(x, element::i32));
79-
auto x_last_dim = context.mark_node(std::make_shared<v8::Gather>(shape_x, neg_one, zero));
80-
auto x_new_shape = context.mark_node(std::make_shared<v0::Concat>(OutputVector{neg_one, x_last_dim}, 0));
81-
82-
auto x_new = context.mark_node(std::make_shared<v1::Reshape>(x, x_new_shape, false));
83-
auto mm = context.mark_node(std::make_shared<v0::MatMul>(x_new, weight));
84-
auto addmm = context.mark_node(std::make_shared<v1::Add>(bias, mm));
85-
auto size_out = context.mark_node(std::make_shared<v12::ScatterElementsUpdate>(shape_x, neg_one, neg_one, zero));
86-
return {context.mark_node(std::make_shared<v1::Reshape>(addmm, size_out, false))};
76+
auto mm = context.mark_node(std::make_shared<v0::MatMul>(x, weight));
77+
return {context.mark_node(std::make_shared<v1::Add>(mm, bias))};
8778
};
8879

8980
} // namespace op

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/mvn_gpu_bfyx_opt.cl

+6-35
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ KERNEL (mvn_gpu_bfyx_opt)(
2929
float my_sum = 0;
3030
float tmp;
3131

32-
__local float lg_storage[SLM_SIZE];
33-
3432
//each WI reads items_num consecutive items from batch*feature
3533
for (uint i=0; i<items_num; ++i)
3634
{
@@ -42,23 +40,7 @@ KERNEL (mvn_gpu_bfyx_opt)(
4240
my_sum += (float)input[data_set_offset + workers_per_data_set * items_num + in_data_set_idx];
4341
}
4442

45-
lg_storage[in_data_set_idx] = my_sum;
46-
47-
barrier(CLK_LOCAL_MEM_FENCE);
48-
for (uint offset = workers_per_data_set / 2; offset > 0; offset /= 2) {
49-
if (in_data_set_idx < offset) {
50-
lg_storage[in_data_set_idx] += lg_storage[in_data_set_idx + offset];
51-
}
52-
barrier(CLK_LOCAL_MEM_FENCE);
53-
}
54-
55-
if (in_data_set_idx == 0)
56-
{
57-
lg_storage[0] /= data_set_size;
58-
}
59-
barrier(CLK_LOCAL_MEM_FENCE);
60-
61-
my_sum = lg_storage[0];
43+
my_sum = work_group_reduce_add(my_sum) / data_set_size;
6244

6345
#if NORMALIZE_VARIANCE == 0
6446
for (uint i=0; i<items_num; ++i) {
@@ -82,7 +64,6 @@ KERNEL (mvn_gpu_bfyx_opt)(
8264
# endif
8365
}
8466
#else
85-
barrier(CLK_LOCAL_MEM_FENCE);
8667

8768
float my_variance = 0.f;
8869
//each WI reads items_num consecutive items from batch*feature
@@ -100,30 +81,20 @@ KERNEL (mvn_gpu_bfyx_opt)(
10081
my_variance = fma(tmp, tmp, my_variance);
10182
}
10283

103-
lg_storage[in_data_set_idx] = my_variance;
104-
105-
barrier(CLK_LOCAL_MEM_FENCE);
106-
107-
for (uint offset = workers_per_data_set / 2; offset > 0; offset /= 2) {
108-
if (in_data_set_idx < offset) {
109-
lg_storage[in_data_set_idx] += lg_storage[in_data_set_idx + offset];
110-
}
111-
barrier(CLK_LOCAL_MEM_FENCE);
112-
}
84+
my_variance = work_group_reduce_add(my_variance);
11385

11486
if (in_data_set_idx == 0)
11587
{
116-
my_variance = lg_storage[0] / data_set_size;
88+
my_variance /= data_set_size;
11789

11890
# if defined EPS_OUTSIDE_SQRT
119-
lg_storage[0] = native_powr(native_sqrt(my_variance) + (float)EPSILON, -1.f);
91+
my_variance = native_powr(native_sqrt(my_variance) + (float)EPSILON, -1.f);
12092
# elif defined EPS_INSIDE_SQRT
121-
lg_storage[0] = native_powr(my_variance + (float)EPSILON, -0.5f);
93+
my_variance = native_powr(my_variance + (float)EPSILON, -0.5f);
12294
# endif
12395
}
124-
barrier(CLK_LOCAL_MEM_FENCE);
12596

126-
my_variance = lg_storage[0];
97+
my_variance = work_group_broadcast(my_variance, 0);
12798

12899
for (uint i=0; i<items_num; ++i) {
129100
uint iteration_in_data_set_offset = i * workers_per_data_set;

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/select_gpu_ref.cl

+5-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,11 @@ KERNEL(select)(
4545
uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
4646
#endif
4747

48-
const OUTPUT_TYPE res = TO_OUTPUT_TYPE(select(INPUT_2, INPUT_1, MASK));
48+
#if INPUT1_IS_FP && !OUTPUT_IS_FP
49+
const OUTPUT_TYPE res = TO_OUTPUT_TYPE(convert_long(select(INPUT_2, INPUT_1, MASK)));
50+
#else
51+
const OUTPUT_TYPE res = TO_OUTPUT_TYPE(select(INPUT_2, INPUT_1, MASK));
52+
#endif
4953

5054
output[output_offset] = res;
5155
}

src/plugins/intel_gpu/src/kernel_selector/kernels/mvn/mvn_kernel_bfyx_opt.cpp

+1-3
Original file line numberDiff line numberDiff line change
@@ -93,14 +93,12 @@ JitConstants MVNKernelBfyxOpt::GetJitConstants(const mvn_params& params, MVNKern
9393
const std::string lws_0 = "get_local_size(0)";
9494
jit.AddConstants({
9595
MakeJitConstant("LWS", lws_0),
96-
MakeJitConstant("SLM_SIZE", dispatchData.maxSlmSize),
97-
MakeJitConstant("DATA_SETS_COUNT", data_set_count),
9896
MakeJitConstant("DATA_SET_SIZE", data_set_size),
97+
MakeJitConstant("DATA_SETS_COUNT", data_set_count),
9998
});
10099
} else {
101100
jit.AddConstants({
102101
MakeJitConstant("LWS", dispatchData.lws[0]),
103-
MakeJitConstant("SLM_SIZE", dispatchData.lws[0]),
104102
MakeJitConstant("DATA_SETS_COUNT", dispatchData.dataSetsCount),
105103
MakeJitConstant("DATA_SET_SIZE", dispatchData.dataSetSize),
106104
});

src/plugins/intel_gpu/tests/unit/fusions/select_fusion_test.cpp

+6
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,12 @@ class SelectFusingTest : public ::BaseFusingTest<select_test_params> {
6363
} // namespace
6464

6565
#define CASE_SELECT_FP32_TO_I8_0 {2, 16, 4, 4}, {2, 16, 4, 4}, data_types::f32, data_types::i8, format::bfyx, format::bfyx
66+
#define CASE_SELECT_FP32_TO_U8_0 {2, 16, 4, 4}, {2, 16, 4, 4}, data_types::f32, data_types::u8, format::bfyx, format::bfyx
6667
#define CASE_SELECT_FP32_TO_F16_0 {2, 16, 17, 4}, {2, 16, 1, 4}, data_types::f32, data_types::f16, format::bfyx, format::bfyx
6768
#define CASE_SELECT_FP16_TO_I8_0 {2, 16, 4, 4}, {2, 16, 4, 4}, data_types::f16, data_types::i8, format::bfyx, format::bfyx
69+
#define CASE_SELECT_FP16_TO_U8_0 {2, 16, 4, 4}, {2, 16, 4, 4}, data_types::f16, data_types::u8, format::bfyx, format::bfyx
6870
#define CASE_SELECT_FP16_TO_I8_1 {2, 16, 4, 4}, {2, 16, 4, 4}, data_types::f16, data_types::i8, format::bfyx, format::bfzyx
71+
#define CASE_SELECT_FP16_TO_U8_1 {2, 16, 4, 4}, {2, 16, 4, 4}, data_types::f16, data_types::u8, format::bfyx, format::bfzyx
6972

7073
class select_reorder_fusion : public SelectFusingTest {};
7174
TEST_P(select_reorder_fusion, basic) {
@@ -85,8 +88,11 @@ TEST_P(select_reorder_fusion, basic) {
8588
INSTANTIATE_TEST_SUITE_P(fusings_gpu, select_reorder_fusion, ::testing::ValuesIn(std::vector<select_test_params>{
8689
select_test_params{ CASE_SELECT_FP32_TO_F16_0, 5, 6},
8790
select_test_params{ CASE_SELECT_FP32_TO_I8_0, 5, 6},
91+
select_test_params{ CASE_SELECT_FP32_TO_U8_0, 5, 6},
8892
select_test_params{ CASE_SELECT_FP16_TO_I8_0, 5, 6},
93+
select_test_params{ CASE_SELECT_FP16_TO_U8_0, 5, 6},
8994
select_test_params{ CASE_SELECT_FP16_TO_I8_1, 6, 6}, // reorder should not be fused
95+
select_test_params{ CASE_SELECT_FP16_TO_U8_1, 6, 6},
9096
}));
9197

9298
class select_reorder_fusion_dynamic : public SelectFusingTest {};

src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale,
5151
DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
5252
DEFINE_OPT(NPUW_PARALLEL_COMPILE, bool, false, npuw::parallel_compilation, CompileTime);
5353
DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
54+
DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
5455
DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime);
5556
DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
5657
DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);

src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp

+8
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,14 @@ static constexpr ov::Property<std::string> submodel_device{"NPUW_SUBMODEL_DEVICE
4545
*/
4646
static constexpr ov::Property<std::string> weights_bank{"NPUW_WEIGHTS_BANK"};
4747

48+
/**
49+
* @brief
50+
* Type: std::string.
51+
* Specify device name for weights bank which is used to allocate memory.
52+
* Default value: "".
53+
*/
54+
static constexpr ov::Property<std::string> weights_bank_alloc{"NPUW_WEIGHTS_BANK_ALLOC"};
55+
4856
/**
4957
* @brief
5058
* Type: std::string.

src/plugins/intel_npu/src/al/src/config/npuw.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
3737
desc.add<NPUW_PARALLEL_COMPILE>();
3838
desc.add<NPUW_FUNCALL_ASYNC>();
3939
desc.add<NPUW_WEIGHTS_BANK>();
40+
desc.add<NPUW_WEIGHTS_BANK_ALLOC>();
4041
desc.add<NPUW_CACHE_DIR>();
4142
desc.add<NPUW_ACC_CHECK>();
4243
desc.add<NPUW_ACC_THRESH>();

src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,19 @@ bool ov::npuw::IBaseInferRequest::needs_copy(std::size_t idx) const {
375375
return true;
376376
}
377377

378+
bool ov::npuw::IBaseInferRequest::needs_copy(std::size_t idx, std::size_t cidx) const {
379+
if (!needs_copy(idx)) {
380+
return false;
381+
}
382+
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];
383+
if (comp_model_desc.is_remote[cidx]) {
384+
// FIXME: Test if the tensor device and the request device are
385+
// the same or compatible!
386+
return false;
387+
}
388+
return true;
389+
}
390+
378391
std::size_t ov::npuw::IBaseInferRequest::next(std::size_t idx_base) const {
379392
// Answer the next valid subrequest which is possible to prepare
380393
// FIXME: this could be a predefined map, not a lookup

src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
127127
std::size_t m_run_iter = 0u;
128128

129129
bool needs_copy(std::size_t idx) const;
130+
bool needs_copy(std::size_t idx, std::size_t cidx) const;
130131
std::size_t next(std::size_t idx_base) const;
131132
std::size_t real(std::size_t idx) const;
132133

src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

+53-17
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
115115

116116
// Initialize weights bank
117117
const std::string weights_bank_opt = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK>();
118-
m_weights_bank = ov::npuw::weights::bank(weights_bank_opt, plugin->get_core());
118+
const std::string wbank_alloc = m_cfg.get<::intel_npu::NPUW_WEIGHTS_BANK_ALLOC>();
119+
m_weights_bank = ov::npuw::weights::bank(weights_bank_opt, plugin->get_core(), wbank_alloc);
119120

120121
LOG_VERB("*** Original model ***");
121122
const auto& orig_parameters = model->get_parameters();
@@ -235,6 +236,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
235236
} // for(ordered_subgraphs)
236237
// NOTE(dm): there's a better way to do it, like we do in G-API backends.
237238

239+
m_update_required = m_cfg.get<::intel_npu::NPUW_FOLD>();
240+
238241
// Store mapping between manually split inputs/outputs
239242
// to connect tensors between compiled submodels
240243
m_submodels_input_to_prev_output = partitioning.input_to_prev_output;
@@ -302,10 +305,11 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
302305
m_compiled_submodels[id].host_gather = subgraph._host_gather;
303306
m_compiled_submodels[id].param_base = fcn_template._param_offset;
304307
m_compiled_submodels[id].closure = subgraph._closure;
308+
m_compiled_submodels[id].lazy_closure = subgraph._lazy_closure;
305309
m_compiled_submodels[id].scales = subgraph._scales;
306310
m_compiled_submodels[id].zerops = subgraph._zerops;
307-
m_compiled_submodels[id].update_required.resize(subgraph._closure.size(), false);
308-
fill_weights_bank(id);
311+
m_compiled_submodels[id].forced_to_fcall = subgraph._forced_to_fcall;
312+
m_compiled_submodels[id].is_remote.resize(subgraph._lazy_closure.size(), false);
309313
} // if(!funcall)
310314

311315
if (!m_compiled_submodels[id].model && !m_compiled_submodels[id].replaced_by) {
@@ -421,6 +425,9 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
421425
}
422426
}
423427

428+
// Finalize memory in closures and weight banks
429+
finalize_weights_bank();
430+
424431
// Print stats report when possible
425432
{
426433
LOG_INFO("Initial device distribution:");
@@ -434,24 +441,54 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
434441
reset_io();
435442
}
436443

437-
void ov::npuw::CompiledModel::fill_weights_bank(const std::size_t idx) {
438-
LOG_VERB("Filling weights bank for Subgraph[" << idx << "]...");
439-
LOG_BLOCK();
444+
void ov::npuw::CompiledModel::finalize_weights_bank() {
445+
// Register lazy tensors
446+
for (std::size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
447+
auto& comp_model_desc = m_compiled_submodels[idx];
440448

441-
NPUW_ASSERT(m_compiled_submodels[idx].replaced_by);
449+
// Skip optimized out and non-functions
450+
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
451+
return;
452+
}
442453

443-
auto& comp_model_desc = m_compiled_submodels[idx];
454+
const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
455+
auto& func_desc = m_compiled_submodels[real_idx];
444456

445-
for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) {
446-
comp_model_desc.closure[cidx] = m_weights_bank->update(comp_model_desc.closure[cidx]);
447-
if (m_cfg.get<::intel_npu::NPUW_FOLD>()) {
448-
comp_model_desc.update_required[cidx] = true;
449-
} else {
450-
comp_model_desc.update_required[cidx] = false;
457+
for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
458+
if (comp_model_desc.closure[tidx]) {
459+
continue; // host-side closure
460+
}
461+
m_weights_bank->registerLT(comp_model_desc.lazy_closure[tidx], *func_desc.device_it);
451462
}
452463
}
453464

454-
LOG_VERB("DONE");
465+
// Evaluate and allocate all LazyTensors inside the bank
466+
m_weights_bank->evaluate_and_allocate();
467+
468+
// Set evaluated and allocated ov::Tensors to closures
469+
for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
470+
auto& comp_model_desc = m_compiled_submodels[idx];
471+
472+
// Skip optimized out and non-functions
473+
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
474+
continue;
475+
}
476+
477+
const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
478+
auto& func_desc = m_compiled_submodels[real_idx];
479+
480+
for (std::size_t tidx = 0; tidx < comp_model_desc.lazy_closure.size(); ++tidx) {
481+
if (comp_model_desc.closure[tidx]) {
482+
// host-side closure - already set, do nothing
483+
comp_model_desc.is_remote[tidx] = false;
484+
continue;
485+
}
486+
const auto& lt = comp_model_desc.lazy_closure[tidx];
487+
comp_model_desc.closure[tidx] = m_weights_bank->get(lt, *func_desc.device_it);
488+
// FIXME: find a more reliable way to do so
489+
comp_model_desc.is_remote[tidx] = m_weights_bank->is_remote(lt);
490+
}
491+
}
455492
}
456493

457494
void ov::npuw::CompiledModel::remove_long_output_names(const std::shared_ptr<ov::Model>& model) {
@@ -748,7 +785,6 @@ void ov::npuw::CompiledModel::implement_properties() {
748785

749786
// 1.
750787
// OV Public
751-
// ===============================================
752788
m_prop_to_opt = {{ov::supported_properties.name(),
753789
{ov::PropertyMutability::RO,
754790
[&](const ::intel_npu::Config&) -> std::vector<PropertyName>& {
@@ -785,7 +821,6 @@ void ov::npuw::CompiledModel::implement_properties() {
785821
return m_loaded_from_cache;
786822
}}},
787823
// OV Public Hints
788-
// =====================================================
789824
{ov::hint::performance_mode.name(),
790825
{ov::PropertyMutability::RO,
791826
[&](const ::intel_npu::Config&) {
@@ -856,6 +891,7 @@ void ov::npuw::CompiledModel::implement_properties() {
856891
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
857892
BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
858893
BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
894+
BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
859895
BIND(npuw::cache_dir, NPUW_CACHE_DIR),
860896
BIND(npuw::accuracy::check, NPUW_ACC_CHECK),
861897
BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH),

src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp

+10-2
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ class CompiledModel : public ov::ICompiledModel {
7272

7373
void implement_properties();
7474

75-
void fill_weights_bank(const std::size_t idx);
75+
void finalize_weights_bank();
7676

7777
std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc;
7878
::intel_npu::Config m_cfg;
@@ -135,10 +135,16 @@ class CompiledModel : public ov::ICompiledModel {
135135
// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
136136
// w.r.t. function calls
137137
std::size_t param_base = 0;
138+
// NB: closure and lazy_closure are of the same size - to preserve proper indexing.
139+
// closure is responsible for host-side tensors (DCOFF, Gather, etc) while
140+
// lazy_closure is used for weights sharing and allocating device memory.
138141
std::vector<ov::Tensor> closure;
142+
std::vector<weights::LazyTensor> lazy_closure;
139143
std::vector<ov::Tensor> scales;
140144
std::vector<ov::Tensor> zerops;
141-
std::vector<bool> update_required;
145+
std::vector<bool> is_remote;
146+
147+
bool forced_to_fcall = false;
142148

143149
// FIXME: Take it out of structure
144150
ov::SoPtr<ov::ICompiledModel> ref_compiled_model;
@@ -149,6 +155,8 @@ class CompiledModel : public ov::ICompiledModel {
149155
};
150156
std::vector<CompiledModelDesc> m_compiled_submodels;
151157

158+
bool m_update_required;
159+
152160
std::function<bool(const ov::SoPtr<ov::ITensor>&, const ov::SoPtr<ov::ITensor>&)> m_acc_check;
153161
std::string m_ref_device;
154162

0 commit comments

Comments
 (0)