Skip to content

Commit 5872549

Browse files
authored
[GPU] Improve memory pool access performance (openvinotoolkit#22977)
### Details:
- Previously, the memory conflict check compared std::string values (primitive_id), which was time-consuming.
- The check now uses each node's unique_id as the memory dependency (mem_dep) key instead of std::string.

### Tickets:
- 131916
1 parent e70e59b commit 5872549

File tree

12 files changed

+91
-70
lines changed

12 files changed

+91
-70
lines changed

src/plugins/intel_gpu/include/intel_gpu/runtime/memory_pool.hpp

+17-12
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,15 @@ using primitive_id = std::string;
2727
using memory_ptr = std::shared_ptr<memory>;
2828

2929
struct memory_user {
30-
primitive_id _id;
30+
size_t _unique_id;
3131
uint32_t _network_id;
32+
primitive_id _prim_id;
3233

33-
memory_user(primitive_id id, uint32_t network_id)
34-
: _id(id), _network_id(network_id) {}
34+
memory_user(size_t unique_id, uint32_t network_id, primitive_id prim_id)
35+
: _unique_id(unique_id), _network_id(network_id), _prim_id(prim_id) {}
3536

3637
friend std::ostream& operator<<(std::ostream& os, const memory_user& memory_user) {
37-
os << memory_user._id << "(" << memory_user._network_id << ")";
38+
os << memory_user._prim_id << " (unique_id:" << memory_user._unique_id << ", net_id:" << memory_user._network_id << ")";
3839
return os;
3940
}
4041
};
@@ -43,7 +44,7 @@ struct memory_user_comparer {
4344
bool operator()(const memory_user& l_mu, const memory_user& r_mu) const {
4445
if (l_mu._network_id != r_mu._network_id)
4546
return l_mu._network_id < r_mu._network_id;
46-
return l_mu._id < r_mu._id;
47+
return l_mu._unique_id < r_mu._unique_id;
4748
}
4849
};
4950

@@ -91,7 +92,7 @@ class memory_pool {
9192
memory_pool();
9293

9394
memory_ptr alloc_memory(const layout& layout, allocation_type type, bool reset = true);
94-
static bool has_conflict(const memory_set&, const std::set<primitive_id>&, uint32_t network_id);
95+
static bool has_conflict(const memory_set&, const std::set<size_t>&, uint32_t network_id);
9596

9697
std::multimap<uint64_t, memory_record> _non_padded_pool;
9798
std::map<layout, std::list<memory_record>, padded_pool_comparer> _padded_pool;
@@ -103,29 +104,33 @@ class memory_pool {
103104
~memory_pool();
104105
memory_ptr get_memory(const layout& layout,
105106
const primitive_id& id,
107+
size_t unique_id,
106108
uint32_t network_id,
107-
const std::set<primitive_id>& restrictions,
109+
const std::set<size_t>& restrictions,
108110
allocation_type type,
109111
bool reusable = true,
110112
bool reset = true); // get from pool or create memory allocation
111113
memory_ptr get_memory(const layout& layout, allocation_type type, bool reset = true);
112114
memory_ptr get_from_non_padded_pool(const layout& layout,
113-
const primitive_id& id,
115+
const primitive_id& prim_id,
116+
size_t unique_id,
114117
uint32_t network_id,
115-
const std::set<primitive_id>&,
118+
const std::set<size_t>&,
116119
allocation_type type,
117120
bool reset = true);
118121
memory_ptr get_from_padded_pool(const layout& layout,
119-
const primitive_id& id,
122+
const primitive_id& prim_id,
123+
size_t unique_id,
120124
uint32_t network_id,
121-
const std::set<primitive_id>& restrictions,
125+
const std::set<size_t>& restrictions,
122126
allocation_type type);
123127
memory_ptr get_from_across_networks_pool(const layout& layout,
124128
const primitive_id& id,
129+
size_t unique_id,
125130
uint32_t network_id,
126131
allocation_type type);
127132
void clear_pool_for_network(uint32_t network_id);
128-
void release_memory(memory* memory, const primitive_id& id, uint32_t network_id);
133+
void release_memory(memory* memory, const size_t& unique_id, primitive_id prim_id, uint32_t network_id);
129134

130135
size_t get_non_padded_pool_size() {
131136
return _non_padded_pool.size();

src/plugins/intel_gpu/src/graph/graph_optimizer/basic_memory_dependencies.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ using namespace cldnn;
2020
void basic_memory_dependencies::run(program& p) {
2121
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "pass::BasicMemoryDependencies");
2222
auto itr = p.get_processing_order().begin();
23-
std::vector<primitive_id> past_outputs;
23+
std::vector<size_t> past_outputs;
2424
while (itr != p.get_processing_order().end()) {
2525
auto& node = *itr;
2626
itr++;
@@ -62,7 +62,7 @@ void basic_memory_dependencies::run(program& p) {
6262
node->add_memory_dependency(past_outputs);
6363
// if current node is an output add it to the outputs list after restriction.
6464
if (node->is_output()) {
65-
past_outputs.push_back(node->id());
65+
past_outputs.push_back(node->get_unique_id());
6666
if (node->is_type<mutable_data>()) {
6767
// if output is mutable data, then propagate output flag to its dependencies
6868
for (auto& dep : node->get_dependencies()) {

src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -288,10 +288,10 @@ void prepare_quantization::prepare_scale_shift_opt(program &p, quantize_node& qu
288288
p.add_connection(in_shift_node, new_quantize_node);
289289
p.add_connection(out_scale_node, new_quantize_node);
290290
p.add_connection(out_shift_node, new_quantize_node);
291-
new_quantize_node.add_memory_dependency(in_scale_node.id());
292-
new_quantize_node.add_memory_dependency(in_shift_node.id());
293-
new_quantize_node.add_memory_dependency(out_scale_node.id());
294-
new_quantize_node.add_memory_dependency(out_shift_node.id());
291+
new_quantize_node.add_memory_dependency(in_scale_node.get_unique_id());
292+
new_quantize_node.add_memory_dependency(in_shift_node.get_unique_id());
293+
new_quantize_node.add_memory_dependency(out_scale_node.get_unique_id());
294+
new_quantize_node.add_memory_dependency(out_shift_node.get_unique_id());
295295
p.get_processing_order().insert(&new_quantize_node, &in_shift_node);
296296
p.get_processing_order().insert(&new_quantize_node, &in_scale_node);
297297
p.get_processing_order().insert(&new_quantize_node, &out_shift_node);

src/plugins/intel_gpu/src/graph/include/pass_manager.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ class memory_dependency_pass : public base_pass {
327327
explicit memory_dependency_pass(const std::string& pass_name) : base_pass(pass_name) {}
328328
void add_memory_dependency(program_node* node, program_node* dep) {
329329
if (node->can_be_optimized() || !dep->can_be_optimized()) {
330-
node->add_memory_dependency(dep->id());
330+
node->add_memory_dependency(static_cast<int32_t>(dep->get_unique_id()));
331331
} else {
332332
if (node->id() == dep->id()) {
333333
return;

src/plugins/intel_gpu/src/graph/include/primitive_inst.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ class primitive_inst {
188188
}
189189
return _network.get_primitives(users);
190190
}
191-
std::set<primitive_id> get_runtime_memory_dependencies() { return _runtime_memory_dependencies; }
191+
std::set<size_t> get_runtime_memory_dependencies() { return _runtime_memory_dependencies; }
192192

193193
const kernel_impl_params* get_impl_params() const { return _impl_params.get(); }
194194
// return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead
@@ -264,7 +264,7 @@ class primitive_inst {
264264
memory_pool& pool,
265265
const program_node& _node,
266266
const kernel_impl_params& impl_params,
267-
const std::set<primitive_id>& memory_dependencies,
267+
const std::set<size_t>& memory_dependencies,
268268
uint32_t net_id,
269269
bool is_internal,
270270
size_t idx = 0,
@@ -333,7 +333,7 @@ class primitive_inst {
333333
std::vector<cldnn::primitive_id> _exec_dep_ids;
334334

335335
// Set of unique ids of the primitives that this primitive can't share memory buffers with
336-
std::set<primitive_id> _runtime_memory_dependencies;
336+
std::set<size_t> _runtime_memory_dependencies;
337337

338338
// This is sub-network generated on demand to execute unfused primitives sequence instead of single fused primitive
339339
// Needed for dynamic path only, as fusion in some cases may be illegal, but it can't be checked on program build phase,

src/plugins/intel_gpu/src/graph/include/program_node.h

+9-4
Original file line numberDiff line numberDiff line change
@@ -202,9 +202,9 @@ struct program_node {
202202
size_t get_dependency_index(const program_node& node) const;
203203
size_t get_user_index(const program_node& node) const;
204204

205-
std::set<primitive_id> get_memory_dependencies() const;
206-
void add_memory_dependency(primitive_id);
207-
void add_memory_dependency(std::vector<primitive_id>);
205+
std::set<size_t> get_memory_dependencies() const;
206+
void add_memory_dependency(size_t);
207+
void add_memory_dependency(std::vector<size_t>);
208208

209209
template <class PType>
210210
bool have_user_with_type() const {
@@ -425,6 +425,11 @@ struct program_node {
425425
unique_id = cur_id++;
426426
}
427427

428+
void set_unique_id(size_t _id) {
429+
unique_id = _id;
430+
}
431+
432+
428433
static void reset_unique_id() {
429434
cur_id = 0;
430435
}
@@ -473,7 +478,7 @@ struct program_node {
473478
std::list<program_node*> users;
474479

475480
// unique ids of the primitives that cannot reuse the same memory buffers due to execution order conflicts
476-
std::set<primitive_id> memory_dependencies;
481+
std::set<size_t> memory_dependencies;
477482

478483
impl_types impl_type = impl_types::any;
479484
bool constant = false;

src/plugins/intel_gpu/src/graph/network.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1379,7 +1379,7 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
13791379
auto device_mem = inst_mem.get_engine()->allocate_memory(inst_mem.get_layout(), allocation_type::usm_device, false);
13801380
device_mem->copy_from(get_stream(), inst_mem);
13811381
GPU_DEBUG_LOG << "[" << node.id() << ": constant]" << std::endl;
1382-
_memory_pool->release_memory(&inst_mem, node.id(), get_id());
1382+
_memory_pool->release_memory(&inst_mem, node.get_unique_id(), node.id(), get_id());
13831383
instance->set_output_memory(device_mem);
13841384
}
13851385
}

src/plugins/intel_gpu/src/graph/primitive_inst.cpp

+6-7
Original file line numberDiff line numberDiff line change
@@ -152,16 +152,16 @@ static memory::ptr get_memory_from_pool(engine& _engine,
152152
const layout& layout,
153153
allocation_type type,
154154
bool reusable_across_network,
155-
const std::set<std::string>& memory_dependencies,
155+
const std::set<size_t>& memory_dependencies,
156156
bool reset = true,
157157
memory* curr_memory = nullptr) {
158158
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(),
159159
"[GPU] Can't allocate output for dynamic layout without upper bound");
160160
// Use layout with max tensor for dynamic shape with upper bound
161161
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool)) {
162162
if (curr_memory != nullptr)
163-
pool.release_memory(curr_memory, _node.id(), net_id);
164-
return pool.get_memory(layout, _node.id(), net_id, memory_dependencies, type, reusable_across_network, reset);
163+
pool.release_memory(curr_memory, _node.get_unique_id(), _node.id(), net_id);
164+
return pool.get_memory(layout, _node.id(), _node.get_unique_id(), net_id, memory_dependencies, type, reusable_across_network, reset);
165165
}
166166
return pool.get_memory(layout, type, reset);
167167
}
@@ -962,7 +962,7 @@ void primitive_inst::do_runtime_skip_reorder() {
962962
update_memory_dependencies = [&](std::vector<primitive_inst*> users) {
963963
for (auto& user : users) {
964964
GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] add " << id() << " to restriction list of " << user->id() << std::endl;
965-
user->_runtime_memory_dependencies.insert(id());
965+
user->_runtime_memory_dependencies.insert(get_node().get_unique_id());
966966
if (user->can_be_optimized())
967967
update_memory_dependencies(user->get_user_insts());
968968
}
@@ -1465,7 +1465,7 @@ primitive_inst::primitive_inst(network& network)
14651465
, _mem_allocated(false)
14661466
, _type(nullptr) {}
14671467

1468-
primitive_inst::primitive_inst(network& network, program_node const& node, bool allocate_memory)
1468+
primitive_inst::primitive_inst(network & network, program_node const& node, bool allocate_memory)
14691469
: _network(network)
14701470
, _node(&node)
14711471
, _node_output_layout(node.get_output_layout())
@@ -1775,7 +1775,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine,
17751775
memory_pool& pool,
17761776
const program_node& _node,
17771777
const kernel_impl_params& impl_params,
1778-
const std::set<primitive_id>& memory_dependencies,
1778+
const std::set<size_t>& memory_dependencies,
17791779
uint32_t net_id,
17801780
bool is_internal,
17811781
size_t idx,
@@ -2124,5 +2124,4 @@ std::string primitive_inst::get_implementation_name() const {
21242124

21252125
return "undef";
21262126
}
2127-
21282127
} // namespace cldnn

src/plugins/intel_gpu/src/graph/program.cpp

+9-3
Original file line numberDiff line numberDiff line change
@@ -769,7 +769,9 @@ const std::vector<primitive_id>& program::get_allocating_order(bool forced_updat
769769
void program::prepare_memory_dependencies() {
770770
if (!_config.get_property(ov::intel_gpu::enable_memory_pool))
771771
return;
772-
772+
for (auto& node : get_processing_order()) {
773+
node->add_memory_dependency(node->get_unique_id());
774+
}
773775
apply_opt_pass<basic_memory_dependencies>();
774776
apply_opt_pass<skipped_branch_memory_dependencies>();
775777
apply_opt_pass<oooq_memory_dependencies>();
@@ -781,9 +783,13 @@ std::string program::get_memory_dependencies_string() const {
781783
while (itr != processing_order.end()) {
782784
auto& node = *itr;
783785
itr++;
784-
mem_dep = mem_dep.append("primitive: ").append(node->id()).append(" restricted list: ");
786+
mem_dep = mem_dep.append("primitive: ")
787+
.append(node->id())
788+
.append("(unique_id:")
789+
.append(std::to_string(node->get_unique_id()))
790+
.append(") restricted list: ");
785791
for (auto it : node->get_memory_dependencies())
786-
mem_dep = mem_dep.append(it).append(", ");
792+
mem_dep = mem_dep.append(std::to_string(it)).append(",");
787793
mem_dep = mem_dep.append("\n");
788794
}
789795
return mem_dep;

src/plugins/intel_gpu/src/graph/program_node.cpp

+5-4
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ program_node::program_node(std::shared_ptr<primitive> prim, program& prog)
6060
output_layouts.push_back(output_layout);
6161
valid_output_layouts.push_back(false);
6262
}
63-
add_memory_dependency(id());
6463
}
6564
}
6665

@@ -196,11 +195,11 @@ void program_node::remove_dependency(size_t idx) {
196195
dependencies.erase(dependencies.begin() + idx);
197196
}
198197

199-
std::set<primitive_id> program_node::get_memory_dependencies() const { return memory_dependencies; }
198+
std::set<size_t> program_node::get_memory_dependencies() const { return memory_dependencies; }
200199

201-
void program_node::add_memory_dependency(primitive_id prim) { memory_dependencies.insert(prim); }
200+
void program_node::add_memory_dependency(size_t prim) { memory_dependencies.insert(prim); }
202201

203-
void program_node::add_memory_dependency(std::vector<primitive_id> prim_list) {
202+
void program_node::add_memory_dependency(std::vector<size_t> prim_list) {
204203
memory_dependencies.insert(prim_list.begin(), prim_list.end());
205204
}
206205

@@ -639,6 +638,7 @@ void program_node::add_dependant_shape_of_node(const program_node* node) {
639638
}
640639

641640
void program_node::save(cldnn::BinaryOutputBuffer& ob) const {
641+
ob << unique_id;
642642
ob << valid_output_layouts;
643643
ob << output_layouts;
644644

@@ -775,6 +775,7 @@ void program_node::save(cldnn::BinaryOutputBuffer& ob) const {
775775
}
776776

777777
void program_node::load(cldnn::BinaryInputBuffer& ib) {
778+
ib >> unique_id;
778779
ib >> valid_output_layouts;
779780
ib >> output_layouts;
780781

0 commit comments

Comments
 (0)