File tree 2 files changed +9
-1
lines changed
src/plugins/intel_gpu/src/graph
2 files changed +9
-1
lines changed Original file line number Diff line number Diff line change @@ -52,7 +52,6 @@ class typed_primitive_inst<kv_cache> : public typed_primitive_inst_base<kv_cache
52
52
53
53
static std::string to_string (const kv_cache_node& node);
54
54
55
- // Distribute prealloc period to prevent memory peak
56
55
int32_t get_prealloc_iter_num () override ;
57
56
58
57
static void update_pad (layout& l, int64_t pad, int64_t sequence_axis_legacy) {
Original file line number Diff line number Diff line change @@ -70,6 +70,15 @@ std::string kv_cache_inst::to_string(const kv_cache_node& node) {
70
70
}
71
71
72
72
int32_t kv_cache_inst::get_prealloc_iter_num () {
73
+ // - When a kv_cache_inst runs out of the pre-allocated memory and requires additional memory,
74
+ // it allocates new memory and then copies the data from the original memory to the new memory.
75
+ // Since the original memory is still assigned to the ReadValue, even after the copying is finished,
76
+ // we will have 2x the memory for the kv cache. The original memory will be released when the ReadValue is
77
+ // called, i.e., at the next iteration.
78
+ // - If this alloc/copy happens at the same time for all the kv cache memory, there will be a memory peak at that
79
+ // iteration.
80
+ // - Therefore, to avoid this situation where the allocation and copying occur simultaneously for all the kv_cache_insts,
81
+ // we assign a different prealloc size to each kv cache so that we can prevent a memory peak
73
82
return 128 + kv_cache_id % 64 ;
74
83
}
75
84
You can’t perform that action at this time.
0 commit comments