[GPU] Minor change (Added comment for kv cache prealloc policy) (#25529)

yeonbok · web-flow · commit dcdfdc586a8a · 2024-07-14T03:46:31.000Z
### Details:
 - Added a detailed description of the kv cache prealloc policy

### Tickets:
 - *ticket-id*
diff --git a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h
@@ -52,7 +52,6 @@ class typed_primitive_inst<kv_cache> : public typed_primitive_inst_base<kv_cache
 
     static std::string to_string(const kv_cache_node& node);
 
-    // Distribute prealloc period to prevent memory peak
     int32_t get_prealloc_iter_num() override;
 
     static void update_pad(layout& l, int64_t pad, int64_t sequence_axis_legacy) {
diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp
@@ -70,6 +70,15 @@ std::string kv_cache_inst::to_string(const kv_cache_node& node) {
 }
 
 int32_t kv_cache_inst::get_prealloc_iter_num() {
+    // - When a kv_cache_inst runs out of the pre-allocated memory and requires additional memory,
+    //   it allocates a new memory and then copies the data in the original memory to the new memory.
+    //   Since the original memory is still assigned to the ReadValue even after the copying is finished,
+    //   we will have 2x memory for the kv cache. The original memory will be released when the ReadValue is
+    //   called, i.e., at the next iteration.
+    // - If this alloc/copy happens at the same time for all the kv cache memory, there will be a memory peak at that
+    //   iteration.
+    // - Therefore, to avoid this situation where the allocation and copying occur simultaneously for all the kv_cache_insts,
+    //   we assign a different prealloc size to each kv cache so that we can prevent a memory peak.
     return 128 + kv_cache_id % 64;
 }
 

Original file line number	Diff line number	Diff line change
`@@ -70,6 +70,15 @@ std::string kv_cache_inst::to_string(const kv_cache_node& node) {`
`70`	`70`	`}`
`71`	`71`
`72`	`72`	`int32_t kv_cache_inst::get_prealloc_iter_num() {`
	`73`	`+ // - When a kv_cache_inst runs out of the pre-allocated memory and requires additional memory,`
	`74`	`+ // it allocates a new memory and then copies the data in the original memory to the new memory.`
	`75`	`+ // Since the original memory is still assigned to the ReadValue even after the copying is finished,`
	`76`	`+ // we will have 2x memory for the kv cache. The original memory will be released when the ReadValue is`
	`77`	`+ // called, i.e., at the next iteration.`
	`78`	`+ // - If this alloc/copy happens at the same time for all the kv cache memory, there will be a memory peak at that`
	`79`	`+ // iteration.`
	`80`	`+ // - Therefore, to avoid this situation where the allocation and copying occur simultaneously for all the kv_cache_insts,`
	`81`	`+ // we assign a different prealloc size to each kv cache so that we can prevent a memory peak.`
`73`	`82`	`return 128 + kv_cache_id % 64;`
`74`	`83`	`}`
`75`	`84`