[GPU] release of dynamic layout from memory pool (openvinotoolkit#29148)

michal-miotk · web-flow · commit 5cc808c32796 · 2025-03-03T09:19:26.000Z
### Details:
 - part of accuracy repair of FasterRCNN_Resnet50
 - memory dependencies do not end recursion when the dependency can be optimized
   and is runtime skippable
 - fix documentation: `OV_GPU_Verbose=2` -> `OV_VERBOSE=2`


### Tickets:
 - 101294
diff --git a/src/plugins/intel_gpu/docs/memory_allocation_gpu_plugin.md b/src/plugins/intel_gpu/docs/memory_allocation_gpu_plugin.md
@@ -20,7 +20,7 @@ calls the corresponding memory object wrapper for each allocation type: [gpu_buf
 
 ## Dump memory allocation history
 
-The memory allocation history is being managed by the `engine`, which can be dumped by setting the environment variable `OV_GPU_Verbose=2` if OpenVINO is built with the cmake configuration `ENABLE_DEBUG_CAPS=ON`.
+The memory allocation history is being managed by the `engine`, which can be dumped by setting the environment variable `OV_VERBOSE=2` if OpenVINO is built with the cmake configuration `ENABLE_DEBUG_CAPS=ON`.
 ```cpp
 ...
 GPU_Debug: Allocate 58982400 bytes of usm_host allocation type (current=117969612; max=117969612)
diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h
@@ -320,7 +320,8 @@ class memory_dependency_pass : public base_pass {
             return;
         }
 
-        if ((node->can_be_optimized() && !node->is_runtime_skippable()) || !dep->can_be_optimized()) {
+        if ((!dep->can_be_optimized() || !dep->is_runtime_skippable()) && ((node->can_be_optimized() && !node->is_runtime_skippable())
+            || !dep->can_be_optimized())) {
             node->add_memory_dependency(static_cast<int32_t>(dep->get_unique_id()));
         } else {
             if (node->is_runtime_skippable() || dep->is_runtime_skippable() || dep->can_be_optimized()) {
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -1660,17 +1660,20 @@ void primitive_inst::do_runtime_skip_scatter_update() {
         return;
 
     GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_scatter_update] " << id() << " : check optimizability" << std::endl;
+    const auto& input_layout = _impl_params->get_input_layout(0);
+    const auto& output_layout = _impl_params->get_output_layout(0);
     const auto& idx_layout = _impl_params->get_input_layout(1);
     const auto& update_layout = _impl_params->get_input_layout(2);
 
-    if (idx_layout.count() > 0 && update_layout.count() > 0) {
+    if ((idx_layout.count() > 0 && update_layout.count() > 0) || (get_node().is_type<scatter_elements_update>() && input_layout != output_layout)) {
         // set shape_change to realloc memory for same input shapes
         if (can_be_optimized()) {
             set_flag(ExecutionFlags::SHAPE_CHANGED);
         }
         set_can_be_optimized(false);
         GPU_DEBUG_TRACE_DETAIL << "--- Cannot optimize because idx_layout (" << idx_layout.to_short_string()
-                        << ") and update_layout(" << update_layout.to_short_string() << ") are not zero" << std::endl;
+                        << ") and update_layout(" << update_layout.to_short_string() << ") are not zero"
+                        "or input layout is different than output layout" << std::endl;
         return;
     }
 
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/scatter_update/scatter_elements_update_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/scatter_update/scatter_elements_update_kernel_ref.cpp
@@ -163,6 +163,15 @@ bool ScatterElementsUpdateKernelRef::Validate(const Params& p) const {
     return true;
 }
 
+bool ScatterElementsUpdateKernelRef::SkipKernelExecution(const scatter_elements_update_params& params, size_t kernel_id) const {
+    if (kernel_id == 0) {
+        if (params.outputs[0].LogicalSize() != 0 && params.outputs[0] != params.inputs[0]) {
+            return false;
+        }
+    }
+    return KernelData::SkipKernelExecution(params);
+}
+
 void ScatterElementsUpdateKernelRef::GetUpdateDispatchDataFunc(KernelData& kd) const {
     kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
         const auto& prim_params = static_cast<const scatter_elements_update_params&>(params);
@@ -172,7 +181,7 @@ void ScatterElementsUpdateKernelRef::GetUpdateDispatchDataFunc(KernelData& kd) c
             auto dispatchData = SetDefault(prim_params, i == 1);
             kd.kernels[i].params.workGroups.global = dispatchData.gws;
             kd.kernels[i].params.workGroups.local = dispatchData.lws;
-            kd.kernels[i].skip_execution = KernelData::SkipKernelExecution(prim_params);
+            kd.kernels[i].skip_execution = SkipKernelExecution(prim_params, i);
         }
     };
 }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/scatter_update/scatter_elements_update_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/scatter_update/scatter_elements_update_kernel_ref.h
@@ -34,6 +34,7 @@ class ScatterElementsUpdateKernelRef : public KernelBaseOpenCL {
 
 protected:
     bool Validate(const Params& p) const override;
+    bool SkipKernelExecution(const scatter_elements_update_params& params, size_t kernel_id) const;
     void GetUpdateDispatchDataFunc(KernelData& kd) const override;
 };
 }  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/runtime/memory_pool.cpp b/src/plugins/intel_gpu/src/runtime/memory_pool.cpp
@@ -49,6 +49,10 @@ bool memory_pool::has_conflict(const memory_set& mem_cand,
 void memory_pool::release_memory(memory* mem, const size_t& unique_id, primitive_id prim_id, uint32_t network_id) {
     // check non padded pool first
     auto _layout = mem->get_layout();
+    if (_layout.is_dynamic()) {
+        const auto max_shape = _layout.get_partial_shape().get_max_shape();
+        _layout = _layout.clone_with_other_shape(max_shape);
+    }
     auto type = mem->get_allocation_type();
     const auto _layout_bytes_count = _layout.bytes_count();
 

Original file line number	Diff line number	Diff line change
`@@ -320,7 +320,8 @@ class memory_dependency_pass : public base_pass {`
`320`	`320`	`return;`
`321`	`321`	`}`
`322`	`322`
`323`		`- if ((node->can_be_optimized() && !node->is_runtime_skippable()) \|\| !dep->can_be_optimized()) {`
	`323`	`+ if ((!dep->can_be_optimized() \|\| !dep->is_runtime_skippable()) && ((node->can_be_optimized() && !node->is_runtime_skippable())`
	`324`	`+ \|\| !dep->can_be_optimized())) {`
`324`	`325`	`node->add_memory_dependency(static_cast<int32_t>(dep->get_unique_id()));`
`325`	`326`	`} else {`
`326`	`327`	`if (node->is_runtime_skippable() \|\| dep->is_runtime_skippable() \|\| dep->can_be_optimized()) {`