[GPU] Remove force exit (#29062)

jade-cho · web-flow · commit e6b90c34da92 · 2025-02-25T17:17:00.000Z
### Tickets:
 - *162459*
diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h
@@ -543,7 +543,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
                 _prim.execute(stream.get_onednn_stream(), _args[net_id]);
             } catch (dnnl::error& err) {
                 auto err_code = err.status == dnnl_status_t::dnnl_out_of_memory ? CL_OUT_OF_RESOURCES : CL_INVALID_OPERATION;
-                ocl::rethrow_or_exit(err.what(), err_code, _engine->get_device_info());
+                ocl::rethrow(err.what(), err_code, _engine->get_device_info());
             }
 
             if (_enable_profiling) {
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -223,6 +223,11 @@ network::network(program::ptr program, stream::ptr stream, uint16_t stream_id)
 network::~network() {
     if (_program != nullptr)
         _program->cancel_compilation_context();
+
+    // Wait for all commands still queued on the stream to complete before the memory pool is cleared, to prevent errors caused by remaining tasks.
+    if (_stream != nullptr)
+        _stream->finish();
+
     _memory_pool->clear_pool_for_network(net_id);
     std::string dump_path = GPU_DEBUG_VALUE_OR(get_config().get_dump_profiling_data_path(), "");
     GPU_DEBUG_IF(!dump_path.empty()) {
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_common.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_common.hpp
@@ -22,18 +22,6 @@ class ocl_error : public ov::Exception {
 
 #define OCL_ERR_MSG_FMT(error) ("[GPU] " + std::string(error.what()) + std::string(", error code: ") + std::to_string(error.err()))
 
-
-/// WA: Force exit. Any opencl api call can be hang after CL_OUT_OF_RESOURCES.
-inline void force_exit() {
-    std::cerr << "[GPU] force exit.\n"
-              << "\tDue to the driver bug any subsequent OpenCL API call will cause application hang, "
-              << "so GPU plugin can't finish correctly.\n"
-              << "\tPlease try to update the driver or reduce memory consumption "
-              << "(use smaller batch size, less streams, lower precision, etc)"
-              << "to avoid CL_OUT_OF_RESOURCES exception" << std::endl;
-    std::_Exit(-1);
-}
-
 inline bool is_device_available(const device_info& info) {
     ocl_device_detector detector;
     auto devices = detector.get_available_devices(nullptr, nullptr);
@@ -46,22 +34,32 @@ inline bool is_device_available(const device_info& info) {
     return false;
 }
 
-inline void rethrow_or_exit(std::string message, cl_int error, const device_info& info) {
+inline void rethrow(std::string message, cl_int error, const device_info& info) {
     if (error != CL_OUT_OF_RESOURCES) {
         OPENVINO_THROW(message);
     }
     // For CL_OUT_OF_RESOURCES exception there are 2 possible cases:
-    // 1. Real out of resource which means that plugin must exit
+    // 1. A genuine out-of-resources condition (e.g. an actual lack of memory for the current inference)
     // 2. Device is lost during application run, plugin may throw an exception
     if (is_device_available(info)) {
-        force_exit();
+        std::stringstream ss;
+        ss << "[GPU] CL_OUT_OF_RESOURCES exception.\n"
+           << "\tDue to a driver bug, any subsequent OpenCL API call may cause the application to hang, "
+           << "so the GPU plugin may be unable to finish correctly.\n"
+           << "\tThe CL_OUT_OF_RESOURCES error typically occurs in two cases:\n"
+           << "\t1. An actual lack of memory for the current inference.\n"
+           << "\t2. An out-of-bounds access to GPU memory from a kernel.\n"
+           << "\tFor case 1, you may try adjusting some model parameters (e.g., using a smaller batch size, lower inference precision, fewer streams, etc.)"
+           << " to reduce the required memory size. For case 2, please submit a bug report to the OpenVINO team.\n"
+           << "\tAdditionally, please try updating the driver to the latest version.\n";
+        OPENVINO_THROW(ss.str());
     } else {
         OPENVINO_THROW(message);
     }
 }
 
-inline void rethrow_or_exit(const cl::Error& error, const device_info& info) {
-    rethrow_or_exit(OCL_ERR_MSG_FMT(error), error.err(), info);
+inline void rethrow(const cl::Error& error, const device_info& info) {
+    rethrow(OCL_ERR_MSG_FMT(error), error.err(), info);
 }
 
 }  // namespace ocl
diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp
@@ -289,7 +289,7 @@ event::ptr ocl_stream::enqueue_kernel(kernel& kernel,
     try {
         _command_queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local, dep_events_ptr, set_output_event ? &ret_ev : nullptr);
     } catch (cl::Error const& err) {
-        ocl::rethrow_or_exit(err, _engine.get_device_info());
+        ocl::rethrow(err, _engine.get_device_info());
     }
 
     return std::make_shared<ocl_event>(ret_ev, ++_queue_counter);

Original file line number	Diff line number	Diff line change
`@@ -543,7 +543,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {`
`543`	`543`	`_prim.execute(stream.get_onednn_stream(), _args[net_id]);`
`544`	`544`	`} catch (dnnl::error& err) {`
`545`	`545`	`auto err_code = err.status == dnnl_status_t::dnnl_out_of_memory ? CL_OUT_OF_RESOURCES : CL_INVALID_OPERATION;`
`546`		`- ocl::rethrow_or_exit(err.what(), err_code, _engine->get_device_info());`
	`546`	`+ ocl::rethrow(err.what(), err_code, _engine->get_device_info());`
`547`	`547`	`}`
`548`	`548`
`549`	`549`	`if (_enable_profiling) {`
Original file line number	Diff line number	Diff line change
`@@ -289,7 +289,7 @@ event::ptr ocl_stream::enqueue_kernel(kernel& kernel,`
`289`	`289`	`try {`
`290`	`290`	`_command_queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local, dep_events_ptr, set_output_event ? &ret_ev : nullptr);`
`291`	`291`	`} catch (cl::Error const& err) {`
`292`		`- ocl::rethrow_or_exit(err, _engine.get_device_info());`
	`292`	`+ ocl::rethrow(err, _engine.get_device_info());`
`293`	`293`	`}`
`294`	`294`
`295`	`295`	`return std::make_shared<ocl_event>(ret_ev, ++_queue_counter);`