Skip to content

Commit e6b90c3

Browse files
authored
[GPU] Remove force exit (#29062)
### Tickets:
 - *162459*
1 parent 1604ae5 commit e6b90c3

File tree

4 files changed

+22
-19
lines changed

4 files changed

+22
-19
lines changed

src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
543543
_prim.execute(stream.get_onednn_stream(), _args[net_id]);
544544
} catch (dnnl::error& err) {
545545
auto err_code = err.status == dnnl_status_t::dnnl_out_of_memory ? CL_OUT_OF_RESOURCES : CL_INVALID_OPERATION;
546-
ocl::rethrow_or_exit(err.what(), err_code, _engine->get_device_info());
546+
ocl::rethrow(err.what(), err_code, _engine->get_device_info());
547547
}
548548

549549
if (_enable_profiling) {

src/plugins/intel_gpu/src/graph/network.cpp

+5
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,11 @@ network::network(program::ptr program, stream::ptr stream, uint16_t stream_id)
223223
network::~network() {
224224
if (_program != nullptr)
225225
_program->cancel_compilation_context();
226+
227+
// Clear the command queue to prevent errors caused by remaining tasks.
228+
if (_stream != nullptr)
229+
_stream->finish();
230+
226231
_memory_pool->clear_pool_for_network(net_id);
227232
std::string dump_path = GPU_DEBUG_VALUE_OR(get_config().get_dump_profiling_data_path(), "");
228233
GPU_DEBUG_IF(!dump_path.empty()) {

src/plugins/intel_gpu/src/runtime/ocl/ocl_common.hpp

+15-17
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,6 @@ class ocl_error : public ov::Exception {
2222

2323
#define OCL_ERR_MSG_FMT(error) ("[GPU] " + std::string(error.what()) + std::string(", error code: ") + std::to_string(error.err()))
2424

25-
26-
/// WA: Force exit. Any opencl api call can be hang after CL_OUT_OF_RESOURCES.
27-
inline void force_exit() {
28-
std::cerr << "[GPU] force exit.\n"
29-
<< "\tDue to the driver bug any subsequent OpenCL API call will cause application hang, "
30-
<< "so GPU plugin can't finish correctly.\n"
31-
<< "\tPlease try to update the driver or reduce memory consumption "
32-
<< "(use smaller batch size, less streams, lower precision, etc)"
33-
<< "to avoid CL_OUT_OF_RESOURCES exception" << std::endl;
34-
std::_Exit(-1);
35-
}
36-
3725
inline bool is_device_available(const device_info& info) {
3826
ocl_device_detector detector;
3927
auto devices = detector.get_available_devices(nullptr, nullptr);
@@ -46,22 +34,32 @@ inline bool is_device_available(const device_info& info) {
4634
return false;
4735
}
4836

49-
inline void rethrow_or_exit(std::string message, cl_int error, const device_info& info) {
37+
inline void rethrow(std::string message, cl_int error, const device_info& info) {
5038
if (error != CL_OUT_OF_RESOURCES) {
5139
OPENVINO_THROW(message);
5240
}
5341
// For CL_OUT_OF_RESOURCES exception there are 2 possible cases:
54-
// 1. Real out of resource which means that plugin must exit
42+
// 1. Real out of resource
5543
// 2. Device is lost during application run, plugin may throw an exception
5644
if (is_device_available(info)) {
57-
force_exit();
45+
std::stringstream ss;
46+
ss << "[GPU] CL_OUT_OF_RESOURCES exception.\n"
47+
<< "\tDue to a driver bug, any subsequent OpenCL API call may cause the application to hang, "
48+
<< "so the GPU plugin may be unable to finish correctly.\n"
49+
<< "\tThe CL_OUT_OF_RESOURCES error typically occurs in two cases:\n"
50+
<< "\t1. An actual lack of memory for the current inference.\n"
51+
<< "\t2. An out-of-bounds access to GPU memory from a kernel.\n"
52+
<< "\tFor case 1, you may try adjusting some model parameters (e.g., using a smaller batch size, lower inference precision, fewer streams, etc.)"
53+
<< " to reduce the required memory size. For case 2, please submit a bug report to the OpenVINO team.\n"
54+
<< "\tAdditionally, please try updating the driver to the latest version.\n";
55+
OPENVINO_THROW(ss.str());
5856
} else {
5957
OPENVINO_THROW(message);
6058
}
6159
}
6260

63-
inline void rethrow_or_exit(const cl::Error& error, const device_info& info) {
64-
rethrow_or_exit(OCL_ERR_MSG_FMT(error), error.err(), info);
61+
inline void rethrow(const cl::Error& error, const device_info& info) {
62+
rethrow(OCL_ERR_MSG_FMT(error), error.err(), info);
6563
}
6664

6765
} // namespace ocl

src/plugins/intel_gpu/src/runtime/ocl/ocl_stream.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ event::ptr ocl_stream::enqueue_kernel(kernel& kernel,
289289
try {
290290
_command_queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local, dep_events_ptr, set_output_event ? &ret_ev : nullptr);
291291
} catch (cl::Error const& err) {
292-
ocl::rethrow_or_exit(err, _engine.get_device_info());
292+
ocl::rethrow(err, _engine.get_device_info());
293293
}
294294

295295
return std::make_shared<ocl_event>(ret_ev, ++_queue_counter);

0 commit comments

Comments
 (0)