
Commit 6db4939

[GPU] Use usm_device for output buffer for BMG (#28865)

### Details:
- (refer to GSD-10054) Do not use usm_host for large output buffers on BMG
- Perf check done
- After this PR, MiniCPM 2.6V first-token latency is reduced to 1/2

### Tickets:
- CVS-161158

1 parent 84dc4b9 · commit 6db4939

16 files changed: +94 -88 lines

src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp (+24)

```diff
@@ -10,6 +10,8 @@
 #include "intel_gpu/runtime/memory.hpp"

 #include "intel_gpu/runtime/shape_predictor.hpp"
+#include "intel_gpu/runtime/engine.hpp"
+#include "intel_gpu/runtime/device_info.hpp"
 #include "openvino/core/layout.hpp"
 #include "openvino/core/node.hpp"
 #include "openvino/core/type/element_type.hpp"
@@ -31,6 +33,28 @@ enum class TensorType {

 #define TensorValue(val) static_cast<cldnn::tensor::value_type>(val)

+inline bool can_use_usm_host(cldnn::engine& engine, const uint64_t total_output_bytes) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->use_usm_host == 1) { return true; }
+    GPU_DEBUG_IF(debug_config->use_usm_host == 2) { return false; }
+
+    auto can_use_usm = engine.use_unified_shared_memory();
+    // When output size is large, it is better not to write to usm_host directly
+    const uint64_t LARGE_OUTPUT_BYTES_THRESHOLD = 4 * 1048576;
+
+    const auto& device_info = engine.get_device_info();
+    if ((device_info.gfx_ver.major == 12 && device_info.gfx_ver.minor == 60) ||
+        (device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu) ||
+        (device_info.dev_type == cldnn::device_type::discrete_gpu && total_output_bytes > LARGE_OUTPUT_BYTES_THRESHOLD)) {
+        // WA: Disable USM host memory for infer request`s tensors for PVC and subsequent dGPUs, as kernel access
+        // to system memory is slower than using an explicit memcpy (Host <-> Device) call with the copy engine
+        // Driver tickets with additional details: 6155, 10054
+        GPU_DEBUG_TRACE << "Do not use usm_host for performance issue" << std::endl;
+        can_use_usm = false;
+    }
+
+    return can_use_usm;
+}
 inline cldnn::tensor tensor_from_dims(const ov::Shape& dims, int def = 1) {
     switch (dims.size()) {
     case 0: return cldnn::tensor(cldnn::batch(def), cldnn::feature(def), cldnn::spatial(def, def));
```
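A note on the constants above: `LARGE_OUTPUT_BYTES_THRESHOLD` is 4 × 1048576 bytes, i.e. 4 MiB, and per the workaround comment the `gfx_ver` 12.60 check targets PVC while the `major >= 20` discrete-GPU check covers BMG-class and later dGPUs. The standalone sketch below restates the predicate with simplified stand-in types; `DeviceInfo`, `GfxVersion`, and the assumption that USM is supported and no debug override is set are all hypothetical simplifications, not the plugin's real `cldnn::engine` API:

```cpp
#include <cstdint>
#include <iostream>

// Simplified stand-ins for cldnn::device_info / cldnn::device_type.
enum class DeviceType { integrated_gpu, discrete_gpu };
struct GfxVersion { int major; int minor; };
struct DeviceInfo { GfxVersion gfx_ver; DeviceType dev_type; };

// Restatement of can_use_usm_host(), assuming USM is available and no
// debug override is in effect (the real helper checks both first).
bool can_use_usm_host(const DeviceInfo& info, uint64_t total_output_bytes) {
    const uint64_t LARGE_OUTPUT_BYTES_THRESHOLD = 4 * 1048576;  // 4 MiB
    const bool is_pvc = info.gfx_ver.major == 12 && info.gfx_ver.minor == 60;
    const bool is_bmg_or_later =
        info.gfx_ver.major >= 20 && info.dev_type == DeviceType::discrete_gpu;
    const bool large_dgpu_output =
        info.dev_type == DeviceType::discrete_gpu &&
        total_output_bytes > LARGE_OUTPUT_BYTES_THRESHOLD;
    return !(is_pvc || is_bmg_or_later || large_dgpu_output);
}

int main() {
    const DeviceInfo bmg{{20, 1}, DeviceType::discrete_gpu};    // gfx 20.x dGPU (BMG-class)
    const DeviceInfo igpu{{12, 71}, DeviceType::integrated_gpu};
    std::cout << can_use_usm_host(bmg, 1024) << '\n';        // 0: disabled on BMG regardless of size
    std::cout << can_use_usm_host(igpu, 64u << 20) << '\n';  // 1: iGPU keeps usm_host even for 64 MiB
}
```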

src/plugins/intel_gpu/src/graph/primitive_inst.cpp (+8 -4)

```diff
@@ -46,6 +46,7 @@

 #include "intel_gpu/plugin/common_utils.hpp"
 #include "intel_gpu/plugin/multi_tensor_variable_state.hpp"
+#include "intel_gpu/plugin/sync_infer_request.hpp"
 #include "intel_gpu/graph/network.hpp"
 #include "intel_gpu/graph/serialization/set_serializer.hpp"
 #include "intel_gpu/runtime/engine.hpp"
@@ -2376,10 +2377,11 @@ memory::ptr primitive_inst::allocate_output(engine& _engine,
         return a;
     };

+    const auto& device_info = _engine.get_device_info();
     auto layout = out_layout.clone_with_other_shape(out_layout.get_partial_shape().get_max_shape());
     bool usm_device_allocatable = true;
     const auto& total_device_input_mem_size = std::accumulate(impl_params.input_layouts.begin(), impl_params.input_layouts.end(), (uint64_t)0, device_mem_acc);
-    if (total_device_input_mem_size > _engine.get_device_info().max_global_mem_size)
+    if (total_device_input_mem_size > device_info.max_global_mem_size)
         usm_device_allocatable = false;

     bool reusable_across_network = (runtime_alloc && _node.is_dynamic_output_layout())
@@ -2398,11 +2400,13 @@ memory::ptr primitive_inst::allocate_output(engine& _engine,
     // Also if the successor of a node is an cpu, then memory needs to be lockable.
     bool is_cpu = _node.get_selected_impl() ? _node.get_selected_impl()->is_cpu() :
                                               _node.get_preferred_impl_type() == impl_types::cpu;
+
+    auto total_output_bytes = layout.bytes_count();
     auto use_lockable_memory =
-        is_output_buffer || is_cpu ||
-        has_any_cpu_user_not_shape_of(_node.get_users()) ||
+        (is_output_buffer && ov::intel_gpu::can_use_usm_host(_engine, total_output_bytes)) ||
+        is_cpu || has_any_cpu_user_not_shape_of(_node.get_users()) ||
         !_engine.supports_allocation(allocation_type::usm_device) ||
-        (_node.is_shape_infer_dep() && _engine.get_device_info().dev_type == device_type::integrated_gpu);
+        (_node.is_shape_infer_dep() && device_info.dev_type == device_type::integrated_gpu);
     const auto& lockable_mem_type = _engine.get_lockable_preferred_memory_allocation_type(layout.format.is_image_2d());

     auto alloc_type = use_lockable_memory ? lockable_mem_type
```
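The effect of the new clause is easiest to see in isolation. In the sketch below, hypothetical booleans stand in for the real expressions above: a graph output no longer forces lockable host-visible memory by itself, only when the platform heuristic approves usm_host, so a large output on BMG falls through to a usm_device allocation.

```cpp
#include <iostream>

// Condensed model of the use_lockable_memory condition after this commit
// (simplified stand-in arguments, not the plugin's real code).
bool use_lockable(bool is_output_buffer, bool usm_host_ok, bool cpu_needs_access,
                  bool supports_usm_device, bool shape_infer_dep_on_igpu) {
    return (is_output_buffer && usm_host_ok) || cpu_needs_access ||
           !supports_usm_device || shape_infer_dep_on_igpu;
}

int main() {
    // Before the change, is_output_buffer alone forced a lockable buffer.
    std::cout << use_lockable(true, /*usm_host_ok=*/false, false, true, false) << '\n';  // 0 -> usm_device
    std::cout << use_lockable(true, /*usm_host_ok=*/true,  false, true, false) << '\n';  // 1 -> lockable
}
```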

src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp (+1 -23)

```diff
@@ -9,6 +9,7 @@

 #include "intel_gpu/primitives/kv_cache.hpp"
 #include "intel_gpu/primitives/read_value.hpp"
+#include "intel_gpu/plugin/common_utils.hpp"
 #include "intel_gpu/plugin/usm_host_tensor.hpp"
 #include "intel_gpu/plugin/sync_infer_request.hpp"
 #include "intel_gpu/plugin/remote_context.hpp"
@@ -32,29 +33,6 @@

 namespace {

-inline bool can_use_usm_host(const cldnn::engine& engine, const uint64_t total_output_bytes) {
-    GPU_DEBUG_GET_INSTANCE(debug_config);
-    GPU_DEBUG_IF(debug_config->use_usm_host == 1) { return true; }
-    GPU_DEBUG_IF(debug_config->use_usm_host == 2) { return false; }
-
-    auto can_use_usm = engine.use_unified_shared_memory();
-    // When output size is large, it is better not to write to usm_host directly
-    const uint64_t LARGE_OUTPUT_BYTES_THRESHOLD = 4 * 1048576;
-
-    const auto& device_info = engine.get_device_info();
-    if ((device_info.gfx_ver.major == 12 && device_info.gfx_ver.minor == 60) ||
-        (device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu) ||
-        (device_info.dev_type == cldnn::device_type::discrete_gpu && total_output_bytes > LARGE_OUTPUT_BYTES_THRESHOLD)) {
-        // WA: Disable USM host memory for infer request`s tensors for PVC and subsequent dGPUs, as kernel access
-        // to system memory is slower than using an explicit memcpy (Host <-> Device) call with the copy engine
-        // Driver tickets with additional details: 6155, 10054
-        GPU_DEBUG_TRACE << "Do not use usm_host for performance issue" << std::endl;
-        can_use_usm = false;
-    }
-
-    return can_use_usm;
-}
-
 bool is_convert_required(ov::element::Type src_et, ov::element::Type dst_et) {
     return src_et != dst_et && !(dst_et == ov::element::boolean && src_et == ov::element::u8);
 }
```

src/plugins/intel_gpu/tests/unit/shape_infer/broadcast_si_test.cpp (+1 -1)

```diff
@@ -122,7 +122,7 @@ TEST_P(broadcast_test_two_inputs_blocked_format, shape_infer) {

     auto outputs = network.execute();
     auto output = outputs.at("output").get_memory();
-    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+    cldnn::mem_lock<float, mem_lock_type::read> output_ptr(output, get_test_stream());

     ASSERT_EQ(output->get_layout(), p.expected_layout);
 }
```
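The rest of the commit is this same one-line adjustment repeated across the unit tests: output locks become explicitly read-only. A plausible reading of why this matters (assuming `cldnn::mem_lock` defaults to a read-write mapping, which is what the now-explicit second template argument suggests): with outputs now allowed to live in usm_device, a read-write lock would imply a host-to-device write-back when the lock is released, while the tests only ever read the results. The toy model below illustrates the cost difference; it is not cldnn's actual implementation:

```cpp
#include <iostream>
#include <vector>

enum class lock_type { read, read_write };

// Toy stand-in for a device-resident (usm_device-like) buffer.
struct device_buffer {
    std::vector<float> device_data{1.f, 2.f, 3.f};
    int transfers = 0;

    std::vector<float> lock(lock_type) {
        ++transfers;                  // device -> host staging copy, needed for either lock type
        return device_data;
    }
    void unlock(lock_type t, const std::vector<float>& staging) {
        if (t == lock_type::read_write) {
            ++transfers;              // host -> device write-back, skipped for read locks
            device_data = staging;
        }
    }
};

int main() {
    device_buffer buf;
    auto host_view = buf.lock(lock_type::read);  // tests only inspect outputs,
    buf.unlock(lock_type::read, host_view);      // so no write-back is required
    std::cout << "transfers with read lock: " << buf.transfers << '\n';  // 1
}
```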

src/plugins/intel_gpu/tests/unit/test_cases/broadcast_gpu_test.cpp (+1 -1)

```diff
@@ -73,7 +73,7 @@ void start_broadcast_test(format cldnn_format, data_types cldnn_data_type, std::
     auto outputs = network.execute();

     auto output = outputs.at("output").get_memory();
-    cldnn::mem_lock<T> output_ptr(output, get_test_stream());
+    cldnn::mem_lock<T, mem_lock_type::read> output_ptr(output, get_test_stream());

     for (tensor::value_type b = 0; b < output_4d.at(0); ++b) {
         for (tensor::value_type f = 0; f < output_4d.at(1); ++f) {
```

src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp (+6 -6)

```diff
@@ -7927,7 +7927,7 @@ TEST_P(convolution_grouped_gpu, base) {
     auto outputs = network.execute();

     auto out_mem = outputs.at("conv").get_memory();
-    cldnn::mem_lock<float> out_ptr(out_mem, get_test_stream());
+    cldnn::mem_lock<float, mem_lock_type::read> out_ptr(out_mem, get_test_stream());
     auto out_lay = out_mem->get_layout();

     ASSERT_EQ(out_mem->get_layout().format, input_data_format);
@@ -10628,10 +10628,11 @@ TEST_P(conv_dyn_test, convolution_gpu_bfyx_os_iyx_osv16_no_bias) {
     auto output_memory = outputs.at("conv").get_memory();
     ov::intel_gpu::ImplementationDesc conv_impl_ref = { format::bfyx, "convolution_gpu_ref", impl_types::ocl };
     config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv", conv_impl_ref } }));
+
     auto output_memory_ref = calculate_ref(input, weights, config);

-    cldnn::mem_lock<float> output_ptr(output_memory, get_test_stream());
-    cldnn::mem_lock<float> output_ptr_ref(output_memory_ref, get_test_stream());
+    cldnn::mem_lock<float, mem_lock_type::read> output_ptr_ref(output_memory_ref, get_test_stream());
+    cldnn::mem_lock<float, mem_lock_type::read> output_ptr(output_memory, get_test_stream());

     ASSERT_EQ(outputs.at("conv").get_layout(), output_memory_ref->get_layout());
     for (size_t i = 0; i < output_ptr.size(); i++) {
@@ -10657,9 +10658,8 @@ TEST_P(conv_dyn_test, convolution_gpu_bfyx_os_iyx_osv16_no_bias) {

     auto output_memory = outputs.at("conv").get_memory();
     auto output_memory_ref = calculate_ref(input, weights, config);
-
-    cldnn::mem_lock<float> output_ptr(output_memory, get_test_stream());
-    cldnn::mem_lock<float> output_ptr_ref(output_memory_ref, get_test_stream());
+    cldnn::mem_lock<float, mem_lock_type::read> output_ptr_ref(output_memory_ref, get_test_stream());
+    cldnn::mem_lock<float, mem_lock_type::read> output_ptr(output_memory, get_test_stream());

     ASSERT_EQ(outputs.at("conv").get_layout(), output_memory_ref->get_layout());
     for (size_t i = 0; i < output_ptr.size(); i++) {
```

src/plugins/intel_gpu/tests/unit/test_cases/deconvolution_gpu_test.cpp (+1 -1)

```diff
@@ -2704,7 +2704,7 @@ class deconvolution_random_test_base {

     // Compare results
     {
-        cldnn::mem_lock<OutputT> ptr(out_mem, get_test_stream());
+        cldnn::mem_lock<OutputT, mem_lock_type::read> ptr(out_mem, get_test_stream());

         auto b = static_cast<size_t>(out_mem->get_layout().batch());
         auto of = static_cast<size_t>(out_mem->get_layout().feature());
```

src/plugins/intel_gpu/tests/unit/test_cases/depth_to_space_gpu_test.cpp (+3 -3)

```diff
@@ -190,7 +190,7 @@ TEST(depth_to_space_fp32_gpu, d1411_bs2) {
     auto outputs = network.execute();

     auto output = outputs.at("depth_to_space").get_memory();
-    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+    cldnn::mem_lock<float, mem_lock_type::read> output_ptr(output, get_test_stream());

     std::vector<float> expected_results = {
         0.f, 1.f, 2.f, 3.f
@@ -230,7 +230,7 @@ TEST(depth_to_space_fp32_gpu, d112960540_bs2) {
     auto outputs = network_act.execute();

     auto output = outputs.at("depth_to_space").get_memory();
-    cldnn::mem_lock<ov::float16> output_ptr (output, get_test_stream());
+    cldnn::mem_lock<ov::float16, mem_lock_type::read> output_ptr (output, get_test_stream());

     std::vector<uint16_t> perm = { 0,3,4,1,5,2 };

@@ -255,7 +255,7 @@ TEST(depth_to_space_fp32_gpu, d112960540_bs2) {
     auto outputs_ref = network_ref.execute();

     auto output_ref = outputs_ref.at("reshape2").get_memory();
-    cldnn::mem_lock<ov::float16> output_ptr_ref(output_ref, get_test_stream());
+    cldnn::mem_lock<ov::float16, mem_lock_type::read> output_ptr_ref(output_ref, get_test_stream());

     for (size_t i = 0; i < output->get_layout().count(); ++i) {
         ASSERT_EQ(output_ptr_ref[i], output_ptr[i]);
```

src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp (+2 -2)

```diff
@@ -155,8 +155,8 @@ class dynamic_quantization_gpu_tests: public ::testing::Test {
     std::cout << "Outputs number: " << ref_output_buffers.size() << "\n";

     for (size_t i = 0; i < ref_output_buffers.size(); i++) {
-        cldnn::mem_lock<ov::float16> output_ptr(output_buffers[i], get_test_stream());
-        cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_buffers[i], get_test_stream());
+        cldnn::mem_lock<ov::float16, mem_lock_type::read> output_ptr(output_buffers[i], get_test_stream());
+        cldnn::mem_lock<ov::float16, mem_lock_type::read> output_ptr_ref(ref_output_buffers[i], get_test_stream());

         for (size_t i = 0; i < output_ptr_ref.size(); ++i) {
             auto abs_diff = std::abs(output_ptr_ref[i] - output_ptr[i]);
```

src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp (+3 -3)

```diff
@@ -3814,7 +3814,7 @@ TEST(eltwise_gpu_f16, bfyx_and_fs_b_yx_fsv32_output_padding) {

     auto golden_outputs = golden_network.execute();
     auto golden_output = golden_outputs.at("eltwise").get_memory();
-    cldnn::mem_lock<ov::float16> golden_ptr(golden_output, get_test_stream());
+    cldnn::mem_lock<ov::float16, mem_lock_type::read> golden_ptr(golden_output, get_test_stream());
     // GOLDEN BFYX ELTWISE - END
     // MIXED INPUT, FS_B_YX_FSV32 OUTPUT
     topology FS_B_YX_FSV32_OUTPUT_topology;
@@ -3834,7 +3834,7 @@ TEST(eltwise_gpu_f16, bfyx_and_fs_b_yx_fsv32_output_padding) {

     auto FS_B_YX_FSV32_OUTPUT_outputs = FS_B_YX_FSV32_OUTPUT_network.execute();
     auto FS_B_YX_FSV32_OUTPUT_output = FS_B_YX_FSV32_OUTPUT_outputs.at("reorderOutput").get_memory();
-    cldnn::mem_lock<ov::float16> FS_B_YX_FSV32_OUTPUT_ptr(FS_B_YX_FSV32_OUTPUT_output, get_test_stream());
+    cldnn::mem_lock<ov::float16, mem_lock_type::read> FS_B_YX_FSV32_OUTPUT_ptr(FS_B_YX_FSV32_OUTPUT_output, get_test_stream());
     // MIXED INPUT, FS_B_YX_FSV32 OUTPUT - END
     // MIXED INPUT, BYXF OUTPUT
     topology BYXF_OUTPUT_topology;
@@ -3854,7 +3854,7 @@ TEST(eltwise_gpu_f16, bfyx_and_fs_b_yx_fsv32_output_padding) {

     auto BYXF_OUTPUT_outputs = BYXF_OUTPUT_network.execute();
     auto BYXF_OUTPUT_output = BYXF_OUTPUT_outputs.at("reorderOutput").get_memory();
-    cldnn::mem_lock<ov::float16> BYXF_OUTPUT_ptr(BYXF_OUTPUT_output, get_test_stream());
+    cldnn::mem_lock<ov::float16, mem_lock_type::read> BYXF_OUTPUT_ptr(BYXF_OUTPUT_output, get_test_stream());
     // MIXED INPUT, BYXF OUTPUT - END

     ASSERT_EQ(golden_ptr.size(), FS_B_YX_FSV32_OUTPUT_ptr.size());
```

src/plugins/intel_gpu/tests/unit/test_cases/gather_gpu_test.cpp (+2 -2)

```diff
@@ -103,7 +103,7 @@ class gather8_test : public ::testing::TestWithParam<gather8_test_param> {
     reorder_network.set_input_data("input0", input0);
     reorder_network.set_input_data("input1", input1);
     auto reorder_output = reorder_network.execute().at("reorder2").get_memory();
-    cldnn::mem_lock<T_dat> reorder_output_ptr(reorder_output, get_test_stream());
+    cldnn::mem_lock<T_dat, mem_lock_type::read> reorder_output_ptr(reorder_output, get_test_stream());

     topology planar_topo;
     planar_topo.add(input_layout("input0", input0->get_layout()));
@@ -114,7 +114,7 @@ class gather8_test : public ::testing::TestWithParam<gather8_test_param> {
     planar_network.set_input_data("input0", input0);
     planar_network.set_input_data("input1", input1);
     auto planar_output = planar_network.execute().at("gather").get_memory();
-    cldnn::mem_lock<T_dat> planar_output_ptr(planar_output, get_test_stream());
+    cldnn::mem_lock<T_dat, mem_lock_type::read> planar_output_ptr(planar_output, get_test_stream());

     ASSERT_TRUE(
         !memcmp(reorder_output_ptr.data(), planar_output_ptr.data(), get_linear_size(shape_out) * sizeof(T_dat)));
```

src/plugins/intel_gpu/tests/unit/test_cases/group_normalization_gpu_test.cpp (+2 -2)

```diff
@@ -84,7 +84,7 @@ class GroupNormalizationGPUTest : public ::testing::TestWithParam<GroupNormaliza
     network_->set_input_data(bias_primitive_, bias_gpu_mem);
     auto outputs = network_->execute();
     auto output = outputs.at("output").get_memory();
-    cldnn::mem_lock<float> output_gpu_mem(output, get_test_stream());
+    cldnn::mem_lock<float, mem_lock_type::read> output_gpu_mem(output, get_test_stream());

     std::vector<float> reference_output(data_.size());
     ov::reference::group_normalization(data_.data(), scale_.data(), bias_.data(), reference_output.data(),
@@ -236,4 +236,4 @@ TEST(group_normalization, input_bfyx_output_fsv16) {
         ASSERT_NEAR(output_mem_t[i], output_mem_g[i], 0.0001);
     }
 }
-#endif // ENABLE_ONEDNN_FOR_GPU
\ No newline at end of file
+#endif // ENABLE_ONEDNN_FOR_GPU
```
