Skip to content

Commit ea4ba84

Browse files
[GPU] Fix accuracy issue at gemm_tiled_opt kernel with dynamic padding
1 parent f6d4b40 commit ea4ba84

File tree

3 files changed

+187
-2
lines changed

3 files changed

+187
-2
lines changed

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl

+2-2
Original file line numberDiff line numberDiff line change
@@ -200,14 +200,14 @@ KERNEL(gemm_tiled_opt)(
200200
#endif // TRANSPOSE_INPUT0
201201
#if TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST
202202
const __global INPUT1_TYPE* b_ptr = input1 + batch_offset_input1;
203-
#if HAS_DYNAMIC_K_PADDING || INPUT1_HAS_PADDING
203+
#if HAS_DYNAMIC_N_PADDING || INPUT1_HAS_PADDING
204204
const uint input1_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, 1, tile_n_offset) - batch_offset_input1;
205205
#else
206206
const uint input1_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR 0, 0, 0, 0, 1, 0);
207207
#endif
208208
#elif TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST
209209
const __global INPUT1_TYPE* b_ptr = input1 + batch_offset_input1;
210-
#if HAS_DYNAMIC_K_PADDING || INPUT1_HAS_PADDING
210+
#if HAS_DYNAMIC_N_PADDING || INPUT1_HAS_PADDING
211211
const uint input1_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, 0, (tile_n_offset + 1)) - batch_offset_input1;
212212
const uint input1_offset1 = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (TILE_K), tile_n_offset) - batch_offset_input1;
213213
#else

src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,19 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
213213
jit.AddConstant(MakeJitConstant("HAS_DYNAMIC_K_PADDING", 1));
214214
if (has_dynamic_n_padding)
215215
jit.AddConstant(MakeJitConstant("HAS_DYNAMIC_N_PADDING", 1));
216+
217+
auto hasDynamicPad = [](DataTensor dt) -> bool {
218+
auto dims = dt.GetDims();
219+
for (auto d : dims) {
220+
if (d.pad.is_dynamic)
221+
return true;
222+
}
223+
return false;
224+
};
225+
if (hasDynamicPad(params.inputs[0]))
226+
jit.AddConstant(MakeJitConstant("INPUT0_HAS_PADDING", 1));
227+
if (hasDynamicPad(params.inputs[1]))
228+
jit.AddConstant(MakeJitConstant("INPUT1_HAS_PADDING", 1));
216229
} else {
217230
auto get_transposed_dim_size = [](const kernel_selector::DataTensor &data_tensor,
218231
const std::vector<int64_t>& dims_order, const std::string dim) {

src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp

+172
Original file line numberDiff line numberDiff line change
@@ -522,6 +522,168 @@ class gemm_gpu_tests: public ::testing::Test {
522522
}
523523
}
524524

525+
void test_dynamic_padding_w_transpose_order(bool is_caching_test, bool n_dim_only) {
526+
tests::random_generator rg;
527+
rg.set_seed(GET_SUITE_NAME);
528+
529+
auto& engine = get_test_engine();
530+
531+
const unsigned long BATCH_SIZE = 128;
532+
const unsigned long M_SIZE = 12;
533+
const unsigned long K_SIZE = 64;
534+
const unsigned long N_SIZE = 12;
535+
536+
auto fill_mem = [&](cldnn::memory_ptr mem, std::vector<ov::float16>& data) {
537+
cldnn::mem_lock<ov::float16> mem_ptr(mem, get_test_stream());
538+
auto&& l = mem->get_layout();
539+
auto data_idx = 0;
540+
for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) {
541+
for (cldnn::tensor::value_type f = 0; f < l.feature(); ++f) {
542+
for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) {
543+
for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) {
544+
auto tensor_coord = cldnn::tensor{{b, f, x, y}, 0};
545+
auto buffer_idx = l.get_linear_offset(tensor_coord);
546+
mem_ptr[buffer_idx] = data[data_idx++];
547+
}
548+
}
549+
}
550+
}
551+
};
552+
553+
const auto align_size_m = 13;
554+
const auto align_size_k = 16;
555+
const auto align_size_n = 15;
556+
const auto align_size_b1 = 3;
557+
const auto align_size_b2 = 19;
558+
559+
const auto aligned_batch1_size = align_to(1ul, align_size_b1);
560+
auto padding_size_batch1 = static_cast<int>(aligned_batch1_size - 1);
561+
562+
const auto aligned_batch2_size = align_to(BATCH_SIZE, align_size_b2);
563+
auto padding_size_batch2 = static_cast<int>(aligned_batch2_size - BATCH_SIZE);
564+
565+
const auto aligned_m_size = align_to(M_SIZE, align_size_m);
566+
auto padding_size_m = static_cast<int>(aligned_m_size - M_SIZE);
567+
const auto aligned_k_size = align_to(K_SIZE, align_size_k);
568+
auto padding_size_k = static_cast<int>(aligned_k_size - K_SIZE);
569+
const auto aligned_n_size = align_to(N_SIZE, align_size_n);
570+
auto padding_size_n = static_cast<int>(aligned_n_size - N_SIZE);
571+
572+
ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE };
573+
ov::Shape in2_shape = { 1, BATCH_SIZE, N_SIZE, K_SIZE };
574+
ov::Shape in1_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_m_size, aligned_k_size };
575+
ov::Shape in2_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_n_size, aligned_k_size };
576+
577+
// Use dynamic padding for all BFYX dimensions
578+
tensor dyn_pad_dims_input1({0, 0, 0, 0}, 0);
579+
tensor dyn_pad_dims_input2({0, 0, 0, 0}, 0);
580+
581+
if (n_dim_only) {
582+
dyn_pad_dims_input1 = tensor({0, 0, 0, 0}, 0);
583+
dyn_pad_dims_input2 = tensor({0, 0, 1, 0}, 0);
584+
} else {
585+
dyn_pad_dims_input1 = tensor({1, 1, 1, 1}, 0);
586+
dyn_pad_dims_input2 = tensor({1, 1, 1, 1}, 0);
587+
}
588+
589+
auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input1)};
590+
auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input2)};
591+
592+
auto aligned_input1_mem = engine.allocate_memory({ov::PartialShape(in1_shape_aligned), data_types::f16, format::bfyx});
593+
auto aligned_input2_mem = engine.allocate_memory({ov::PartialShape(in2_shape_aligned), data_types::f16, format::bfyx});
594+
595+
auto input1_mem = engine.reinterpret_buffer(*aligned_input1_mem, layout{ov::PartialShape(in1_shape),
596+
data_types::f16,
597+
format::bfyx,
598+
n_dim_only ? padding({0, 0, 0, 0 }, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input1) :
599+
padding({padding_size_batch1, 0, 0, 0}, {0, padding_size_batch2, padding_size_m, padding_size_k}, 0.0f, dyn_pad_dims_input1)});
600+
601+
auto input2_mem = engine.reinterpret_buffer(*aligned_input2_mem, layout{ov::PartialShape(in2_shape),
602+
data_types::f16,
603+
format::bfyx,
604+
n_dim_only ? padding({0, 0, 0, 0}, {0, 0, padding_size_n, 0}, 0.0f, dyn_pad_dims_input2) :
605+
padding({0, padding_size_batch2, 0, 0}, {padding_size_batch1, 0, padding_size_n, padding_size_k }, 0.0f, dyn_pad_dims_input2)});
606+
607+
auto input_1_data = rg.generate_random_1d<ov::float16>(ov::shape_size(in1_shape), -2, 2);
608+
auto input_2_data = rg.generate_random_1d<ov::float16>(ov::shape_size(in2_shape), -2, 2);
609+
610+
fill_mem(input1_mem, input_1_data);
611+
fill_mem(input2_mem, input_2_data);
612+
613+
auto get_ref_results = [&]() {
614+
ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE };
615+
ov::Shape in2_shape = { 1, BATCH_SIZE, N_SIZE, K_SIZE };
616+
auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx};
617+
auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx};
618+
619+
auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f16, format::bfyx});
620+
auto input2_mem = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f16, format::bfyx});
621+
622+
fill_mem(input1_mem, input_1_data);
623+
fill_mem(input2_mem, input_2_data);
624+
625+
topology topology;
626+
topology.add(input_layout("input1", in1_layout),
627+
input_layout("input2", in2_layout),
628+
gemm("gemm_ref", { input_info("input1"), input_info("input2") }, data_types::f16,
629+
{0, 2, 1, 3}, {0, 2, 3, 1}, {0, 1, 2, 3})
630+
);
631+
632+
auto config = get_test_default_config(engine);
633+
config.set_property(ov::intel_gpu::optimize_data(true));
634+
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
635+
636+
network network(engine, topology, config);
637+
network.set_input_data("input1", input1_mem);
638+
network.set_input_data("input2", input2_mem);
639+
640+
auto outputs = network.execute();
641+
OPENVINO_ASSERT(outputs.size() == 1);
642+
OPENVINO_ASSERT(outputs.begin()->first == "gemm_ref");
643+
644+
auto inst = network.get_primitive("gemm_ref");
645+
646+
auto output_mem = outputs.at("gemm_ref").get_memory();
647+
auto output_layout = outputs.at("gemm_ref").get_layout();
648+
649+
return engine.reinterpret_buffer(*output_mem, output_layout);
650+
};
651+
652+
topology topology;
653+
topology.add(input_layout("input1", in1_layout),
654+
input_layout("input2", in2_layout),
655+
gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f16,
656+
{0, 2, 1, 3}, {0, 2, 3, 1}, {0, 1, 2, 3})
657+
);
658+
659+
ExecutionConfig config = get_test_default_config(engine);
660+
config.set_property(ov::intel_gpu::optimize_data(true));
661+
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
662+
network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
663+
network->set_input_data("input1", input1_mem);
664+
network->set_input_data("input2", input2_mem);
665+
666+
auto inst = network->get_primitive("gemm");
667+
auto impl = inst->get_impl();
668+
ASSERT_TRUE(impl != nullptr);
669+
ASSERT_TRUE(impl->is_dynamic());
670+
671+
auto outputs = network->execute();
672+
673+
auto output_mem = outputs.at("gemm").get_memory();
674+
auto output_layout = outputs.at("gemm").get_layout();
675+
676+
auto res = engine.reinterpret_buffer(*output_mem, output_layout);
677+
678+
auto ref_res = get_ref_results();
679+
680+
mem_lock<ov::float16> res_lock(res, get_test_stream());
681+
mem_lock<ov::float16> res_ref_lock(ref_res, get_test_stream());
682+
for (size_t i = 0; i < res->count(); i++) {
683+
ASSERT_EQ(res_lock[i], res_ref_lock[i]) << i;
684+
}
685+
}
686+
525687
void test_dynamic_multi_inference_same_shape(bool is_caching_test) {
526688
auto& engine = get_test_engine();
527689

@@ -1433,6 +1595,16 @@ TEST_F(gemm_gpu_tests, dynamic_padding_n_dim_only) {
14331595
this->test_dynamic_padding(false, true);
14341596
}
14351597

1598+
#ifndef ENABLE_ONEDNN_FOR_GPU
1599+
// The two transpose-order tests below are compiled only when ENABLE_ONEDNN_FOR_GPU is
// not defined, because onednn does not support format_tag::cbda, format_tag::badc.
1600+
TEST_F(gemm_gpu_tests, dynamic_padding_w_transpose_order_all_dim) {
1601+
this->test_dynamic_padding_w_transpose_order(false, false);
1602+
}
1603+
1604+
TEST_F(gemm_gpu_tests, dynamic_padding_w_transpose_order_n_dim_only) {
1605+
this->test_dynamic_padding_w_transpose_order(false, true);
1606+
}
1607+
#endif
14361608

14371609
TEST_F(gemm_gpu_tests, dynamic_multi_inference_same_shape) {
14381610
this->test_dynamic_multi_inference_same_shape(false);

0 commit comments

Comments
 (0)