Commit 077f59f

[GPU] Fix dynamic_n_padding typo error
1 parent 3692cf8 commit 077f59f

2 files changed (+173 -2)

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl (+2 -2)
@@ -200,14 +200,14 @@ KERNEL(gemm_tiled_opt)(
 #endif // TRANSPOSE_INPUT0
 #if TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST
     const __global INPUT1_TYPE* b_ptr = input1 + batch_offset_input1;
-#if HAS_DYNAMIC_K_PADDING || INPUT1_HAS_PADDING
+#if HAS_DYNAMIC_N_PADDING || INPUT1_HAS_PADDING
     const uint input1_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, 1, tile_n_offset) - batch_offset_input1;
 #else
     const uint input1_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR 0, 0, 0, 0, 1, 0);
 #endif
 #elif TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST
     const __global INPUT1_TYPE* b_ptr = input1 + batch_offset_input1;
-#if HAS_DYNAMIC_K_PADDING || INPUT1_HAS_PADDING
+#if HAS_DYNAMIC_N_PADDING || INPUT1_HAS_PADDING
     const uint input1_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, 0, (tile_n_offset + 1)) - batch_offset_input1;
     const uint input1_offset1 = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (TILE_K), tile_n_offset) - batch_offset_input1;
 #else
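This guard selects whether input1 offsets are recomputed from the runtime shape info via get_input1_index or taken from a compile-time constant. Both TRANSPOSE_INPUT1 branches previously keyed the decision on HAS_DYNAMIC_K_PADDING, so a buffer whose N dimension carried dynamic padding could fall into the constant-offset path and be read with the wrong stride. A minimal, self-contained C++ sketch of that failure mode (the helper and the sizes are illustrative, not the kernel's actual code):

#include <cassert>
#include <cstddef>

// Illustrative stand-in for what get_input1_index computes: the linear
// offset of element (k, n) in a row-major K x N buffer whose rows are
// padded out to padded_n elements.
static size_t linear_offset(size_t k, size_t n, size_t padded_n) {
    return k * padded_n + n;
}

int main() {
    const size_t N = 12;       // logical N size (as in the new test)
    const size_t pad_n = 3;    // runtime padding on N, unknown at compile time
    const size_t padded_n = N + pad_n;

    // With dynamic N padding the row stride is padded_n, so offsets must be
    // recomputed from the runtime shape info:
    assert(linear_offset(1, 0, padded_n) == 15);

    // Guarding on the K-padding macro instead let this branch fall back to
    // the unpadded stride and land on the wrong element:
    assert(linear_offset(1, 0, N) == 12);  // 12 != 15 -> wrong data read
    return 0;
}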

src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp (+171 -0)
@@ -522,6 +522,170 @@ class gemm_gpu_tests: public ::testing::Test {
         }
     }
 
+    void test_dynamic_padding_w_fused_transpose(bool is_caching_test, bool n_dim_only) {
+        tests::random_generator rg;
+        rg.set_seed(GET_SUITE_NAME);
+
+        auto& engine = get_test_engine();
+
+        const unsigned long BATCH_SIZE = 128;
+        const unsigned long M_SIZE = 12;
+        const unsigned long K_SIZE = 64;
+        const unsigned long N_SIZE = 12;
+
+        auto fill_mem = [&](cldnn::memory_ptr mem, std::vector<ov::float16>& data) {
+            cldnn::mem_lock<ov::float16> mem_ptr(mem, get_test_stream());
+            auto&& l = mem->get_layout();
+            auto data_idx = 0;
+            for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) {
+                for (cldnn::tensor::value_type f = 0; f < l.feature(); ++f) {
+                    for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) {
+                        for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) {
+                            auto tensor_coord = cldnn::tensor{{b, f, x, y}, 0};
+                            auto buffer_idx = l.get_linear_offset(tensor_coord);
+                            mem_ptr[buffer_idx] = data[data_idx++];
+                        }
+                    }
+                }
+            }
+        };
+
+        const auto align_size_m = 13;
+        const auto align_size_k = 16;
+        const auto align_size_n = 15;
+        const auto align_size_b1 = 3;
+        const auto align_size_b2 = 19;
+
+        const auto aligned_batch1_size = align_to(1ul, align_size_b1);
+        auto padding_size_batch1 = static_cast<int>(aligned_batch1_size - 1);
+
+        const auto aligned_batch2_size = align_to(BATCH_SIZE, align_size_b2);
+        auto padding_size_batch2 = static_cast<int>(aligned_batch2_size - BATCH_SIZE);
+
+        const auto aligned_m_size = align_to(M_SIZE, align_size_m);
+        auto padding_size_m = static_cast<int>(aligned_m_size - M_SIZE);
+        const auto aligned_k_size = align_to(K_SIZE, align_size_k);
+        auto padding_size_k = static_cast<int>(aligned_k_size - K_SIZE);
+        const auto aligned_n_size = align_to(N_SIZE, align_size_n);
+        auto padding_size_n = static_cast<int>(aligned_n_size - N_SIZE);
+
+        ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE };
+        ov::Shape in2_shape = { 1, BATCH_SIZE, N_SIZE, K_SIZE };
+        ov::Shape in1_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_m_size, aligned_k_size };
+        ov::Shape in2_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_n_size, aligned_k_size };
+
+        // Use dynamic padding either for all BFYX dimensions or for the N dimension only
+        tensor dyn_pad_dims_input1({0, 0, 0, 0}, 0);
+        tensor dyn_pad_dims_input2({0, 0, 0, 0}, 0);
+
+        if (n_dim_only) {
+            dyn_pad_dims_input1 = tensor({0, 0, 0, 1}, 0);
+            dyn_pad_dims_input2 = tensor({0, 0, 0, 1}, 0);
+        } else {
+            dyn_pad_dims_input1 = tensor({1, 1, 1, 1}, 0);
+            dyn_pad_dims_input2 = tensor({1, 1, 1, 1}, 0);
+        }
+
+        auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input1)};
+        auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input2)};
+
+        auto aligned_input1_mem = engine.allocate_memory({ov::PartialShape(in1_shape_aligned), data_types::f16, format::bfyx});
+        auto aligned_input2_mem = engine.allocate_memory({ov::PartialShape(in2_shape_aligned), data_types::f16, format::bfyx});
+
+        auto input1_mem = engine.reinterpret_buffer(*aligned_input1_mem, layout{ov::PartialShape(in1_shape),
+                                                                                data_types::f16,
+                                                                                format::bfyx,
+                                                                                n_dim_only ? padding({0, 0, padding_size_m, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input1) :
+                                                                                             padding({padding_size_batch1, 0, 0, 0}, {0, padding_size_batch2, padding_size_k, padding_size_m}, 0.0f, dyn_pad_dims_input1)});
+
+        auto input2_mem = engine.reinterpret_buffer(*aligned_input2_mem, layout{ov::PartialShape(in2_shape),
+                                                                                data_types::f16,
+                                                                                format::bfyx,
+                                                                                n_dim_only ? padding({0, 0, padding_size_n, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input2) :
+                                                                                             padding({0, padding_size_batch2, 0, 0}, {padding_size_batch1, 0, padding_size_k, padding_size_n}, 0.0f, dyn_pad_dims_input2)});
+
+        auto input_1_data = rg.generate_random_1d<ov::float16>(ov::shape_size(in1_shape), -2, 2);
+        auto input_2_data = rg.generate_random_1d<ov::float16>(ov::shape_size(in2_shape), -2, 2);
+
+        fill_mem(input1_mem, input_1_data);
+        fill_mem(input2_mem, input_2_data);
+
+        auto get_ref_results = [&]() {
+            ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE };
+            ov::Shape in2_shape = { 1, BATCH_SIZE, N_SIZE, K_SIZE };
+            auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx};
+            auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx};
+
+            auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f16, format::bfyx});
+            auto input2_mem = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f16, format::bfyx});
+
+            fill_mem(input1_mem, input_1_data);
+            fill_mem(input2_mem, input_2_data);
+
+            topology topology;
+            topology.add(input_layout("input1", in1_layout),
+                         input_layout("input2", in2_layout),
+                         //gemm("gemm_ref", { input_info("input1"), input_info("input2") }, data_types::f16, false, false, 1.0f, 0.0f, 4, 4)
+                         gemm("gemm_ref", { input_info("input1"), input_info("input2") }, data_types::f16,
+                              {0, 2, 1, 3}, {0, 2, 3, 1}, {0, 1, 2, 3})
+            );
+
+            auto config = get_test_default_config(engine);
+            config.set_property(ov::intel_gpu::optimize_data(true));
+            config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+            network network(engine, topology, config);
+            network.set_input_data("input1", input1_mem);
+            network.set_input_data("input2", input2_mem);
+
+            auto outputs = network.execute();
+            OPENVINO_ASSERT(outputs.size() == 1);
+            OPENVINO_ASSERT(outputs.begin()->first == "gemm_ref");
+
+            auto inst = network.get_primitive("gemm_ref");
+
+            auto output_mem = outputs.at("gemm_ref").get_memory();
+            auto output_layout = outputs.at("gemm_ref").get_layout();
+
+            return engine.reinterpret_buffer(*output_mem, output_layout);
+        };
+
+        topology topology;
+        topology.add(input_layout("input1", in1_layout),
+                     input_layout("input2", in2_layout),
+                     //gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f16, false, false, 1.0f, 0.0f, 4, 4)
+                     gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f16,
+                          {0, 2, 1, 3}, {0, 2, 3, 1}, {0, 1, 2, 3})
+        );
+
+        ExecutionConfig config = get_test_default_config(engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
+        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+        network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
+        network->set_input_data("input1", input1_mem);
+        network->set_input_data("input2", input2_mem);
+
+        auto inst = network->get_primitive("gemm");
+        auto impl = inst->get_impl();
+        ASSERT_TRUE(impl != nullptr);
+        ASSERT_TRUE(impl->is_dynamic());
+
+        auto outputs = network->execute();
+
+        auto output_mem = outputs.at("gemm").get_memory();
+        auto output_layout = outputs.at("gemm").get_layout();
+
+        auto res = engine.reinterpret_buffer(*output_mem, output_layout);
+
+        auto ref_res = get_ref_results();
+
+        mem_lock<ov::float16> res_lock(res, get_test_stream());
+        mem_lock<ov::float16> res_ref_lock(ref_res, get_test_stream());
+        for (size_t i = 0; i < res->count(); i++) {
+            ASSERT_EQ(res_lock[i], res_ref_lock[i]) << i;
+        }
+    }
+
     void test_dynamic_multi_inference_same_shape(bool is_caching_test) {
         auto& engine = get_test_engine();
 
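The test pads every dimension to a deliberately awkward multiple, reinterprets the aligned buffers with the logical shape plus dynamic padding, and checks the padded run against an unpadded reference. A small sketch of the round-up rule the align_to calls rely on (align_to itself lives in the test utilities; this stand-in assumes it rounds up to the nearest multiple, which matches how the padding sizes are derived above):

#include <cassert>

// Assumed behavior of align_to: round v up to the nearest multiple of a.
template <typename T>
static T align_to_sketch(T v, T a) {
    return (v + a - 1) / a * a;
}

int main() {
    // Mirrors the test's values: BATCH_SIZE = 128 aligned to 19 gives 133,
    // so padding_size_batch2 == 133 - 128 == 5.
    assert(align_to_sketch(128ul, 19ul) == 133ul);
    // N_SIZE = 12 aligned to 15 gives 15, so padding_size_n == 3.
    assert(align_to_sketch(12ul, 15ul) == 15ul);
    return 0;
}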

@@ -1433,6 +1597,13 @@ TEST_F(gemm_gpu_tests, dynamic_padding_n_dim_only) {
     this->test_dynamic_padding(false, true);
 }
 
+TEST_F(gemm_gpu_tests, dynamic_padding_w_fused_transpose_all_dim) {
+    this->test_dynamic_padding_w_fused_transpose(false, false);
+}
+
+TEST_F(gemm_gpu_tests, dynamic_padding_w_fused_transpose_n_dim_only) {
+    this->test_dynamic_padding_w_fused_transpose(false, true);
+}
 
 TEST_F(gemm_gpu_tests, dynamic_multi_inference_same_shape) {
     this->test_dynamic_multi_inference_same_shape(false);
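Both new cases run the gemm with fused transpose orders ({0, 2, 1, 3} for input0, {0, 2, 3, 1} for input1, identity for the output); the helper's second argument chooses dynamic padding on the N dimension only versus all dimensions. As a sketch of how such an order remaps axes (assuming the common convention that order[i] names the input axis feeding output axis i; the diff itself does not spell out cldnn::gemm's convention):

#include <array>
#include <cassert>
#include <cstddef>

// Apply a 4-D transpose order under the assumed convention above.
static std::array<size_t, 4> permute(const std::array<size_t, 4>& shape,
                                     const std::array<size_t, 4>& order) {
    std::array<size_t, 4> out{};
    for (size_t i = 0; i < 4; ++i)
        out[i] = shape[order[i]];
    return out;
}

int main() {
    // input1 in the test has logical shape {1, BATCH_SIZE, N_SIZE, K_SIZE}.
    std::array<size_t, 4> in1{1, 128, 12, 64};
    auto t = permute(in1, {0, 2, 3, 1});  // the test's input1 order
    assert((t == std::array<size_t, 4>{1, 12, 64, 128}));
    return 0;
}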
