Skip to content

Commit f57102b

Browse files
[GPU] Fix gemm_tiled_opt kernel to support B_VEC_SIZE = 2 for static shapes
1 parent b9d98cb commit f57102b

File tree

3 files changed

+28
-5
lines changed

3 files changed

+28
-5
lines changed

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl

+12-2
Original file line numberDiff line numberDiff line change
@@ -786,16 +786,26 @@ KERNEL(gemm_tiled_opt)(
786786
ACCUMULATOR_TYPE_VEC dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id];
787787
#endif // BIAS_TERM
788788

789+
#if TRANSPOSE_OUTPUT == TRANSPOSE_X_LAST
790+
const uint x_pitch = 1;
791+
#else
792+
const uint x_pitch = output_x_pitch;
793+
#endif
794+
789795
#if HAS_FUSED_OPS
790796
#if FUSED_OPS_CAN_USE_PRELOAD
791797
FUSED_OPS_CALC_VEC;
792798
#else // FUSED_OPS_CAN_USE_PRELOAD
793799
FUSED_OPS_VEC;
794800
#endif // FUSED_OPS_CAN_USE_PRELOAD
795801
OUTPUT_TYPE_VEC res = FUSED_OPS_RESULT_VEC;
796-
BLOCK_WRITE_C(d_ptr, 0, res);
802+
unroll_for (uint n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
803+
BLOCK_WRITEN(OUTPUT_TYPE, 1, d_ptr, SIMD_WIDTH * n_elem * output_x_pitch, res[n_elem]);
804+
}
797805
#else // HAS_FUSED_OPS
798-
BLOCK_WRITE_C(d_ptr, 0, dequantized);
806+
unroll_for (uint n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
807+
BLOCK_WRITEN(OUTPUT_TYPE, 1, d_ptr, SIMD_WIDTH * n_elem * output_x_pitch, dequantized[n_elem]);
808+
}
799809
#endif // HAS_FUSED_OPS
800810
#endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
801811
#endif // IS_DYNAMIC

src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,9 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
9696
tuning_data.tile_m_size = tuning_data.simd_size;
9797
}
9898
// Increasing tile_n_size has performance improvement when m_size and n_size are not shallow and n_size is aligned at 32.
99-
if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 && params.fused_ops.empty())
99+
// TODO: Support TILE_K_LEFTOVER true case at static shape
100+
if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 &&
101+
(k_size % tuning_data.tile_k_size == 0) && params.fused_ops.empty())
100102
tuning_data.tile_n_size = 32;
101103

102104
GPU_DEBUG_LOG << params.layerID << ": m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;

src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp

+13-2
Original file line numberDiff line numberDiff line change
@@ -1316,7 +1316,7 @@ class gemm_gpu_tests: public ::testing::Test {
13161316
}
13171317
}
13181318

1319-
void test_transpose_matmul_f16(size_t num_dims, bool is_input_dynamic, bool is_caching_test, std::vector<size_t> BMKN, std::vector<int64_t> input0_order, std::vector<int64_t> input1_order) {
1319+
void test_transpose_matmul_f16(size_t num_dims, bool is_input_dynamic, bool is_caching_test, std::vector<size_t> BMKN, std::vector<int64_t> input0_order, std::vector<int64_t> input1_order, const double abs_error = 0.0001) {
13201320
tests::random_generator rg;
13211321
rg.set_seed(GET_SUITE_NAME);
13221322

@@ -1411,7 +1411,6 @@ class gemm_gpu_tests: public ::testing::Test {
14111411

14121412
ASSERT_EQ(output_ptr.size(), ref_out_data.size());
14131413

1414-
const auto abs_error = 0.0001;
14151414
for (uint32_t i = 0; i < ref_out_data.size(); ++i) {
14161415
ASSERT_NEAR(output_ptr[i], ref_out_data[i], abs_error) << "at " << i;
14171416
}
@@ -1487,10 +1486,22 @@ TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f16) {
14871486
this->test_transpose_matmul_f16(3, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
14881487
}
14891488

1489+
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f16_n32) {
1490+
this->test_transpose_matmul_f16(3, false, false, /*BMKN*/{1, 256, 32, 128}, /*input0_order*/{0, 1, 2}, /*input1_order*/{0, 2, 1}, 0.1);
1491+
}
1492+
14901493
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f32) {
14911494
this->test_transpose_matmul_f32(3, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
14921495
}
14931496

1497+
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f32_n32) {
1498+
this->test_transpose_matmul_f32(3, false, false, /*BMKN*/{2, 128, 16, 256}, /*input0_order*/{0, 1, 2}, /*input1_order*/{0, 2, 1});
1499+
}
1500+
1501+
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f32_n32_k_remainder) {
1502+
this->test_transpose_matmul_f32(3, false, false, /*BMKN*/{2, 128, 17, 256}, /*input0_order*/{0, 1, 2}, /*input1_order*/{0, 2, 1});
1503+
}
1504+
14941505
TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f16_unaligned) {
14951506
this->test_transpose_matmul_f16(4, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
14961507
}

0 commit comments

Comments
 (0)