Skip to content

Commit a7503f3

Browse files
[GPU] Fix gemm_tiled_opt kernel to support B_VEC_SIZE = 2 for static shapes
1 parent ffc135c commit a7503f3

File tree

3 files changed

+26
-3
lines changed

3 files changed

+26
-3
lines changed

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl

+12-2
Original file line numberDiff line numberDiff line change
@@ -786,16 +786,26 @@ KERNEL(gemm_tiled_opt)(
786786
ACCUMULATOR_TYPE_VEC dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id];
787787
#endif // BIAS_TERM
788788

789+
#if TRANSPOSE_OUTPUT == TRANSPOSE_X_LAST
790+
const uint x_pitch = 1;
791+
#else
792+
const uint x_pitch = output_x_pitch;
793+
#endif
794+
789795
#if HAS_FUSED_OPS
790796
#if FUSED_OPS_CAN_USE_PRELOAD
791797
FUSED_OPS_CALC_VEC;
792798
#else // FUSED_OPS_CAN_USE_PRELOAD
793799
FUSED_OPS_VEC;
794800
#endif // FUSED_OPS_CAN_USE_PRELOAD
795801
OUTPUT_TYPE_VEC res = FUSED_OPS_RESULT_VEC;
796-
BLOCK_WRITE_C(d_ptr, 0, res);
802+
unroll_for (uint n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
803+
BLOCK_WRITEN(OUTPUT_TYPE, 1, d_ptr, SIMD_WIDTH * n_elem * output_x_pitch, res[n_elem]);
804+
}
797805
#else // HAS_FUSED_OPS
798-
BLOCK_WRITE_C(d_ptr, 0, dequantized);
806+
unroll_for (uint n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
807+
BLOCK_WRITEN(OUTPUT_TYPE, 1, d_ptr, SIMD_WIDTH * n_elem * output_x_pitch, dequantized[n_elem]);
808+
}
799809
#endif // HAS_FUSED_OPS
800810
#endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
801811
#endif // IS_DYNAMIC

src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,8 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
9696
tuning_data.tile_m_size = tuning_data.simd_size;
9797
}
9898
// Increasing tile_n_size has performance improvement when m_size and n_size are not shallow and n_size is aligned at 32.
99-
if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 && params.fused_ops.empty())
99+
if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 &&
100+
(k_size % tuning_data.tile_k_size == 0) && params.fused_ops.empty())
100101
tuning_data.tile_n_size = 32;
101102

102103
GPU_DEBUG_LOG << params.layerID << ": m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;

src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp

+12
Original file line numberDiff line numberDiff line change
@@ -1487,10 +1487,22 @@ TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f16) {
14871487
this->test_transpose_matmul_f16(3, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
14881488
}
14891489

1490+
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f16_n32) {
1491+
this->test_transpose_matmul_f16(3, false, false, /*BMKN*/{3, 256, 32, 128}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
1492+
}
1493+
14901494
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f32) {
14911495
this->test_transpose_matmul_f32(3, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
14921496
}
14931497

1498+
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f32_n32) {
1499+
this->test_transpose_matmul_f32(3, false, false, /*BMKN*/{2, 128, 16, 256}, /*input0_order*/{0, 1, 2}, /*input1_order*/{0, 2, 1});
1500+
}
1501+
1502+
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f32_n32_k_remainder) {
1503+
this->test_transpose_matmul_f32(3, false, false, /*BMKN*/{2, 128, 17, 256}, /*input0_order*/{0, 1, 2}, /*input1_order*/{0, 2, 1});
1504+
}
1505+
14941506
TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f16_unaligned) {
14951507
this->test_transpose_matmul_f16(4, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
14961508
}

0 commit comments

Comments
 (0)