Skip to content

Commit a7503f3

Browse files
[GPU] Fix gemm_tiled_opt kernel to support B_VEC_SIZE = 2 for static shapes
1 parent ffc135c commit a7503f3

File tree

3 files changed

+26
-3
lines changed

3 files changed

+26
-3
lines changed

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl

+12-2
Original file line numberDiff line numberDiff line change
@@ -786,16 +786,26 @@ KERNEL(gemm_tiled_opt)(
786786
ACCUMULATOR_TYPE_VEC dequantized = TO_ACCUMULATOR_TYPE(ALPHA) * c_tile[write_id];
787787
#endif // BIAS_TERM
788788

789+
#if TRANSPOSE_OUTPUT == TRANSPOSE_X_LAST
790+
const uint x_pitch = 1;
791+
#else
792+
const uint x_pitch = output_x_pitch;
793+
#endif
794+
789795
#if HAS_FUSED_OPS
790796
#if FUSED_OPS_CAN_USE_PRELOAD
791797
FUSED_OPS_CALC_VEC;
792798
#else // FUSED_OPS_CAN_USE_PRELOAD
793799
FUSED_OPS_VEC;
794800
#endif // FUSED_OPS_CAN_USE_PRELOAD
795801
OUTPUT_TYPE_VEC res = FUSED_OPS_RESULT_VEC;
796-
BLOCK_WRITE_C(d_ptr, 0, res);
802+
unroll_for (uint n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
803+
BLOCK_WRITEN(OUTPUT_TYPE, 1, d_ptr, SIMD_WIDTH * n_elem * output_x_pitch, res[n_elem]);
804+
}
797805
#else // HAS_FUSED_OPS
798-
BLOCK_WRITE_C(d_ptr, 0, dequantized);
806+
unroll_for (uint n_elem = 0; n_elem < B_VEC_SIZE; ++n_elem) {
807+
BLOCK_WRITEN(OUTPUT_TYPE, 1, d_ptr, SIMD_WIDTH * n_elem * output_x_pitch, dequantized[n_elem]);
808+
}
799809
#endif // HAS_FUSED_OPS
800810
#endif // TILE_N_NOT_DIVISIBLE || B_VEC_SIZE == 1
801811
#endif // IS_DYNAMIC

src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,8 @@ GemmKernelTiledOpt::GemmTuningData GemmKernelTiledOpt::SetTuningParams(const gem
9696
tuning_data.tile_m_size = tuning_data.simd_size;
9797
}
9898
// Increasing tile_n_size has performance improvement when m_size and n_size are not shallow and n_size is aligned at 32.
99-
if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 && params.fused_ops.empty())
99+
if (m_size >= 128 && n_size >= 128 && (n_size % 32 == 0) && tuning_data.simd_size == 16 &&
100+
(k_size % tuning_data.tile_k_size == 0) && params.fused_ops.empty())
100101
tuning_data.tile_n_size = 32;
101102

102103
GPU_DEBUG_LOG << params.layerID << ": m_size: " << m_size << ", n_size: " << n_size << ", k_size: " << k_size << std::endl;

src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp

+12
Original file line numberDiff line numberDiff line change
@@ -1487,10 +1487,22 @@ TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f16) {
14871487
this->test_transpose_matmul_f16(3, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
14881488
}
14891489

1490+
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f16_n32) {
1491+
this->test_transpose_matmul_f16(3, false, false, /*BMKN*/{3, 256, 32, 128}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
1492+
}
1493+
14901494
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f32) {
14911495
this->test_transpose_matmul_f32(3, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 1}, /*input1_order*/{1, 2, 0});
14921496
}
14931497

1498+
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f32_n32) {
1499+
this->test_transpose_matmul_f32(3, false, false, /*BMKN*/{2, 128, 16, 256}, /*input0_order*/{0, 1, 2}, /*input1_order*/{0, 2, 1});
1500+
}
1501+
1502+
TEST_F(gemm_gpu_tests, transpose_matmul_static_3d_f32_n32_k_remainder) {
1503+
this->test_transpose_matmul_f32(3, false, false, /*BMKN*/{2, 128, 17, 256}, /*input0_order*/{0, 1, 2}, /*input1_order*/{0, 2, 1});
1504+
}
1505+
14941506
TEST_F(gemm_gpu_tests, transpose_matmul_dynamic_4d_f16_unaligned) {
14951507
this->test_transpose_matmul_f16(4, true, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 2, 3, 1}, /*input1_order*/{1, 2, 3, 0});
14961508
}

0 commit comments

Comments
 (0)