Apply the dynamic padding kernel code path if an input has any dynamic padding

kelvinchoi-intel · kelvinchoi-intel · commit e2a71d1ecd23 · 2024-10-07T16:41:25.000+09:00
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
@@ -181,7 +181,7 @@ KERNEL(gemm_tiled_opt)(
     // Start pointers offsets
 #if TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
     const __global INPUT0_TYPE* a_ptr = input0 + batch_offset_input0;
-    #if HAS_DYNAMIC_K_PADDING || INPUT0_HAS_PADDING
+    #if INPUT0_HAS_DYNAMIC_PADDING || INPUT0_HAS_PADDING
         const uint input0_offset = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y+1), 0) - batch_offset_input0;
         const uint input0_offset1 = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, (TILE_K)) - batch_offset_input0;
     #else
@@ -190,7 +190,7 @@ KERNEL(gemm_tiled_opt)(
     #endif
 #elif TRANSPOSE_INPUT0 == TRANSPOSE_Y_LAST
     const __global INPUT0_TYPE* a_ptr = input0 + batch_offset_input0;
-    #if HAS_DYNAMIC_K_PADDING || INPUT0_HAS_PADDING
+    #if INPUT0_HAS_DYNAMIC_PADDING || INPUT0_HAS_PADDING
         const uint input0_offset = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, 1) - batch_offset_input0;
         const uint input0_offset1 = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, y, (TILE_K)) - batch_offset_input0;
     #else
@@ -200,14 +200,14 @@ KERNEL(gemm_tiled_opt)(
 #endif // TRANSPOSE_INPUT0
 #if TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST
     const __global INPUT1_TYPE* b_ptr = input1 + batch_offset_input1;
-    #if HAS_DYNAMIC_K_PADDING || INPUT1_HAS_PADDING
+    #if INPUT1_HAS_DYNAMIC_PADDING || INPUT1_HAS_PADDING
         const uint input1_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, 1, tile_n_offset) - batch_offset_input1;
     #else
         const uint input1_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR 0, 0, 0, 0, 1, 0);
     #endif
 #elif TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST
     const __global INPUT1_TYPE* b_ptr = input1 + batch_offset_input1;
-    #if HAS_DYNAMIC_K_PADDING || INPUT1_HAS_PADDING
+    #if INPUT1_HAS_DYNAMIC_PADDING || INPUT1_HAS_PADDING
         const uint input1_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, 0, (tile_n_offset + 1)) - batch_offset_input1;
         const uint input1_offset1 = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (TILE_K), tile_n_offset) - batch_offset_input1;
     #else
@@ -386,7 +386,7 @@ KERNEL(gemm_tiled_opt)(
 #endif // TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST
 
         // Loading A tile and tile C calculation
-#if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING && TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
+#if IS_DYNAMIC && !INDIRECT_INPUT0 && !INPUT0_HAS_DYNAMIC_PADDING && !INPUT1_HAS_DYNAMIC_PADDING && TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
         A_FLOATN a_read = (TILE_K_NOT_DIVISIBLE == 0 || K_IS_ALIGNED_4BYTE) ? BLOCK_READ_A(a_ptr, 0): a_ptr[sglid];
 #endif
         unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
@@ -395,7 +395,7 @@ KERNEL(gemm_tiled_opt)(
         #if INDIRECT_INPUT0
             uint a_idx = FUNC_CALL(get_input0_indirect_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid), beam_table);
             A_FLOATN a_read = input0[a_idx];
-        #elif HAS_DYNAMIC_K_PADDING || INPUT0_HAS_PADDING
+        #elif INPUT0_HAS_DYNAMIC_PADDING || INPUT1_HAS_DYNAMIC_PADDING || INPUT0_HAS_PADDING
             // In case of dynamic padding we can't guarantee memory access alignment for
             // block reads (4 bytes), so use scattered read
             uint a_idx = FUNC_CALL(get_input0_index)(OPTIONAL_SHAPE_INFO_TENSOR b, f, w, z, (y + dot_id), (k * TILE_K + sglid));
@@ -431,7 +431,7 @@ KERNEL(gemm_tiled_opt)(
     #endif // TILE_K > SIMD_WIDTH
                 }
             }
-    #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING
+    #if IS_DYNAMIC && !INDIRECT_INPUT0 && !INPUT0_HAS_DYNAMIC_PADDING && !INPUT1_HAS_DYNAMIC_PADDING
         // Read A for next dot_id
         a_read = (dot_id + 1 < tile_m_iterations) ? (TILE_K_NOT_DIVISIBLE == 0 || K_IS_ALIGNED_4BYTE) ? BLOCK_READ_A(a_ptr, 0) : a_ptr[sglid] : 0;
     #endif
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.h
@@ -42,23 +42,19 @@ struct DimensionAccessHelperJit : virtual DimensionAccessHelperBase {
     : DimensionAccessHelperBase(t) {
         size_t dyn_shape_offset = t.get_dynamic_shape_offset();
         size_t dyn_pad_offset = dyn_shape_offset + DataTensor::max_rank();
+        has_dynamic_pad = false;
         for (auto d : dims) {
             dims_sizes.push_back(toCodeString(d, dyn_shape_offset, padded, d.pad.is_dynamic, dyn_pad_offset));
             dyn_shape_offset++;
             if (padded) {
                 if (d.pad.is_dynamic) {
                     pad_before_after_sizes.push_back("(shape_info[" + std::to_string(dyn_pad_offset++) + "])");
                     pad_before_after_sizes.push_back("(shape_info[" + std::to_string(dyn_pad_offset++) + "])");
+                    has_dynamic_pad = true;
                 } else {
                     pad_before_after_sizes.push_back(toCodeString(d.pad.before));
                     pad_before_after_sizes.push_back(toCodeString(d.pad.after));
                 }
-
-                if (d.pad.is_dynamic || d.is_dynamic) {
-                    dims_padded.push_back(true);
-                } else {
-                    dims_padded.push_back(false);
-                }
             }
         }
     }
@@ -82,7 +78,7 @@ struct DimensionAccessHelperJit : virtual DimensionAccessHelperBase {
 
     std::vector<std::string> dims_sizes;
     std::vector<std::string> pad_before_after_sizes;
-    std::vector<bool> dims_padded;
+    bool has_dynamic_pad;
 };
 
 std::vector<size_t> GetImageSizes(const kernel_selector::WeightsTensor& dimensions, const WeightsLayout layout);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/gemm/gemm_kernel_tiled_opt.cpp
@@ -205,10 +205,10 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons
         else
             jit.AddConstant(MakeJitConstant("TRANSPOSE_OUTPUT", 0 /* set as TRANSPOSE_X_LAST */));
 
-        bool has_dynamic_k_padding = params.transpose_input0 ? dims0_padded.dims_padded[input0_dims[6]]
-                                                             : dims0_padded.dims_padded[input0_dims[7]];
-        if (has_dynamic_k_padding)
-            jit.AddConstant(MakeJitConstant("HAS_DYNAMIC_K_PADDING", 1));
+        if (dims0_padded.has_dynamic_pad)
+            jit.AddConstant(MakeJitConstant("INPUT0_HAS_DYNAMIC_PADDING", 1));
+        if (dims1_padded.has_dynamic_pad)
+            jit.AddConstant(MakeJitConstant("INPUT1_HAS_DYNAMIC_PADDING", 1));
     } else {
         auto get_transposed_dim_size = [](const kernel_selector::DataTensor &data_tensor,
                                           const std::vector<int64_t>& dims_order, const std::string dim) {