@@ -181,7 +181,7 @@ KERNEL(gemm_tiled_opt)(
181
181
// Start pointers offsets
182
182
#if TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
183
183
const __global INPUT0_TYPE * a_ptr = input0 + batch_offset_input0 ;
184
- #if HAS_DYNAMIC_K_PADDING || INPUT0_HAS_PADDING
184
+ #if INPUT0_HAS_DYNAMIC_PADDING || INPUT0_HAS_PADDING
185
185
const uint input0_offset = FUNC_CALL (get_input0_index )(OPTIONAL_SHAPE_INFO_TENSOR b , f , w , z , (y + 1 ), 0 ) - batch_offset_input0 ;
186
186
const uint input0_offset1 = FUNC_CALL (get_input0_index )(OPTIONAL_SHAPE_INFO_TENSOR b , f , w , z , y , (TILE_K )) - batch_offset_input0 ;
187
187
#else
@@ -190,7 +190,7 @@ KERNEL(gemm_tiled_opt)(
190
190
#endif
191
191
#elif TRANSPOSE_INPUT0 == TRANSPOSE_Y_LAST
192
192
const __global INPUT0_TYPE * a_ptr = input0 + batch_offset_input0 ;
193
- #if HAS_DYNAMIC_K_PADDING || INPUT0_HAS_PADDING
193
+ #if INPUT0_HAS_DYNAMIC_PADDING || INPUT0_HAS_PADDING
194
194
const uint input0_offset = FUNC_CALL (get_input0_index )(OPTIONAL_SHAPE_INFO_TENSOR b , f , w , z , y , 1 ) - batch_offset_input0 ;
195
195
const uint input0_offset1 = FUNC_CALL (get_input0_index )(OPTIONAL_SHAPE_INFO_TENSOR b , f , w , z , y , (TILE_K )) - batch_offset_input0 ;
196
196
#else
@@ -200,14 +200,14 @@ KERNEL(gemm_tiled_opt)(
200
200
#endif // TRANSPOSE_INPUT0
201
201
#if TRANSPOSE_INPUT1 == TRANSPOSE_X_LAST
202
202
const __global INPUT1_TYPE * b_ptr = input1 + batch_offset_input1 ;
203
- #if HAS_DYNAMIC_K_PADDING || INPUT1_HAS_PADDING
203
+ #if INPUT1_HAS_DYNAMIC_PADDING || INPUT1_HAS_PADDING
204
204
const uint input1_offset = FUNC_CALL (get_input1_index )(OPTIONAL_SHAPE_INFO_TENSOR b , f , w , z , 1 , tile_n_offset ) - batch_offset_input1 ;
205
205
#else
206
206
const uint input1_offset = FUNC_CALL (get_input1_index )(OPTIONAL_SHAPE_INFO_TENSOR 0 , 0 , 0 , 0 , 1 , 0 );
207
207
#endif
208
208
#elif TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST
209
209
const __global INPUT1_TYPE * b_ptr = input1 + batch_offset_input1 ;
210
- #if HAS_DYNAMIC_K_PADDING || INPUT1_HAS_PADDING
210
+ #if INPUT1_HAS_DYNAMIC_PADDING || INPUT1_HAS_PADDING
211
211
const uint input1_offset = FUNC_CALL (get_input1_index )(OPTIONAL_SHAPE_INFO_TENSOR b , f , w , z , 0 , (tile_n_offset + 1 )) - batch_offset_input1 ;
212
212
const uint input1_offset1 = FUNC_CALL (get_input1_index )(OPTIONAL_SHAPE_INFO_TENSOR b , f , w , z , (TILE_K ), tile_n_offset ) - batch_offset_input1 ;
213
213
#else
@@ -386,7 +386,7 @@ KERNEL(gemm_tiled_opt)(
386
386
#endif // TRANSPOSE_INPUT1 == TRANSPOSE_Y_LAST
387
387
388
388
// Loading A tile and tile C calculation
389
- #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING && TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
389
+ #if IS_DYNAMIC && !INDIRECT_INPUT0 && !INPUT0_HAS_DYNAMIC_PADDING && ! INPUT1_HAS_DYNAMIC_PADDING && TRANSPOSE_INPUT0 == TRANSPOSE_X_LAST
390
390
A_FLOATN a_read = (TILE_K_NOT_DIVISIBLE == 0 || K_IS_ALIGNED_4BYTE ) ? BLOCK_READ_A (a_ptr , 0 ): a_ptr [sglid ];
391
391
#endif
392
392
unroll_for (uint dot_id = 0 ; dot_id < tile_m_iterations ; dot_id ++ ) {
@@ -395,7 +395,7 @@ KERNEL(gemm_tiled_opt)(
395
395
#if INDIRECT_INPUT0
396
396
uint a_idx = FUNC_CALL (get_input0_indirect_index )(OPTIONAL_SHAPE_INFO_TENSOR b , f , w , z , (y + dot_id ), (k * TILE_K + sglid ), beam_table );
397
397
A_FLOATN a_read = input0 [a_idx ];
398
- #elif HAS_DYNAMIC_K_PADDING || INPUT0_HAS_PADDING
398
+ #elif INPUT0_HAS_DYNAMIC_PADDING || INPUT1_HAS_DYNAMIC_PADDING || INPUT0_HAS_PADDING
399
399
// In case of dynamic padding we can't guarantee memory access alignment for
400
400
// block reads (4 bytes), so use scattered read
401
401
uint a_idx = FUNC_CALL (get_input0_index )(OPTIONAL_SHAPE_INFO_TENSOR b , f , w , z , (y + dot_id ), (k * TILE_K + sglid ));
@@ -431,7 +431,7 @@ KERNEL(gemm_tiled_opt)(
431
431
#endif // TILE_K > SIMD_WIDTH
432
432
}
433
433
}
434
- #if IS_DYNAMIC && !INDIRECT_INPUT0 && !HAS_DYNAMIC_K_PADDING
434
+ #if IS_DYNAMIC && !INDIRECT_INPUT0 && !INPUT0_HAS_DYNAMIC_PADDING && ! INPUT1_HAS_DYNAMIC_PADDING
435
435
// Read A for next dot_id
436
436
a_read = (dot_id + 1 < tile_m_iterations ) ? (TILE_K_NOT_DIVISIBLE == 0 || K_IS_ALIGNED_4BYTE ) ? BLOCK_READ_A (a_ptr , 0 ) : a_ptr [sglid ] : 0 ;
437
437
#endif
0 commit comments