@@ -118,8 +118,8 @@ tile_load(tile_t& tile, payload_t& payload) {
118
118
static constexpr uint32_t max_load_width_in_elem =
119
119
load_store_attr::max_load_width_in_bytes / sizeof (dtype);
120
120
121
- // static constexpr uint32_t max_trans_load_height_in_elem =
122
- // load_store_attr::max_trans_load_height_in_elem;
121
+ // static constexpr uint32_t max_trans_load_height_in_elem =
122
+ // load_store_attr::max_trans_load_height_in_elem;
123
123
static constexpr uint32_t max_load_height_in_elem =
124
124
load_store_attr::max_load_height_in_elem;
125
125
@@ -206,6 +206,11 @@ tile_load(tile_t& tile, payload_t& payload) {
206
206
#pragma unroll
207
207
for (uint32_t ii = 0 ; ii < block_size_y / ld_blk_size_y; ++ii) {
208
208
constexpr uint32_t load_elems = ld_blk_size_y * block_size_x * arr_len;
209
+ uint32_t address_offset_x =
210
+ (mem_transpose ? (offset_y + ii * ld_blk_size_y) : offset_x) /
211
+ scale_factor;
212
+ uint32_t address_offset_y =
213
+ mem_transpose ? offset_x : (offset_y + ii * ld_blk_size_y);
209
214
reg_tmp.xetla_format <native_type_t <load_dtype>>() = xetla_load_global<
210
215
native_type_t <load_dtype>,
211
216
(trans ? ld_blk_size_y : block_size_x) / scale_factor,
@@ -222,13 +227,8 @@ tile_load(tile_t& tile, payload_t& payload) {
222
227
payload.surface_width ,
223
228
payload.surface_height ,
224
229
payload.surface_pitch ,
225
- payload.offset_x +
226
- (mem_transpose ? (offset_y / (int )scale_factor +
227
- ii * ld_blk_size_y / (int )scale_factor)
228
- : (offset_x / scale_factor)),
229
-
230
- payload.offset_y +
231
- (mem_transpose ? offset_x : (offset_y + ii * ld_blk_size_y)));
230
+ payload.offset_x + address_offset_x,
231
+ payload.offset_y + address_offset_y);
232
232
233
233
if constexpr (reg_transpose && trans) {
234
234
reg_blk.xetla_select <load_elems, 1 >(ii * load_elems)
0 commit comments