@@ -134,17 +134,14 @@ struct simple_sparse_reorder_impl<SIMPLE_SPARSE_REORDER_TEMPL_CALL,
134
134
size_t offset = padded_dims[0 ] * padded_dims[1 ];
135
135
136
136
int total_blocks = offset / 4096 ;
137
- int16_t *comp_tile_len_ptr = reinterpret_cast <int16_t *>(output);
137
+ using comp_tile_len_type = int ;
138
+ comp_tile_len_type *comp_tile_len_ptr = reinterpret_cast <comp_tile_len_type *>(output);
138
139
int comp_tile_len_index = 0 ;
139
140
int cl_length = 0 ;
140
- // TODO: why 2 / 64?
141
141
// Wasting memory space due to allocating a buffer for the whole tensor?
142
- int output_offset = ceil ((float )total_blocks * 2 / 64.0 );
143
-
144
- size_t offset_2 = static_cast <size_t >(ceil ((float )total_blocks * 2 / 64.0 )) * 64 ;
145
- uint64_t *bitmask_ptr = reinterpret_cast <uint64_t *>(output + offset + offset_2);
146
-
147
- auto outp = &output[output_d.blk_off (0 , 0 , 0 , 0 ) + output_offset * 64 ];
142
+ int output_offset = ceil ((float )total_blocks * sizeof (comp_tile_len_type) / 64.0 ) * 64 ;
143
+ uint64_t *bitmask_ptr = reinterpret_cast <uint64_t *>(output + output_offset + offset);
144
+ auto outp = &output[output_d.blk_off (0 , 0 , 0 , 0 ) + output_offset];
148
145
149
146
// TODO: add threading.
150
147
for (int O = 0 ; O < NB_OC; O++) {
@@ -184,7 +181,7 @@ struct simple_sparse_reorder_impl<SIMPLE_SPARSE_REORDER_TEMPL_CALL,
184
181
if (count % 64 == 0 ) { bitmask_idx++; }
185
182
}
186
183
}
187
- int16_t cl = (int16_t )ceil (non_zeros / 64.0 );
184
+ comp_tile_len_type cl = (comp_tile_len_type )ceil (non_zeros / 64.0 );
188
185
comp_tile_len_index++;
189
186
cl_length = comp_tile_len_ptr[comp_tile_len_index - 1 ] + cl;
190
187
int unsed_bytes_in_cl = 64 - (non_zeros % 64 );
0 commit comments