Skip to content

Commit d23f8f8

Browse files
committed
xe: concat: fix warnings and leading boundary shift
1 parent 0e71195 commit d23f8f8

File tree

3 files changed

+17
-4
lines changed

3 files changed

+17
-4
lines changed

src/gpu/intel/ocl/reusable_simple_concat.cl

+13
Original file line numberDiff line numberDiff line change
@@ -474,8 +474,21 @@ internal_padding_block_concat2(__global DATA_T *dst,
474474
if (leading_boundary_shift) {
475475
int block_scaled_leading_shift
476476
= leading_boundary_shift / blocks_per_simd1;
477+
int rollover = leading_boundary_shift % blocks_per_simd1;
478+
479+
if (((get_local_id(0) / B0) + rollover) >= blocks_per_simd1)
480+
block_scaled_leading_shift++;
481+
477482
bVal = AS_VEC((as_ulong(bVal)
478483
<< block_scaled_leading_shift * DATA_TYPE_SIZE * 8));
484+
485+
// cannot directly shift values to corresponding location in sg since each
486+
// workitem is responsible for multiple blocks, figure out source block
487+
int src_bank = ((get_local_id(0) / B0)
488+
+ leading_boundary_shift % blocks_per_simd1)
489+
% blocks_per_simd1;
490+
bVal = AS_VEC(intel_sub_group_shuffle(
491+
as_ulong(bVal), src_bank * B0 + (get_local_id(0) % B0)));
479492
}
480493

481494
int sg_shuffle_dt = cutoff;

src/gpu/intel/ocl/reusable_simple_concat.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ static status_t attempt_normalize_ip_concat2(
236236
if (simd > total_elems) continue;
237237

238238
const size_t min_bytes_per_workitem = 8;
239-
const size_t elems_per_simd
239+
const dim_t elems_per_simd
240240
= simd * (min_bytes_per_workitem / data_type_size);
241241
const bool simd_even_block_multiple
242242
= ((elems_per_simd) % conf.blocks[0]) == 0;
@@ -258,7 +258,7 @@ static status_t attempt_normalize_ip_concat2(
258258
conf.n = nonempty_inputs;
259259
conf.simd = max_simd;
260260
rt_conf.inner_axis = inner_offset;
261-
conf.data_type_size = data_type_size;
261+
conf.data_type_size = static_cast<int>(data_type_size);
262262

263263
conf.use_large_index = (max_bytes > std::numeric_limits<int>::max());
264264

@@ -296,7 +296,7 @@ static status_t attempt_normalize_ip_concat2(
296296

297297
if (can_use_internal_padding_concat2) {
298298
rt_conf.inner_axis = concat2_inner_axis;
299-
conf.data_type_size = concat2_dtsize;
299+
conf.data_type_size = static_cast<int>(concat2_dtsize);
300300
conf.use_internal_padding_kernel = true;
301301

302302
// TODO: compute::get_optimal_lws( // no emperical diff

src/gpu/intel/ocl/reusable_simple_concat.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ struct reusable_simple_concat_params_t
5959
int data_type_size;
6060
bool use_large_index = true;
6161
bool use_internal_padding_kernel = false;
62-
uint8_t padding[3] = {0};
62+
uint8_t padding[2] = {0};
6363
};
6464

6565
struct reusable_simple_concat_runtime_params_t {

0 commit comments

Comments
 (0)