|
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
| 4 | + |
// Storage element types of the kernel's buffers, as injected by the host-side
// JIT compiler: INPUT0 = data tensor, INPUT1 = sampling grid, OUTPUT = result.
typedef INPUT0_TYPE data_t;
typedef INPUT1_TYPE grid_t;
typedef OUTPUT_TYPE output_t;

// Element types used for arithmetic. Grid coordinate math is always done in
// float, independently of the grid buffer's storage type.
typedef INPUT0_TYPE data_et;
typedef float grid_et;
typedef OUTPUT_TYPE output_et;
| 12 | + |
// denormalize(): map a normalized grid coordinate in [-1, 1] to a pixel
// coordinate in the input tensor. The two variants implement the GridSample
// align_corners on/off formulas; FUNC/FUNC_CALL provide kernel-unique name
// mangling so several kernels can coexist in one program.
#if defined(ALIGN_CORNERS)
# define rescale_align FUNC(denormalize)
// align_corners=true: -1 and +1 map exactly onto the centers of the corner pixels.
inline grid_et rescale_align(const grid_et value, const size_t range) {
    return (value + 1) * ((grid_et)(range)-1) / 2;
}
#else
# define rescale_noalign FUNC(denormalize)
// align_corners=false: -1 and +1 map onto the outer edges of the corner pixels.
inline grid_et rescale_noalign(const grid_et value, const size_t range) {
    return ((value + 1) * (grid_et)(range)-1) / 2;
}
#endif
#define denormalize FUNC_CALL(denormalize)
| 25 | + |
| 26 | +inline const bool FUNC(is_between)(int val, int min, int max) { |
| 27 | + return (val >= min) && (val < max); |
| 28 | +} |
| 29 | +#define is_between FUNC_CALL(is_between) |
| 30 | + |
// Turn one normalized grid point (x_n, y_n) into the four bilinear taps:
//  - dy/dx: fractional interpolation weights within the 2x2 neighbourhood,
//  - v??_valid: whether each tap lies inside the input tensor,
//  - v??_OFFSET: flat input offsets, forced to 0 for invalid taps so the
//    subsequent unconditional loads stay in-bounds (values are masked later).
// Bug fix: the second parameter used to be named x_y while the body referenced
// y_n, silently capturing a variable of that name from the expansion site;
// the parameter is now y_n so the macro is self-contained.
#define PRE_CALC_VALID_OFFSETS_FOR_INPUT_LOAD(x_n, y_n, GLOBAL_OFFSET)                               \
    const grid_et y_d = denormalize(y_n, INPUT0_SIZE_Y);                                             \
    const grid_et x_d = denormalize(x_n, INPUT0_SIZE_X);                                             \
    const int y_topleft = (int)floor(y_d);                                                           \
    const int x_topleft = (int)floor(x_d);                                                           \
    const grid_et dy = y_d - y_topleft;                                                              \
    const grid_et dx = x_d - x_topleft;                                                              \
                                                                                                     \
    const bool y_topleft_valid = is_between(y_topleft, 0, INPUT0_SIZE_Y);                            \
    const bool y_topleft_plus_valid = is_between(y_topleft + 1, 0, INPUT0_SIZE_Y);                   \
    const bool x_topleft_valid = is_between(x_topleft, 0, INPUT0_SIZE_X);                            \
    const bool x_topleft_plus_valid = is_between(x_topleft + 1, 0, INPUT0_SIZE_X);                   \
                                                                                                     \
    const bool v00_valid = y_topleft_valid && x_topleft_valid;                                       \
    const bool v01_valid = y_topleft_valid && x_topleft_plus_valid;                                  \
    const bool v10_valid = y_topleft_plus_valid && x_topleft_valid;                                  \
    const bool v11_valid = y_topleft_plus_valid && x_topleft_plus_valid;                             \
                                                                                                     \
    const int v00_OFFSET = v00_valid ? (GLOBAL_OFFSET + y_topleft * INPUT0_SIZE_X + x_topleft) : 0;  \
    const int v01_OFFSET = v01_valid ? (GLOBAL_OFFSET + y_topleft * INPUT0_SIZE_X + x_topleft + 1) : 0; \
    const int v10_OFFSET = v10_valid ? (GLOBAL_OFFSET + (y_topleft + 1) * INPUT0_SIZE_X + x_topleft) : 0; \
    const int v11_OFFSET = v11_valid ? (GLOBAL_OFFSET + (y_topleft + 1) * INPUT0_SIZE_X + x_topleft + 1) : 0;
| 53 | + |
// Load the four neighbouring input values for channel c; invalid taps read
// from offset 0 (always in-bounds) and are masked out in INTERPOLATE.
//
// WARNING: These loads may read from a 'wrong' location
// (in the sense that it has nothing to do with the
// sampling point being calculated) - this is done
// intentionally to keep the warp free of sync points
// and allows for having multiple such loads in flight - if the
// compiler is smart enough.
// Otherwise, if the load were done conditionally, software pipelining
// would be hindered by a warp sync due to warp divergence.
// Tested on an A770 GPU with OpenCL 3.0.
#define LOAD_INPUT(c, C_STRIDE)                          \
    const data_et v00_d = data[v00_OFFSET + c * C_STRIDE]; \
    const data_et v01_d = data[v01_OFFSET + c * C_STRIDE]; \
    const data_et v10_d = data[v10_OFFSET + c * C_STRIDE]; \
    const data_et v11_d = data[v11_OFFSET + c * C_STRIDE];
| 68 | + |
// Bilinear blend of the four taps produced by LOAD_INPUT. Invalid taps
// contribute 0 (zeros padding mode). Defines 'out' with the sampled value.
#define INTERPOLATE()                               \
    const data_et v00 = v00_valid ? v00_d * (1 - dx) : 0; \
    const data_et v01 = v01_valid ? v01_d * dx : 0;       \
    const data_et v10 = v10_valid ? v10_d * (1 - dx) : 0; \
    const data_et v11 = v11_valid ? v11_d * dx : 0;       \
                                                          \
    const data_et q0 = v00 + v01;                         \
    const data_et q1 = v10 + v11;                         \
    const data_et out = dy * q1 + (1 - dy) * q0;
| 78 | + |
| 79 | +#define STORE(c, GLOBAL_OFFSET, C_STRIDE) output[GLOBAL_OFFSET + c * C_STRIDE] = out; |
| 80 | + |
| 81 | +// ==================================================================== |
| 82 | +// |
| 83 | +// GRID SAMPLE KERNEL |
| 84 | +// |
| 85 | +// ==================================================================== |
| 86 | + |
| 87 | +KERNEL(grid_sample_opt_bilinear_zeros)(const __global data_t* restrict data, |
| 88 | + const __global grid_t* restrict grid, |
| 89 | + __global output_t* restrict output) { |
| 90 | +#if !defined(INTERPOLATION_MODE_BILINEAR) |
| 91 | +# error[clDNN grid_sample_opt_bilinear.cl]: This kernel only support bilinear interppolation mode. |
| 92 | +#endif |
| 93 | + |
| 94 | +#if !defined(PADDING_MODE_ZEROS) |
| 95 | +# error[clDNN grid_sample_opt_bilinear.cl]: This kernel only support zeros padding mode. |
| 96 | +#endif |
| 97 | + |
| 98 | + const int n = get_global_id(0); |
| 99 | + |
| 100 | + const int LOCAL_GRID_OFFSET_FOR_THI_BLOCK = GRID_ITEMS_PER_BLOCK * 2 * get_group_id(1); |
| 101 | + const int OUTPUT_C_STRIDE = OUTPUT_SIZE_Y * OUTPUT_SIZE_X; |
| 102 | + const int GLOBAL_GRID_OFFSET_FOR_THIS_BLOCK = n * OUTPUT_C_STRIDE * 2 + LOCAL_GRID_OFFSET_FOR_THI_BLOCK; |
| 103 | + const int BLOCK_SIZE = get_local_size(1); |
| 104 | + const grid_t* restrict grid_for_this_block = grid + GLOBAL_GRID_OFFSET_FOR_THIS_BLOCK; |
| 105 | + const int GRID_ITEMS_FOR_THIS_BLOCK = |
| 106 | + min(OUTPUT_C_STRIDE * 2 - LOCAL_GRID_OFFSET_FOR_THI_BLOCK, GRID_ITEMS_PER_BLOCK * 2); |
| 107 | + |
| 108 | + const int INPUT_C_STRIDE = INPUT0_SIZE_Y * INPUT0_SIZE_X; |
| 109 | + const int GLOBAL_INPUT_OFFSET_THIS_THREAD = n * INPUT0_FEATURE_NUM * INPUT_C_STRIDE; |
| 110 | + |
| 111 | + // The basic idea is to cache and reuse grid vals for getting close to |
| 112 | + // optimal numer of loads(and stores). |
| 113 | + for (int thisThreadHW = get_local_linear_id() * 2; thisThreadHW < GRID_ITEMS_FOR_THIS_BLOCK; |
| 114 | + thisThreadHW += 2 * BLOCK_SIZE) { |
| 115 | + const int globalThisThreadHW = (thisThreadHW + LOCAL_GRID_OFFSET_FOR_THI_BLOCK) / 2; |
| 116 | + const int h = globalThisThreadHW / OUTPUT_SIZE_X; |
| 117 | + const int w = globalThisThreadHW % OUTPUT_SIZE_X; |
| 118 | + const int GLOBAL_OUTPUT_OFFSET_THIS_THREAD = |
| 119 | + n * OUTPUT_FEATURE_NUM * OUTPUT_SIZE_Y * OUTPUT_SIZE_X + h * OUTPUT_SIZE_X + w; |
| 120 | + |
| 121 | + const grid_et x_n = grid_for_this_block[thisThreadHW]; |
| 122 | + const grid_et y_n = grid_for_this_block[thisThreadHW + 1]; |
| 123 | + |
| 124 | + PRE_CALC_VALID_OFFSETS_FOR_INPUT_LOAD(x_n, y_n, GLOBAL_INPUT_OFFSET_THIS_THREAD); |
| 125 | + |
| 126 | +#pragma unroll |
| 127 | + for (int c = 0; c < OUTPUT_FEATURE_NUM; ++c) { |
| 128 | + LOAD_INPUT(c, INPUT_C_STRIDE); |
| 129 | + INTERPOLATE(); |
| 130 | + STORE(c, GLOBAL_OUTPUT_OFFSET_THIS_THREAD, OUTPUT_C_STRIDE); |
| 131 | + } |
| 132 | + } |
| 133 | +} |
| 134 | + |
// Undefine kernel-local helper macros so they do not leak into other kernels
// compiled in the same program. Fix: 'is_between' was previously missing here.
#undef denormalize
#undef is_between
#undef STORE
#undef INTERPOLATE
#undef PRE_CALC_VALID_OFFSETS_FOR_INPUT_LOAD
#undef LOAD_INPUT
0 commit comments