@@ -1341,47 +1341,53 @@ void align_src_dst_offset(GeneratorT *host, ngen_register_scope_t &scope,
1341
1341
// Reorder may require several steps; in that case a temporary buffer T is
1342
1342
// allocated. For example: A -> T -> B or A -> B -> T -> B
1343
1343
class reorder_2d_impl_t {
1344
+ struct reorder_step_t ;
1345
+
1344
1346
public:
1345
1347
// Constructor. In the '+' version of this diff it asserts that src_layout and
// dst_layout have the same type, maps both layouts to 2D, stores the mapped
// layouts in src_/dst_ and stores the result of find_min_cost_path() in path_.
// The '-' lines show that this planning code previously lived at the top of
// emit(), and that the old constructor only stored its arguments.
reorder_2d_impl_t (ngen::HW hw, tensor_t tile, const layout_t &src_layout,
1346
1348
const layout_t &dst_layout)
1347
- : hw_(hw), src_(src_layout), dst_(dst_layout), tile_(std::move(tile)) {
1348
- gpu_assert (src_.type () == dst_.type ());
1349
- }
1349
+ : hw_(hw), tile_(std::move(tile)) {
1350
+ gpu_assert (src_layout.type () == dst_layout.type ());
1350
1351
1351
- const tensor_t &tile () const { return tile_; }
1352
-
1353
- template <typename GeneratorT>
1354
- void emit (GeneratorT *host, ngen_register_scope_t &scope,
1355
- const reg_buf_data_t &src_rd, const reg_buf_data_t &dst_rd) {
1356
1352
// a_idx/b_idx and tile_a/tile_b are filled in by tile_to_2d_dims() below.
// a_idx/b_idx are then assigned to 2D dimensions 0 and 1, and tile_a/tile_b
// are forwarded to find_min_cost_path().
dim_idx_t a_idx, b_idx;
1357
1353
int tile_a, tile_b;
1358
1354
tile_to_2d_dims (tile_, a_idx, b_idx, tile_a, tile_b);
1359
1355
1360
1356
// Convert src/dst to 2D layouts.
1361
- dim_assignment_t to_ab (src_ .ndims (), 2 );
1357
+ dim_assignment_t to_ab (src_layout .ndims (), 2 );
1362
1358
to_ab.assign (a_idx, 0 );
1363
1359
to_ab.assign (b_idx, 1 );
1364
- auto src_ab = to_ab.map (src_ );
1365
- auto dst_ab = to_ab.map (dst_ );
1360
+ auto src_ab = to_ab.map (src_layout );
1361
+ auto dst_ab = to_ab.map (dst_layout );
1366
1362
1363
// Keep the 2D-mapped layouts (not the original ones) as members; emit()
// reads src_ and dst_.
+ src_ = src_ab;
1364
+ dst_ = dst_ab;
1367
1365
// Find minimal cost reorder path between layouts.
1368
- auto path = find_min_cost_path (hw_, src_ab, dst_ab, tile_a, tile_b);
1366
// The path is stored in a member so that both emit() and path() can use it.
+ path_ = find_min_cost_path (hw_, src_ab, dst_ab, tile_a, tile_b);
1367
+ }
1368
+
1369
// Returns the tile that was passed to the constructor.
+ const tensor_t &tile () const { return tile_; }
1370
// Returns the reorder steps that the constructor stored in path_.
+ const std::vector<reorder_step_t > &path () const { return path_; }
1371
+
1372
+ template <typename GeneratorT>
1373
+ void emit (GeneratorT *host, ngen_register_scope_t &scope,
1374
+ const reg_buf_data_t &src_rd, const reg_buf_data_t &dst_rd) {
1375
+ auto &orig_type = src_.type ();
1369
1376
1370
1377
// Allocate a temporary GRF buffer if needed.
1371
1378
reg_buf_data_t tmp;
1372
- if (path .size () > 1 ) {
1379
+ if (path_ .size () > 1 ) {
1373
1380
const int grf_size = ngen::GRF::bytes (hw_);
1374
1381
tmp = scope.alloc_reg_buf_data (
1375
- utils::div_up (dst_ab .size (), grf_size));
1382
+ utils::div_up (dst_ .size (), grf_size));
1376
1383
}
1377
1384
1378
1385
// Iterate through found reorders.
1379
- auto *prev_layout = &src_ab ;
1386
+ auto *prev_layout = &src_ ;
1380
1387
auto prev_rd = src_rd;
1381
- int path_len = int (path.size ());
1382
- auto &orig_type = src_ab.type ();
1388
+ int path_len = int (path_.size ());
1383
1389
for (int i = 0 ; i < path_len; i++) {
1384
- auto &step = path [i];
1390
+ auto &step = path_ [i];
1385
1391
auto &tile = step.tile ;
1386
1392
auto &type = step.type ;
1387
1393
auto *next_layout = &step.layout ;
@@ -1777,11 +1783,10 @@ class reorder_2d_impl_t {
1777
1783
}
1778
1784
1779
1785
// Hardware passed to the constructor; forwarded to find_min_cost_path() and
// used in emit() to query ngen::GRF::bytes().
ngen::HW hw_;
1780
-
1786
// Tile passed to the constructor (moved in). This diff moves the declaration
// ahead of src_/dst_, matching the initializer list order hw_, tile_.
+ tensor_t tile_;
1781
1787
// Source layout; in the '+' version this holds the 2D-mapped layout (src_ab).
layout_t src_;
1782
1788
// Destination layout; in the '+' version this holds the 2D-mapped layout
// (dst_ab).
layout_t dst_;
1783
-
1784
- tensor_t tile_;
1789
// Reorder steps returned by find_min_cost_path() in the constructor.
+ std::vector<reorder_step_t > path_;
1785
1790
};
1786
1791
1787
1792
class reorder_impl_t {
@@ -1914,6 +1919,15 @@ class reorder_impl_t {
1914
1919
scope.safeRelease (dummy);
1915
1920
1916
1921
reorder_2d_impl_t r (hw_, tile, src_tile_layout, dst_tile_layout);
1922
+ bool tile_ok = true ;
1923
+ for (auto &step : r.path ())
1924
+ if (step.tile .elems () < 2 ) {
1925
+ tile_ok = false ;
1926
+ break ;
1927
+ }
1928
+ // Skip any 2d reorder with a step tile of fewer than 2 elements (scalar move)
1929
+ if (!tile_ok) continue ;
1930
+
1917
1931
src_layout_.for_each_tile (
1918
1932
tile, [&](const std::vector<dim_t > &start) {
1919
1933
auto src_off = src_layout_.offset <dim_t >(start);
0 commit comments