@@ -262,7 +262,7 @@ bool BLASKernelGenerator<hw>::getBlockInfo(Type T, const MatrixAddressing &atype
                 vymask.bitRep = consecutive;
                 vymask.maskRep = 1;
                 vymask.rsize = *yblock;
-                vymask.rdivide = 1;
+                vymask.rshift = 0;
             } else if (logicalSlots < slots) {
                 auto &fymask = block.colMajor ? block.rowMask.fixed : block.colMask.fixed;
                 fymask.isFixed = true;
@@ -279,7 +279,7 @@ bool BLASKernelGenerator<hw>::getBlockInfo(Type T, const MatrixAddressing &atype
                 vxmask.bitRep = (block.simdSize > 16) ? 32 : 16;
                 vxmask.maskRep = 1;
                 vxmask.rsize = 1;
-                vxmask.rdivide = 1;
+                vxmask.rshift = 0;
             } else if (allowDesc && (channelScattered || astrategy.newDP) && *xblock > 1 && !byte) {
                 fragment = std::min(*xblock, 4 * width / T);
                 if (block.colMajor)     // Clang can't handle the ternary operator equivalent of this.
@@ -482,7 +482,7 @@ bool BLASKernelGenerator<hw>::getBlockInfo(Type T, const MatrixAddressing &atype
                     vrmask.rsize = rblock;
                     vrmask.bitRep = std::max<int>(T.paddedSize() / maskGranularity, 1);
                     vrmask.maskRep = cblock;
-                    vrmask.rdivide = std::max<int>(maskGranularity / T, 1);
+                    vrmask.rshift = ilog2(std::max<int>(maskGranularity / T, 1));
                 }
             } else {
                 if (avoidFragment) {
@@ -491,8 +491,8 @@ bool BLASKernelGenerator<hw>::getBlockInfo(Type T, const MatrixAddressing &atype
                     vrmask.isFixed = false;
                     vrmask.bitRep = 0;      /* will be filled in later */
                     vrmask.maskRep = 1;
-                    vrmask.rdivide = 1;
                     vrmask.rsize = 1;
+                    vrmask.rshift = 0;
                 } else {
                     // Fragment it. Could actually handle rowFragment = 2 by changing descriptor.
                     block.rowFragment = 1;
@@ -520,7 +520,7 @@ bool BLASKernelGenerator<hw>::getBlockInfo(Type T, const MatrixAddressing &atype
                     vcmask.rsize = cblock;
                     vcmask.bitRep = std::max<int>(T.paddedSize() / maskGranularity, 1);
                     vcmask.maskRep = rblock;
-                    vcmask.rdivide = std::max<int>(maskGranularity / T, 1);
+                    vcmask.rshift = ilog2(std::max<int>(maskGranularity / T, 1));
                 }
             } else {
                 if (avoidFragment) {
@@ -529,8 +529,8 @@ bool BLASKernelGenerator<hw>::getBlockInfo(Type T, const MatrixAddressing &atype
                     vcmask.isFixed = false;
                     vcmask.bitRep = 0;
                     vcmask.maskRep = 1;
-                    vcmask.rdivide = 1;
                     vcmask.rsize = 1;
+                    vcmask.rshift = 0;
                 } else {
                     // Fragment it. Could actually handle colFragment = 2 by changing descriptor.
                     block.colFragment = 1;
@@ -719,7 +719,8 @@ bool BLASKernelGenerator<hw>::getBlockInfo(Type T, const MatrixAddressing &atype
             auto &vxmask = block.colMajor ? block.rowMask.variable : block.colMask.variable;
             vxmask.isFixed = false;
             vxmask.bitRep = block.simdSize;
-            vxmask.maskRep = vxmask.rdivide = vxmask.rsize = 1;
+            vxmask.maskRep = vxmask.rsize = 1;
+            vxmask.rshift = 0;
         }

         if (remainderY) {
@@ -728,7 +729,7 @@ bool BLASKernelGenerator<hw>::getBlockInfo(Type T, const MatrixAddressing &atype
             vymask.bitRep = xCacheLines;
             vymask.maskRep = 1;
             vymask.rsize = yblock;
-            vymask.rdivide = 1;
+            vymask.rshift = 0;
         }
         break;
     }
@@ -739,13 +740,13 @@ bool BLASKernelGenerator<hw>::getBlockInfo(Type T, const MatrixAddressing &atype
     if (block.rowMask && !block.rowMask.fixed.isFixed) {
         if (vrmask.rsize == 0)
             vrmask.rsize = rblock;
-        vrmask.maskRep = std::min<int>(vrmask.maskRep, std::max<int>(1, vrmask.rdivide * block.simdSize / (vrmask.bitRep * vrmask.rsize)));
+        vrmask.maskRep = std::min<int>(vrmask.maskRep, std::max<int>(1, (block.simdSize << vrmask.rshift) / (vrmask.bitRep * vrmask.rsize)));
         block.noRowsOK = true;      // All-zero masks are always OK.
     }
     if (block.colMask && !block.colMask.fixed.isFixed) {
         if (vcmask.rsize == 0)
             vcmask.rsize = cblock;
-        vcmask.maskRep = std::min<int>(vcmask.maskRep, std::max<int>(1, vcmask.rdivide * block.simdSize / (vcmask.bitRep * vcmask.rsize)));
+        vcmask.maskRep = std::min<int>(vcmask.maskRep, std::max<int>(1, (block.simdSize << vcmask.rshift) / (vcmask.bitRep * vcmask.rsize)));
         block.noColsOK = true;
     }

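The pattern across all hunks is the same: the mask description now stores the log2 of the old power-of-two divisor (rshift = ilog2(rdivide)), so the later maskRep clamp can shift block.simdSize left instead of multiplying by the divisor. Below is a minimal standalone sketch of that equivalence; the struct, the local ilog2() helper, and the numeric values are illustrative stand-ins rather than the generator's actual definitions, and it assumes the divisor max(maskGranularity / T, 1) is a power of two.

// Standalone sketch (not the generator's own code): shows why replacing the
// power-of-two divisor rdivide with its log2 (rshift) leaves maskRep unchanged.
#include <algorithm>
#include <cassert>
#include <cstdint>

static int ilog2(uint32_t x) {
    int n = -1;
    while (x != 0) { x >>= 1; n++; }
    return n;                       // floor(log2(x)); exact for powers of two
}

struct VariableMaskSketch {         // hypothetical stand-in for the mask fields in the diff
    int bitRep = 1, maskRep = 64, rsize = 1;
    int rshift = 0;                 // replaces the old rdivide == (1 << rshift)
};

int main() {
    // Illustrative values only (assumed, not taken from a real block setup).
    const int T = 2, maskGranularity = 4, simdSize = 16;

    VariableMaskSketch vmask;
    vmask.bitRep = 2;
    vmask.rsize  = 8;

    // Old formulation: keep the divisor itself.
    int rdivide = std::max(maskGranularity / T, 1);

    // New formulation from the diff: keep only its log2...
    vmask.rshift = ilog2(static_cast<uint32_t>(rdivide));

    // ...so the maskRep clamp can use a shift in place of the multiply.
    int oldRep = std::min(vmask.maskRep, std::max(1, rdivide * simdSize / (vmask.bitRep * vmask.rsize)));
    int newRep = std::min(vmask.maskRep, std::max(1, (simdSize << vmask.rshift) / (vmask.bitRep * vmask.rsize)));
    assert(oldRep == newRep);       // identical whenever rdivide is a power of two
    return oldRep == newRep ? 0 : 1;
}

The same identity, rdivide * x == x << ilog2(rdivide) for power-of-two rdivide, is what lets every remaining rdivide use in the file be rewritten as a shift.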