@@ -29,6 +29,17 @@ static inline vbool4_t mask_lshift(vbool4_t m, unsigned shiftIn, size_t vl) {
 		RV_MASK_CAST(4, 8, mvl), RV_MASK_CAST(4, 8, mvr), vl
 	);
 }
+template<int shift>
+static inline vbool64_t mask_lshift(vbool64_t m, unsigned shiftIn, size_t vl) {
+	vuint8m1_t mv = RV_VEC_CAST(64, 8, m);
+	vuint8m1_t mvl = RV(vsll_vx_u8m1)(mv, shift, vl/8);
+	vuint8m1_t mvr = RV(vsrl_vx_u8m1)(mv, 8-shift, vl/8);
+	mvr = RV(vslide1up_vx_u8m1)(mvr, shiftIn, vl/8);
+
+	return RV(vmor_mm_b64)(
+		RV_MASK_CAST(64, 8, mvl), RV_MASK_CAST(64, 8, mvr), vl
+	);
+}
 
 static inline vuint8m2_t set_first_vu8(vuint8m2_t src, uint8_t item, size_t vl) {
 #ifdef __riscv_v_intrinsic
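The new vbool64_t overload added above mirrors the existing vbool4_t version: the mask register is reinterpreted as bytes, each byte is shifted left, the bits that fall out of the byte below are OR'd back in after a one-element slide up, and shiftIn fills the bottom byte. A scalar model of the intended semantics on a single 64-bit mask word (illustrative only, not part of the commit; it assumes shiftIn only carries its low `shift` bits, as the callers here do):

#include <stdint.h>

// Scalar model of mask_lshift<shift> on one 64-bit mask word (hypothetical
// helper for illustration): bits move up by `shift`, the vacated low bits are
// filled from shiftIn, and bits shifted past the top are dropped.
static inline uint64_t mask_lshift_ref(uint64_t mask, unsigned shift, unsigned shiftIn) {
	return (mask << shift) | (uint64_t)shiftIn;
}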
@@ -195,48 +206,41 @@ HEDLEY_ALWAYS_INLINE void do_decode_rvv(const uint8_t* src, long& len, unsigned
 		// the yEnc specification requires any character following = to be unescaped, not skipped over, so we'll deal with that
 		// firstly, check for invalid sequences of = (we assume that these are rare, as a spec compliant yEnc encoder should not generate these)
 		if(LIKELIHOOD(0.0001, RV(vcpop_m_b4)(RV(vmandn_mm_b4)(cmpEqShift1, cmp, vl2), vl2) != 0)) {
-			// note: we assume that uintptr_t corresponds with __riscv_xlen
-			#if __riscv_xlen == 64
-			vuint64m1_t cmpEqW = RV_VEC_CAST(4, 64, cmpEq);
-			#else
-			vuint32m1_t cmpEqW = RV_VEC_CAST(4, 32, cmpEq);
-			#endif
-			size_t nextShiftDown = (vl2 > sizeof(uintptr_t)*8 ? sizeof(uintptr_t)*8 : vl2) - 1;
-			size_t wvl = (vl2 + sizeof(uintptr_t)*8-1) / (sizeof(uintptr_t)*8);
-			for(size_t w=0; w<vl2; w+=sizeof(uintptr_t)*8) {
-				// extract bottom word
-				#if __riscv_xlen == 64
-				uintptr_t maskW = RV(vmv_x_s_u64m1_u64)(cmpEqW);
-				#else
-				uintptr_t maskW = RV(vmv_x_s_u32m1_u32)(cmpEqW);
-				#endif
-
-				// fix it
-				maskW = fix_eqMask<uintptr_t>(maskW, (maskW << 1) | escFirst);
-				uint8_t nextEscFirst = (maskW >> nextShiftDown) & 1;
-
-				// shift it up (will be used for cmpEqShift1)
-				maskW = (maskW<<1) | escFirst; // TODO: should this be done using mask_lshift<1> instead?
-				escFirst = nextEscFirst;
-
-				// slide the new value in from the top
-				#if __riscv_xlen == 64
-				cmpEqW = RV(vslide1down_vx_u64m1)(cmpEqW, maskW, wvl);
-				#else
-				cmpEqW = RV(vslide1down_vx_u32m1)(cmpEqW, maskW, wvl);
-				#endif
+			// replicate fix_eqMask, but in vector form
+			vbool4_t groupStart = RV(vmandn_mm_b4)(cmpEq, cmpEqShift1, vl2);
+			vbool4_t evenBits = RV_MASK_CAST(4, 8, RV(vmv_v_x_u8m1)(0x55, vl2));
+			vbool4_t evenStart = RV(vmand_mm_b4)(groupStart, evenBits, vl2);
+
+			// compute `cmpEq + evenStart` to obtain oddGroups
+			vbool4_t oddGroups;
+			vuint64m1_t cmpEq64 = RV_VEC_CAST(4, 64, cmpEq);
+			vuint64m1_t evenStart64 = RV_VEC_CAST(4, 64, evenStart);
+			vuint64m1_t oddGroups64;
+			if(vl2 <= 64) {
+				// no loop needed - single 64b add will work
+				oddGroups64 = RV(vadd_vv_u64m1)(cmpEq64, evenStart64, 1);
+			} else {
+				// need to loop whilst the add causes a carry
+				unsigned vl64 = vl2/64;
+				vbool64_t carry = RV(vmadc_vv_u64m1_b64)(cmpEq64, evenStart64, vl64);
+				carry = mask_lshift<1>(carry, 0, vl64);
+				oddGroups64 = RV(vadd_vv_u64m1)(cmpEq64, evenStart64, vl64);
+				while(RV(vcpop_m_b64)(carry, vl64)) {
+					vbool64_t nextCarry = RV(vmadc_vx_u64m1_b64)(oddGroups64, 1, vl64);
+					oddGroups64 = RV(vadd_vx_u64m1_mu)(carry, oddGroups64, oddGroups64, 1, vl64);
+					carry = mask_lshift<1>(nextCarry, 0, vl64);
+				}
 			}
-			#if __riscv_xlen == 64
-			cmpEqShift1 = RV_MASK_CAST(4, 64, cmpEqW);
-			#else
-			cmpEqShift1 = RV_MASK_CAST(4, 32, cmpEqW);
-			#endif
+			oddGroups = RV_MASK_CAST(4, 64, oddGroups64);
+
+			cmpEq = RV(vmand_mm_b4)(RV(vmxor_mm_b4)(oddGroups, evenBits, vl2), cmpEq, vl2);
+
+			cmpEqShift1 = mask_lshift<1>(cmpEq, escFirst, vl2);
 			
 			cmp = RV(vmor_mm_b4)(cmpEqShift1, cmp, vl2); // ~(~cmp & ~cmpEqShift1)
 			numOutputChars = RV(vcpop_m_b4)(cmp, vl2);
-		} else {
-			// no invalid = sequences found - don't need to fix up cmpEq
-			escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
 		}
+		escFirst = RV(vcpop_m_b4)(RV(vmand_mm_b4)(cmpEq, lastBit, vl2), vl2);
+
 		data = RV(vsub_vv_u8m2)(data, RV_vmerge_vxm_u8m2(yencOffset, 64+42, cmpEqShift1, vl2), vl2);
 		yencOffset = set_first_vu8(yencOffset, 42 | (escFirst<<6), vl2);
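The added block above replaces the word-by-word loop with a mask-register version of the same fix_eqMask bit trick: in a run of consecutive `=` bytes, only every second one (starting from the run's first `=`, unless that byte is itself escaped, which cmpEqShift1/escFirst flags) is a real escape character. A scalar sketch of the trick on one 64-bit word, matching the removed call fix_eqMask<uintptr_t>(maskW, (maskW << 1) | escFirst); the name and exact form here are illustrative, not taken from the source:

#include <stdint.h>

// mask: bit i set if byte i is '='; prevShifted: mask shifted up one bit with
// the previous block's escape state (escFirst) in bit 0.
static inline uint64_t fix_eq_mask_ref(uint64_t mask, uint64_t prevShifted) {
	const uint64_t evenBits = 0x5555555555555555ull; // bits at even positions
	uint64_t groupStart = mask & ~prevShifted;        // first '=' of each run
	uint64_t evenStart  = groupStart & evenBits;      // runs starting on an even bit
	uint64_t oddGroups  = mask + evenStart;           // the add ripples a carry through each such run
	return mask & (oddGroups ^ evenBits);             // keep alternate bits within every run
}

For example, mask = 0b0111 (three '=' bytes starting at an even position, nothing carried in) gives groupStart = evenStart = 0b0001, oddGroups = 0b1000, and a result of 0b0101: the first and third '=' stay as escape characters, the second is their escaped data.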
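When vl2 exceeds 64, the mask no longer fits in a single 64-bit element, so the `cmpEq + evenStart` addition has to behave like one wide integer add across all of the 64-bit lanes: vmadc captures each lane's carry-out, mask_lshift<1> moves those carries up to the next lane, and the masked vadd keeps adding 1 into carrying lanes until no new carries appear. A plain-C sketch of that idea, with an array standing in for the vector register (names and loop shape are illustrative, not the commit's code):

#include <stdint.h>
#include <stddef.h>

// Add two multi-lane integers lane by lane, then ripple the carries upward
// until none remain; the same idea as the vmadc/vadd/mask_lshift<1> loop.
// (Sketch only: assumes lanes <= 64; the carry out of the top lane is discarded.)
static void wide_add_ref(uint64_t* sum, const uint64_t* a, const uint64_t* b, size_t lanes) {
	int carry[65] = {0}; // carry[i] is the carry entering lane i
	for (size_t i = 0; i < lanes; i++) {
		sum[i] = a[i] + b[i];
		carry[i + 1] = sum[i] < a[i]; // unsigned wrap-around means a carry out of lane i
	}
	int again = 1;
	while (again) { // loop while the +1s themselves produce new carries
		again = 0;
		for (size_t i = 1; i < lanes; i++) {
			if (carry[i]) {
				carry[i] = 0;
				if (++sum[i] == 0) { carry[i + 1] = 1; again = 1; }
			}
		}
	}
}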