@@ -82,7 +82,7 @@ inline int8_t upc(int8_t h) {
82
82
return h | (-((h & (1 << 3 )) >> 3 ) & (-8 ));
83
83
}
84
84
85
- // NOTE: This routine implements the OLD ORDER
85
+ // NOTE: This routine implements the NEW ORDER: for each packed byte, the low nibble is emitted first, then the high nibble
86
86
#define avx2_i4toi8 (vinput, vout0, vout1 ) \
87
87
{ \
88
88
__m256i himask = _mm256_broadcastb_epi8 (_mm_set_epi32 (0 , 0 , 0 , 0xF0 )); \
@@ -102,8 +102,8 @@ inline int8_t upc(int8_t h) {
102
102
__m256i vhires = _mm256_or_si256 (vhi, _mm256_and_si256 (vsubhi, vextend)); \
103
103
__m256i vlores = _mm256_or_si256 (vlo, _mm256_and_si256 (vsublo, vextend)); \
104
104
\
105
- __m256i vunlo = _mm256_unpacklo_epi8 (vhires, vlores ); \
106
- __m256i vunhi = _mm256_unpackhi_epi8 (vhires, vlores ); \
105
+ __m256i vunlo = _mm256_unpacklo_epi8 (vlores, vhires ); \
106
+ __m256i vunhi = _mm256_unpackhi_epi8 (vlores, vhires ); \
107
107
*vout0 = _mm256_permute2x128_si256 (vunlo, vunhi, 0x20 ); \
108
108
*vout1 = _mm256_permute2x128_si256 (vunlo, vunhi, 0x31 ); \
109
109
}
@@ -339,8 +339,8 @@ void unpack_i4i8(const ov::SoPtr<ov::ITensor>& from,
339
339
pDst = static_cast <int8_t *>(to->data ()) + tailOffset;
340
340
341
341
for (std::size_t index = 0 ; index < ((total % 64 ) >> 1 ); index ++) {
342
- *(pDst++) = upc (hi4 (*(pSrc)));
343
342
*(pDst++) = upc (lo4 (*(pSrc)));
343
+ *(pDst++) = upc (hi4 (*(pSrc)));
344
344
pSrc++;
345
345
}
346
346
UNPACK_SAVE_TICK ();
@@ -458,8 +458,8 @@ void unpack_i4f16(const ov::SoPtr<ov::ITensor>& from,
458
458
int8_t unpackedToI8[VECSIZE] = {0 };
459
459
size_t unpackedIdx = 0 ;
460
460
for (std::size_t index = 0 ; index < total; index ++) {
461
- unpackedToI8[unpackedIdx++] = upc (hi4 (*(pSrc)));
462
461
unpackedToI8[unpackedIdx++] = upc (lo4 (*(pSrc)));
462
+ unpackedToI8[unpackedIdx++] = upc (hi4 (*(pSrc)));
463
463
if (unpackedIdx == VECSIZE) {
464
464
__m128i i8vec = _mm_loadu_si64 (reinterpret_cast <__m128i*>(unpackedToI8));
465
465
__m128i f16vec = avx2_i8tof16 (i8vec);
0 commit comments