Skip to content

Commit 25455a0

Browse files
authoredJul 31, 2024··
NPUW: Change the sub-byte (i4) element order in the unpack procedure to match OpenVINO 2024.0 (#25827)
### Details: In the latest versions of OpenVINO the sub-byte order is defined as [1,0] meaning that first (MSB) 4 bits of an 8-bit vector form 1st element, and the last (LSB) 4 bits of an 8-bit vector form 0th element. Our unpack procedures for i4 were aligned with the older representation, where sub-byte order was defined as [0,1] meaning that first (MSB) 4 bits of an 8-bit vector form 0th element, and the last (LSB) 4 bits were the 1st element. **Updated these unpack functions to use this new order.** ### Tickets: - *121052*
1 parent 3e058b9 commit 25455a0

File tree

1 file changed

+5
-5
lines changed
  • src/plugins/intel_npu/src/plugin/npuw

1 file changed

+5
-5
lines changed
 

‎src/plugins/intel_npu/src/plugin/npuw/util.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ inline int8_t upc(int8_t h) {
8282
return h | (-((h & (1 << 3)) >> 3) & (-8));
8383
}
8484

85-
// NOTE: This routine implements the OLD ORDER
85+
// NOTE: This routine implements the NEW ORDER
8686
#define avx2_i4toi8(vinput, vout0, vout1) \
8787
{ \
8888
__m256i himask = _mm256_broadcastb_epi8(_mm_set_epi32(0, 0, 0, 0xF0)); \
@@ -102,8 +102,8 @@ inline int8_t upc(int8_t h) {
102102
__m256i vhires = _mm256_or_si256(vhi, _mm256_and_si256(vsubhi, vextend)); \
103103
__m256i vlores = _mm256_or_si256(vlo, _mm256_and_si256(vsublo, vextend)); \
104104
\
105-
__m256i vunlo = _mm256_unpacklo_epi8(vhires, vlores); \
106-
__m256i vunhi = _mm256_unpackhi_epi8(vhires, vlores); \
105+
__m256i vunlo = _mm256_unpacklo_epi8(vlores, vhires); \
106+
__m256i vunhi = _mm256_unpackhi_epi8(vlores, vhires); \
107107
*vout0 = _mm256_permute2x128_si256(vunlo, vunhi, 0x20); \
108108
*vout1 = _mm256_permute2x128_si256(vunlo, vunhi, 0x31); \
109109
}
@@ -339,8 +339,8 @@ void unpack_i4i8(const ov::SoPtr<ov::ITensor>& from,
339339
pDst = static_cast<int8_t*>(to->data()) + tailOffset;
340340

341341
for (std::size_t index = 0; index < ((total % 64) >> 1); index++) {
342-
*(pDst++) = upc(hi4(*(pSrc)));
343342
*(pDst++) = upc(lo4(*(pSrc)));
343+
*(pDst++) = upc(hi4(*(pSrc)));
344344
pSrc++;
345345
}
346346
UNPACK_SAVE_TICK();
@@ -458,8 +458,8 @@ void unpack_i4f16(const ov::SoPtr<ov::ITensor>& from,
458458
int8_t unpackedToI8[VECSIZE] = {0};
459459
size_t unpackedIdx = 0;
460460
for (std::size_t index = 0; index < total; index++) {
461-
unpackedToI8[unpackedIdx++] = upc(hi4(*(pSrc)));
462461
unpackedToI8[unpackedIdx++] = upc(lo4(*(pSrc)));
462+
unpackedToI8[unpackedIdx++] = upc(hi4(*(pSrc)));
463463
if (unpackedIdx == VECSIZE) {
464464
__m128i i8vec = _mm_loadu_si64(reinterpret_cast<__m128i*>(unpackedToI8));
465465
__m128i f16vec = avx2_i8tof16(i8vec);

0 commit comments

Comments
 (0)
Please sign in to comment.