Skip to content

Commit 0becb22

Browse files
ikawrakow (Kawrakow)
and Iwan Kawrakow co-authored
IQ4_XS: a 4.25 bpw quantization (#5747)
* Try IQ4_NL with blocks of 64 - does not look good * iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32 * iq4_xs: CUDA works - 133.2 t/s * iq4_xs: AVX2 dot product * iq4_xs: ARM_NEON dot product * iq4_nl: Metal implementation As usual, Metal / Apple Silicon don't like my quants. * iq3_xs: minor fix * iq4_xs: shrink by using IQ3_S for attn_k and attn_q * iq4_xs: revert using IQ3_S for attn_k and attn_v PPL vs size is good, but CPU performance suffers: on M2 Max TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when using IQ3_S vs 133 t/s with pure IQ4_XS. * Fix CI * iq4_xs: Added forgotten check for 256 divisibility --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent c24a2a6 commit 0becb22

11 files changed

+668
-38
lines changed

examples/quantize/quantize.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
3636
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
3737
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
3838
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
39-
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.25 bpw non-linear quantization", },
39+
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
40+
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
4041
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
4142
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
4243
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },

ggml-cuda.cu

+118-1
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,18 @@ typedef struct {
571571
} block_iq4_nl;
572572
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
573573

574+
// QR4_XS = 8 is very slightly faster than QR4_XS = 4
575+
#define QR4_XS 8
576+
#define QI4_XS (QK_K / (4*QR4_XS))
577+
typedef struct {
578+
half d;
579+
uint16_t scales_h;
580+
uint8_t scales_l[QK_K/64];
581+
uint8_t qs[QK_K/2];
582+
} block_iq4_xs;
583+
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
584+
585+
574586
#define WARP_SIZE 32
575587
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
576588

@@ -2427,6 +2439,25 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
24272439

24282440
}
24292441

2442+
template<typename dst_t>
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    // One 32-thread CUDA block dequantizes one IQ4_XS super-block of QK_K values.
    // Thread mapping: ib = 32-value sub-block (0...7), il = 4-byte group within it (0...3).
    const block_iq4_xs * x = (const block_iq4_xs *)vx;
    const int ibl = blockIdx.x;   // super-block index

    const int lane = threadIdx.x;
    const int il   = lane/8;      // 0...3
    const int ib   = lane%8;      // 0...7

    dst_t         * y  = yy + ibl*QK_K + 32*ib + 4*il;
    const uint8_t * q4 = x[ibl].qs + 16*ib + 4*il;

    // Reassemble the 6-bit sub-block scale: low 4 bits from scales_l, high 2 bits
    // from scales_h, biased by -32 so the stored value is signed.
    const int   ls = ((x[ibl].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[ibl].scales_h >> 2*ib) & 3) << 4);
    const float d  = (float)x[ibl].d * (ls - 32);

    for (int j = 0; j < 4; ++j) {
        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];  // low nibbles -> first 16 values of the sub-block
        y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];   // high nibbles -> last 16 values
    }
}
2460+
24302461
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
24312462

24322463
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@@ -5286,6 +5317,76 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
52865317
return d * (sumi1 + sumi2);
52875318
}
52885319

5320+
// Dot product of one 32-value IQ4_XS sub-block with the matching q8_1 block.
// iqs is the sub-block index within the super-block, 0...7. Requires the DP4A
// integer intrinsic (SM61+); compiles to an assert-and-return-0 stub otherwise.
// NOTE(review): removed two large commented-out experimental variants (an
// ib64/2-scale version and an iqs=0...15 half-sub-block version) that were left
// in the body; the active implementation below is unchanged.
static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#if QK_K == 256
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

    // iqs is 0...7
    const int ib32 = iqs;
    const int32_t  * q8 = (const int *)bq8_1[ib32].qs;
    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
    // 6-bit sub-block scale: low 4 bits from scales_l, high 2 bits from scales_h, biased by -32.
    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
    const float  d  = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
    int v1, v2;
    int sumi1 = 0, sumi2 = 0;
    for (int j = 0; j < 4; ++j) {
        // Expand 8 packed 4-bit indices into two int32s of table values (low/high nibbles).
        get_int_from_table_16(q4[j], values, v1, v2);
        sumi1 = __dp4a(v1, q8[j+0], sumi1);
        sumi2 = __dp4a(v2, q8[j+4], sumi2);
    }
    return d * (sumi1 + sumi2);
#else
    assert(false);
    return 0.f;
#endif
#else
    assert(false);
    return 0.f;
#endif
}
5389+
52895390
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
52905391
allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
52915392
static __device__ __forceinline__ void mul_mat_q(
@@ -7340,6 +7441,12 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k,
73407441
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
73417442
}
73427443

7444+
template<typename dst_t>
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    // Host launcher: one 32-thread CUDA block per QK_K-value super-block.
    const int nb = (k + QK_K - 1) / QK_K;  // ceil-div: super-blocks needed to cover k values
    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
}
7449+
73437450
template <typename src_t, typename dst_t>
73447451
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
73457452
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
@@ -7385,6 +7492,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
73857492
return dequantize_row_iq1_s_cuda;
73867493
case GGML_TYPE_IQ4_NL:
73877494
return dequantize_row_iq4_nl_cuda;
7495+
case GGML_TYPE_IQ4_XS:
7496+
return dequantize_row_iq4_xs_cuda;
73887497
case GGML_TYPE_IQ3_S:
73897498
return dequantize_row_iq3_s_cuda;
73907499
case GGML_TYPE_F32:
@@ -7428,6 +7537,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
74287537
return dequantize_row_iq1_s_cuda;
74297538
case GGML_TYPE_IQ4_NL:
74307539
return dequantize_row_iq4_nl_cuda;
7540+
case GGML_TYPE_IQ4_XS:
7541+
return dequantize_row_iq4_xs_cuda;
74317542
case GGML_TYPE_IQ3_S:
74327543
return dequantize_row_iq3_s_cuda;
74337544
case GGML_TYPE_F16:
@@ -9176,6 +9287,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
91769287
case GGML_TYPE_IQ3_XXS:
91779288
case GGML_TYPE_IQ1_S:
91789289
case GGML_TYPE_IQ4_NL:
9290+
case GGML_TYPE_IQ4_XS:
91799291
case GGML_TYPE_IQ3_S:
91809292
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
91819293
default:
@@ -9203,6 +9315,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
92039315
case GGML_TYPE_IQ3_XXS:
92049316
case GGML_TYPE_IQ1_S:
92059317
case GGML_TYPE_IQ4_NL:
9318+
case GGML_TYPE_IQ4_XS:
92069319
case GGML_TYPE_IQ3_S:
92079320
return max_compute_capability >= CC_VOLTA ? 128 : 64;
92089321
case GGML_TYPE_Q6_K:
@@ -9313,6 +9426,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
93139426
mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
93149427
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
93159428
break;
9429+
case GGML_TYPE_IQ4_XS:
9430+
mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
9431+
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
9432+
break;
93169433
case GGML_TYPE_IQ3_S:
93179434
mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
93189435
(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
@@ -12041,7 +12158,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
1204112158
ggml_type a_type = a->type;
1204212159
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
1204312160
a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
12044-
a_type == GGML_TYPE_IQ2_S) {
12161+
a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) {
1204512162
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
1204612163
return false;
1204712164
}

0 commit comments

Comments
 (0)