@@ -571,6 +571,18 @@ typedef struct {
} block_iq4_nl;
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");

+// QR4_XS = 8 is very slightly faster than QR4_XS = 4
+#define QR4_XS 8
+#define QI4_XS (QK_K / (4*QR4_XS))
+typedef struct {
+    half d;
+    uint16_t scales_h;
+    uint8_t scales_l[QK_K/64];
+    uint8_t qs[QK_K/2];
+} block_iq4_xs;
+static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
+
+
#define WARP_SIZE 32
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
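Assuming the default QK_K == 256 super-block size, the new block_iq4_xs packs 256 weights into 2 + 2 + 4 + 128 = 136 bytes, i.e. 4.25 bits per weight. A small sanity check of that arithmetic (an editor's sketch, not part of the diff):

    // editor's sketch, assuming QK_K == 256 and the 2-byte half/ggml_fp16_t used above
    static_assert(sizeof(block_iq4_xs) == 2 + 2 + 256/64 + 256/2, "136 bytes per super-block of 256 weights");
    static_assert(8*sizeof(block_iq4_xs) == 17*256/4,             "i.e. 4.25 bits per weight");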
@@ -2427,6 +2439,25 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst

}

+template<typename dst_t>
+static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
+
+    const int i = blockIdx.x;
+    const block_iq4_xs * x = (const block_iq4_xs *)vx;
+
+    const int tid = threadIdx.x;
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 4*il;
+    const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
+    const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);
+    for (int j = 0; j < 4; ++j) {
+        y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
+        y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
+    }
+
+}
+
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
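The scale applied in the kernel above is a 6-bit value per 32-weight sub-block: its low 4 bits come from one nibble of scales_l, its top 2 bits from a 2-bit field of scales_h, and 32 is subtracted so the scale can be negative. A standalone restatement of that decode (an editor's sketch, assuming QK_K == 256; not part of the diff):

    // editor's sketch: 6-bit scale of sub-block ib (0..7) of a block_iq4_xs
    static __device__ __forceinline__ int iq4_xs_scale(const block_iq4_xs & x, int ib) {
        const int lo = (x.scales_l[ib/2] >> 4*(ib%2)) & 0xf; // two 4-bit fields per scales_l byte
        const int hi = (x.scales_h >> 2*ib) & 3;             // eight 2-bit fields in scales_h
        return (lo | (hi << 4)) - 32;                        // range -32 ... 31
    }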
@@ -5286,6 +5317,76 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
    return d * (sumi1 + sumi2);
}

+static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+#if QK_K == 256
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+
+    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
+    const uint8_t * values = (const uint8_t *)kvalues_iq4nl;
+
+    //// iqs is 0...7
+    //const int ib64 = iqs/2;
+    //const int il = iqs%2;
+    //const int32_t * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il;
+    //const int32_t * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il;
+    //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il;
+    //const uint32_t * q4_2 = q4_1 + 4;
+    //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4);
+    //const int8_t ls2 = (bq4->scales_l[ib64] >> 4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4);
+    //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds);
+    //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds);
+    //int v1, v2;
+    //int sumi1 = 0, sumi2 = 0;
+    //for (int j = 0; j < 2; ++j) {
+    //    get_int_from_table_16(q4_1[j], values, v1, v2);
+    //    sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1));
+    //    get_int_from_table_16(q4_2[j], values, v1, v2);
+    //    sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2));
+    //}
+    //return d1 * sumi1 + d2 * sumi2;
+
+    // iqs is 0...7
+    const int ib32 = iqs;
+    const int32_t * q8 = (const int *)bq8_1[ib32].qs;
+    const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32;
+    const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
+    const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
+    int v1, v2;
+    int sumi1 = 0, sumi2 = 0;
+    for (int j = 0; j < 4; ++j) {
+        get_int_from_table_16(q4[j], values, v1, v2);
+        sumi1 = __dp4a(v1, q8[j+0], sumi1);
+        sumi2 = __dp4a(v2, q8[j+4], sumi2);
+    }
+    return d * (sumi1 + sumi2);
+
+    //// iqs is 0...15
+    //const int ib32 = iqs/2;
+    //const int il = iqs%2;
+    //const int32_t * q8 = (const int *)bq8_1[ib32].qs + 2*il;
+    //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il;
+    //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
+    //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
+    //int v1, v2;
+    //int sumi1 = 0, sumi2 = 0;
+    //for (int j = 0; j < 2; ++j) {
+    //    get_int_from_table_16(q4[j], values, v1, v2);
+    //    sumi1 = __dp4a(v1, q8[j+0], sumi1);
+    //    sumi2 = __dp4a(v2, q8[j+4], sumi2);
+    //}
+    //return d * (sumi1 + sumi2);
+#else
+    assert(false);
+    return 0.f;
+#endif
+#else
+    assert(false);
+    return 0.f;
+#endif
+}
+
template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
static __device__ __forceinline__ void mul_mat_q(
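In the path that is left enabled above, each call handles one 32-value sub-block (ib32 = iqs, 0...7): get_int_from_table_16 expands four packed nibbles at a time into their kvalues_iq4nl codebook values, and __dp4a accumulates 4-way int8 dot products against the matching q8_1 activations. A plain scalar equivalent of that per-sub-block dot product, written out by the editor for clarity (assuming QK_K == 256; not part of the diff):

    // editor's sketch of the same per-sub-block result without __dp4a or get_int_from_table_16
    static __device__ float vec_dot_iq4_xs_q8_1_scalar(
            const block_iq4_xs * bq4, const block_q8_1 * bq8_1, const int ib32) {
        const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4);
        const float  d  = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds);
        const uint8_t * q4 = bq4->qs + 16*ib32; // 16 bytes = 32 packed 4-bit indices
        const int8_t  * q8 = bq8_1[ib32].qs;    // 32 int8 activations of this sub-block
        int sumi = 0;
        for (int j = 0; j < 16; ++j) {
            sumi += kvalues_iq4nl[q4[j] & 0xf] * q8[j +  0]; // low  nibbles -> values 0..15
            sumi += kvalues_iq4nl[q4[j] >>  4] * q8[j + 16]; // high nibbles -> values 16..31
        }
        return d * sumi;
    }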
@@ -7340,6 +7441,12 @@ static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k,
    dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
}

+template<typename dst_t>
+static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
+    const int nb = (k + QK_K - 1) / QK_K;
+    dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
+}
+
template <typename src_t, typename dst_t>
static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
@@ -7385,6 +7492,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
            return dequantize_row_iq1_s_cuda;
        case GGML_TYPE_IQ4_NL:
            return dequantize_row_iq4_nl_cuda;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_cuda;
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F32:
@@ -7428,6 +7537,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
            return dequantize_row_iq1_s_cuda;
        case GGML_TYPE_IQ4_NL:
            return dequantize_row_iq4_nl_cuda;
+        case GGML_TYPE_IQ4_XS:
+            return dequantize_row_iq4_xs_cuda;
        case GGML_TYPE_IQ3_S:
            return dequantize_row_iq3_s_cuda;
        case GGML_TYPE_F16:
@@ -9176,6 +9287,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
        default:
@@ -9203,6 +9315,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ3_S:
            return max_compute_capability >= CC_VOLTA ? 128 : 64;
        case GGML_TYPE_Q6_K:
@@ -9313,6 +9426,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
            mul_mat_vec_q_cuda<QK4_NL, QI4_NL, block_iq4_nl, VDR_Q4_0_Q8_1_MMVQ, vec_dot_iq4_nl_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
+        case GGML_TYPE_IQ4_XS:
+            mul_mat_vec_q_cuda<QK_K, QI4_XS, block_iq4_xs, 1, vec_dot_iq4_xs_q8_1>
+                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
+            break;
        case GGML_TYPE_IQ3_S:
            mul_mat_vec_q_cuda<QK_K, QI3_XS, block_iq3_s, 1, vec_dot_iq3_s_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
@@ -12041,7 +12158,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                ggml_type a_type = a->type;
                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
                    a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S ||
-                   a_type == GGML_TYPE_IQ2_S) {
+                   a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS ) {
                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                        return false;
                    }