@@ -1641,6 +1641,7 @@ struct llama_cparams {
     float yarn_attn_factor;
     float yarn_beta_fast;
     float yarn_beta_slow;
+    float defrag_thold;
 
     bool mul_mat_q;
     bool offload_kqv;
@@ -5117,16 +5118,16 @@ struct llm_build_context {
     struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
-        for (int i = 0; i < n_kv; ++i) {
-            const int id = ids[i];
+        for (uint32_t i = 0; i < ids.size(); ++i) {
+            const uint32_t id = ids[i];
 
-            if (i == id || id == n_kv) {
+            if (i == id || id == ids.size()) {
                 continue;
             }
 
-            int nm = 1;
+            uint32_t nm = 1;
 
-            while (i + nm < n_kv && (int) ids[i + nm] == id + nm) {
+            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
                 nm++;
             }
 
@@ -5158,6 +5159,8 @@ struct llm_build_context {
             i += nm - 1;
         }
 
+        //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+
         return gf;
     }
 
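A note on the `build_defrag` loop above: judging from the checks, `ids[i]` holds the destination cell for KV cell `i`, with `ids.size()` marking cells that are not moved, and runs of cells whose destinations are also consecutive are coalesced into a single block, so the graph gets one pair of views plus one copy per block rather than per cell. Below is a minimal standalone sketch of that coalescing, in plain C++ with a made-up destination map (not the llama.cpp types or API):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch only: print the contiguous (src, dst, len) blocks implied by a
// defrag destination map. ids[i] == ids.size() means "cell i is not moved".
static void print_move_blocks(const std::vector<uint32_t> & ids) {
    for (uint32_t i = 0; i < ids.size(); ++i) {
        const uint32_t id = ids[i];

        if (i == id || id == ids.size()) {
            continue; // already in place, or not scheduled to move
        }

        // extend the block while the destinations stay consecutive
        uint32_t nm = 1;
        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
            nm++;
        }

        std::printf("move cells [%u, %u) -> [%u, %u)\n", i, i + nm, id, id + nm);

        i += nm - 1;
    }
}

int main() {
    // hypothetical map for a 9-cell cache: cells 4..6 move to 0..2,
    // cell 8 moves to 3, everything else stays where it is
    const std::vector<uint32_t> ids = {9, 9, 9, 9, 0, 1, 2, 9, 3};
    print_move_blocks(ids);
}
```

For that map the sketch reports two blocks: `[4, 7) -> [0, 3)` and `[8, 9) -> [3, 4)`.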
@@ -7938,6 +7941,8 @@ static int llama_decode_internal(
         batch.seq_id = seq_id_arr.data();
     }
 
+    llama_kv_cache_update(&lctx);
+
     // if we have enough unused cells before the current head ->
     //   better to start searching from the beginning of the cache, hoping to fill it
     if (kv_self.head > kv_self.used + 2*n_tokens) {
@@ -7956,8 +7961,6 @@ static int llama_decode_internal(
 
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
-    llama_kv_cache_update(&lctx);
-
     ggml_backend_sched_reset(lctx.sched);
     ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
@@ -8007,6 +8010,18 @@ static int llama_decode_internal(
         }
     }
 
+    // decide if we need to defrag the kv cache
+    if (cparams.defrag_thold >= 0.0f) {
+        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
+
+        // queue defragmentation for next llama_kv_cache_update
+        if (fragmentation > cparams.defrag_thold) {
+            //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
+
+            llama_kv_cache_defrag(kv_self);
+        }
+    }
+
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
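The fragmentation check added above measures the fraction of the active KV window (`kv_self.n`) that is not covered by the used cells plus the incoming batch; it is only evaluated once the window holds at least 128 cells, and a negative `defrag_thold` (the default set further down) disables it entirely. A small worked example with hypothetical numbers, just to make the formula concrete:

```cpp
#include <cstdio>

// Worked example of the fragmentation metric above, with made-up values.
int main() {
    const int n        = 512;  // kv_self.n   : size of the active KV window
    const int used     = 300;  // kv_self.used: number of non-empty cells
    const int n_tokens = 32;   // tokens in the current batch

    // same expression as in llama_decode_internal (only evaluated when n >= 128)
    const float fragmentation = n >= 128 ? 1.0f - float(used + n_tokens)/float(n) : 0.0f;

    std::printf("fragmentation = %.4f\n", fragmentation); // ~0.35, so defrag_thold = 0.1f would queue a defrag
}
```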
@@ -8098,12 +8113,16 @@ static int llama_decode_internal(
 static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     auto & kv_self = lctx.kv_self;
 
+    const auto & hparams = lctx.model.hparams;
+
+    const uint32_t n_layer = hparams.n_layer;
+
     const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
     const uint32_t n_used = kv_self.used;
 
     assert(n_used <= n_kv);
 
-    const int64_t t_start = ggml_time_us();
+    //const int64_t t_start = ggml_time_us();
 
     // number of cells moved
     uint32_t n_moves = 0;
@@ -8127,15 +8146,26 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
         // found a hole - fill it with data from the end of the cache
 
-        // determine the size of the hole
         uint32_t nh = 1;
+
+        // determine the size of the hole
         while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
             nh++;
         }
 
-        // starting from the end, find nh non-empty cells
+        // each move requires 6*n_layer tensors (see build_defrag)
+        //   - source view, destination view, copy operation
+        //   - x2 for keys and values
+        //
+        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
+            // the graph is too big, we cannot move more cells
+            break;
+        }
+
         uint32_t nf = 0;
         uint32_t is = n_kv - 1;
+
+        // starting from the end, find nh non-empty cells
         for (; is > i0; --is) {
             const auto & cell1 = kv_self.cells[is];
 
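The new `6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES` guard keeps the defrag graph inside the node budget: per the comment, every moved block costs about 6*n_layer graph nodes (a source view, a destination view and a copy, for both K and V), and the check is conservative in that it budgets as if the `nh`-cell hole were filled by `nh` separate blocks. A rough back-of-the-envelope, assuming `LLAMA_MAX_NODES` is 8192 (its value in llama.cpp around this change) and a 32-layer model:

```cpp
#include <cstdio>

// Back-of-the-envelope bound on how many blocks one defrag graph can move.
int main() {
    const unsigned max_nodes = 8192; // assumption: LLAMA_MAX_NODES
    const unsigned n_layer   = 32;   // hypothetical model depth

    // each moved block needs ~6*n_layer nodes: src view + dst view + cpy, for K and V
    const unsigned max_blocks = max_nodes / (6*n_layer);

    std::printf("at most %u blocks per defrag graph\n", max_blocks); // prints 42
}
```

This is also why `n_moves` is counted per contiguous block rather than per cell in the hunks that follow (see the `cont` flag).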
@@ -8156,11 +8186,17 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 
         nf = 0;
 
+        uint32_t i1 = is;
+
+        // are we moving a continuous block of memory?
+        bool cont = false;
+
         // go back and move the nf cells to the hole
-        for (uint32_t i1 = is; i1 < n_kv; ++i1) {
-            const auto & cell1 = kv_self.cells[i1];
+        for (; i1 < n_kv; ++i1) {
+            auto & cell1 = kv_self.cells[i1];
 
             if (cell1.is_empty() || ids[i1] != n_kv) {
+                cont = false;
                 continue;
             }
 
@@ -8170,11 +8206,23 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
             // move the cell meta data
             kv_self.cells[i0 + nf] = cell1;
 
-            n_moves++;
+            // clear the old cell and move the head there
+            cell1 = llama_kv_cell();
+            kv_self.head = n_used;
+
+            if (!cont) {
+                n_moves++;
+                cont = true;
+            }
+
             nf++;
+
+            if (nf == nh) {
+                break;
+            }
         }
 
-        LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, n_kv, i0, i0 + nh);
+        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
         i0 += nh - 1;
     }
@@ -8183,15 +8231,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
         return;
     }
 
-    LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    kv_self.head = n_used;
-    kv_self.used = n_used;
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
 
-    // zero the rest of the cells
-    for (uint32_t i = n_used; i < n_kv; ++i) {
-        kv_self.cells[i] = llama_kv_cell();
-    }
+    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
 
 #if 0
     // CPU defrag
@@ -8203,9 +8245,6 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     // likely not worth the effort, as we have ggml_graph based defrag
     //
 
-    const auto & hparams = lctx.model.hparams;
-
-    const uint32_t n_layer = hparams.n_layer;
     const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
     const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
 
@@ -8274,9 +8313,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
 #endif
 
-    const int64_t t_end = ggml_time_us();
+    //const int64_t t_end = ggml_time_us();
 
-    LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
+    //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
 }
 
 static void llama_kv_cache_update_internal(struct llama_context & lctx) {
@@ -11670,6 +11709,7 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_fast    =*/ 32.0f,
         /*.yarn_beta_slow    =*/ 1.0f,
         /*.yarn_orig_ctx     =*/ 0,
+        /*.defrag_thold      =*/ -1.0f,
         /*.cb_eval           =*/ nullptr,
         /*.cb_eval_user_data =*/ nullptr,
        /*.type_k            =*/ GGML_TYPE_F16,
@@ -11834,6 +11874,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_attn_factor = params.yarn_attn_factor;
     cparams.yarn_beta_fast   = params.yarn_beta_fast;
     cparams.yarn_beta_slow   = params.yarn_beta_slow;
+    cparams.defrag_thold     = params.defrag_thold;
     cparams.mul_mat_q        = params.mul_mat_q;
     cparams.offload_kqv      = params.offload_kqv;
     cparams.do_pooling       = params.do_pooling;
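For completeness, a hedged usage sketch from the API side: leaving `defrag_thold` at its default of -1.0f keeps automatic defragmentation disabled, while any non-negative value enables the per-decode check shown earlier. Model loading and error handling are elided; `model` is assumed to be a valid pointer obtained elsewhere:

```cpp
#include "llama.h"

// Sketch: opting in to threshold-based KV-cache defrag via the new parameter.
llama_context * make_ctx_with_defrag(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();

    // queue a defrag whenever more than ~10% of the active KV window is fragmented;
    // the default of -1.0f leaves the check disabled
    cparams.defrag_thold = 0.1f;

    return llama_new_context_with_model(model, cparams);
}
```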
@@ -12035,7 +12076,7 @@ struct llama_context * llama_new_context_with_model(
     }
 
     // buffer used to store the computation graph and the tensor meta data
-    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
+    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
     ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
 