Skip to content

Commit 436fa6c

Browse files
committed
clear kv cache on each completion
1 parent d5d1781 commit 436fa6c

File tree

3 files changed

+15
-13
lines changed

3 files changed

+15
-13
lines changed

godot/addons/godot-llama-cpp/godot-llama-cpp.gdextension

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ macos.release = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp-aarch64-mac
1010
windows.debug.x86_32 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.windows.template_debug.x86_32.dll"
1111
windows.release.x86_32 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.windows.template_release.x86_32.dll"
1212
windows.debug.x86_64 = "res://addons/godot-llama-cpp/lib/godot-llama-cpp-x86_64-windows-gnu-Debug.dll"
13-
windows.release.x86_64 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.windows.template_release.x86_64.dll"
13+
windows.release.x86_64 = "res://addons/godot-llama-cpp/lib/godot-llama-cpp-x86_64-windows-gnu-ReleaseFast.dll"
1414
linux.debug.x86_64 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.linux.template_debug.x86_64.so"
1515
linux.release.x86_64 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.linux.template_release.x86_64.so"
1616
linux.debug.arm64 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.linux.template_debug.arm64.so"

src/llama_context.cpp

+8-7
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ void LlamaContext::_ready() {
3030
return;
3131
}
3232

33-
ctx_params.seed = -1;
34-
ctx_params.n_ctx = 2048;
33+
ctx_params.seed = -1;
34+
ctx_params.n_ctx = 4096;
3535
int32_t n_threads = OS::get_singleton()->get_processor_count();
3636
ctx_params.n_threads = n_threads;
3737
ctx_params.n_threads_batch = n_threads;
@@ -66,14 +66,14 @@ void LlamaContext::_fulfill_completion(const String &prompt) {
6666
return;
6767
}
6868

69-
llama_batch batch = llama_batch_init(tokens_list.size(), 0, 1);
70-
7169
for (size_t i = 0; i < tokens_list.size(); i++) {
7270
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
7371
}
74-
72+
7573
batch.logits[batch.n_tokens - 1] = true;
7674

75+
llama_kv_cache_clear(ctx);
76+
7777
int decode_res = llama_decode(ctx, batch);
7878
if (decode_res != 0) {
7979
UtilityFunctions::printerr(vformat("%s: Failed to decode prompt with error code: %d", __func__, decode_res));
@@ -129,8 +129,6 @@ void LlamaContext::_fulfill_completion(const String &prompt) {
129129
break;
130130
}
131131
}
132-
133-
llama_batch_free(batch);
134132
}
135133

136134
void LlamaContext::set_model(const Ref<LlamaModel> p_model) {
@@ -145,6 +143,9 @@ LlamaContext::~LlamaContext() {
145143
if (ctx) {
146144
llama_free(ctx);
147145
}
146+
147+
llama_batch_free(batch);
148+
148149
if (task_id) {
149150
WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
150151
}

src/llama_context.h

+6-5
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,20 @@ class LlamaContext : public Node {
1212
private:
1313
Ref<LlamaModel> model;
1414
llama_context *ctx = nullptr;
15-
llama_context_params ctx_params = llama_context_default_params();
16-
int task_id;
15+
llama_context_params ctx_params = llama_context_default_params();
16+
llama_batch batch = llama_batch_init(4096, 0, 1);
17+
int task_id;
1718

1819
protected:
1920
static void _bind_methods();
2021

2122
public:
2223
void set_model(const Ref<LlamaModel> model);
2324
Ref<LlamaModel> get_model();
24-
Variant request_completion(const String &prompt);
25-
void _fulfill_completion(const String &prompt);
25+
Variant request_completion(const String &prompt);
26+
void _fulfill_completion(const String &prompt);
2627
virtual void _ready() override;
27-
~LlamaContext();
28+
~LlamaContext();
2829
};
2930
} //namespace godot
3031

0 commit comments

Comments (0)