
Commit 4e149e2

start refactor
1 parent a243391 commit 4e149e2

File tree

5 files changed: +146 -134 lines changed


build.zig (-2)

@@ -251,8 +251,6 @@ pub fn build(b: *std.Build) !void {
         extension.linkFramework("MetalKit");
         extension.linkFramework("Foundation");
         extension.linkFramework("Accelerate");
-        // b.installFile("llama.cpp/ggml-metal.metal", b.pathJoin(&.{ std.fs.path.basename(b.lib_dir), "ggml-metal.metal" }));
-        // b.installFile("llama.cpp/ggml-common.h", b.pathJoin(&.{ std.fs.path.basename(b.lib_dir), "ggml-common.h" }));
     } else {
         if (target.result.os.tag == .windows) {
             const vk_path = b.graph.env_map.get("VK_SDK_PATH") orelse @panic("VK_SDK_PATH not set");

godot/main.gd (+6 -6)

@@ -9,19 +9,19 @@ func _on_button_pressed():
 
 func handle_submit():
 	print(input.text)
-	Llama.request_completion(input.text)
+	Llama.prompt(input.text)
 
 	input.clear()
 	input.editable = false
 	submit_button.disabled = true
 	output.text = "..."
 
-	var completion = await Llama.completion_generated
+	var completion = await Llama.prompt_generated
 	output.text = ""
-	while !completion[1]:
-		print(completion[0])
-		output.text += completion[0]
-		completion = await Llama.completion_generated
+	# while !completion[1]:
+	# 	print(completion[0])
+	# 	output.text += completion[0]
+	# 	completion = await Llama.prompt_generated
 
 	input.editable = true
 	submit_button.disabled = false

llama.cpp (submodule pointer updated)

src/llama_context.cpp (+105 -110)

@@ -4,7 +4,8 @@
 #include "llama_model.h"
 #include <godot_cpp/classes/engine.hpp>
 #include <godot_cpp/classes/os.hpp>
-#include <godot_cpp/classes/worker_thread_pool.hpp>
+#include <godot_cpp/classes/semaphore.hpp>
+#include <godot_cpp/classes/thread.hpp>
 #include <godot_cpp/core/class_db.hpp>
 #include <godot_cpp/variant/utility_functions.hpp>
 
@@ -15,29 +16,42 @@ void LlamaContext::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("get_model"), &LlamaContext::get_model);
 	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::OBJECT, "model", PROPERTY_HINT_RESOURCE_TYPE, "LlamaModel"), "set_model", "get_model");
 
-	ClassDB::bind_method(D_METHOD("get_seed"), &LlamaContext::get_seed);
-	ClassDB::bind_method(D_METHOD("set_seed", "seed"), &LlamaContext::set_seed);
-	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "seed"), "set_seed", "get_seed");
+	ClassDB::bind_method(D_METHOD("get_seed"), &LlamaContext::get_seed);
+	ClassDB::bind_method(D_METHOD("set_seed", "seed"), &LlamaContext::set_seed);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "seed"), "set_seed", "get_seed");
 
-	ClassDB::bind_method(D_METHOD("get_n_ctx"), &LlamaContext::get_n_ctx);
-	ClassDB::bind_method(D_METHOD("set_n_ctx", "n_ctx"), &LlamaContext::set_n_ctx);
-	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_ctx"), "set_n_ctx", "get_n_ctx");
+	ClassDB::bind_method(D_METHOD("get_n_ctx"), &LlamaContext::get_n_ctx);
+	ClassDB::bind_method(D_METHOD("set_n_ctx", "n_ctx"), &LlamaContext::set_n_ctx);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_ctx"), "set_n_ctx", "get_n_ctx");
 
-	ClassDB::bind_method(D_METHOD("get_n_threads"), &LlamaContext::get_n_threads);
-	ClassDB::bind_method(D_METHOD("set_n_threads", "n_threads"), &LlamaContext::set_n_threads);
-	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_threads"), "set_n_threads", "get_n_threads");
+	ClassDB::bind_method(D_METHOD("get_temperature"), &LlamaContext::get_temperature);
+	ClassDB::bind_method(D_METHOD("set_temperature", "temperature"), &LlamaContext::set_temperature);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "temperature"), "set_temperature", "get_temperature");
 
-	ClassDB::bind_method(D_METHOD("get_n_threads_batch"), &LlamaContext::get_n_threads_batch);
-	ClassDB::bind_method(D_METHOD("set_n_threads_batch", "n_threads_batch"), &LlamaContext::set_n_threads_batch);
-	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_threads_batch"), "set_n_threads_batch", "get_n_threads_batch");
+	ClassDB::bind_method(D_METHOD("get_top_p"), &LlamaContext::get_top_p);
+	ClassDB::bind_method(D_METHOD("set_top_p", "top_p"), &LlamaContext::set_top_p);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "top_p"), "set_top_p", "get_top_p");
 
-	ClassDB::bind_method(D_METHOD("request_completion", "prompt"), &LlamaContext::request_completion);
-	ClassDB::bind_method(D_METHOD("_fulfill_completion", "prompt"), &LlamaContext::_fulfill_completion);
+	ClassDB::bind_method(D_METHOD("get_top_k"), &LlamaContext::get_top_k);
+	ClassDB::bind_method(D_METHOD("set_top_k", "top_k"), &LlamaContext::set_top_k);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "top_k"), "set_top_k", "get_top_k");
 
-	ADD_SIGNAL(MethodInfo("completion_generated", PropertyInfo(Variant::STRING, "completion"), PropertyInfo(Variant::BOOL, "is_final")));
+	ClassDB::bind_method(D_METHOD("get_presence_penalty"), &LlamaContext::get_presence_penalty);
+	ClassDB::bind_method(D_METHOD("set_presence_penalty", "presence_penalty"), &LlamaContext::set_presence_penalty);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "presence_penalty"), "set_presence_penalty", "get_presence_penalty");
+
+	ClassDB::bind_method(D_METHOD("get_frequency_penalty"), &LlamaContext::get_frequency_penalty);
+	ClassDB::bind_method(D_METHOD("set_frequency_penalty", "frequency_penalty"), &LlamaContext::set_frequency_penalty);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "frequency_penalty"), "set_frequency_penalty", "get_frequency_penalty");
+
+	ClassDB::bind_method(D_METHOD("prompt", "prompt", "max_new_tokens"), &LlamaContext::prompt);
+	ClassDB::bind_method(D_METHOD("_run_prompts"), &LlamaContext::_run_prompts);
+
+	ADD_SIGNAL(MethodInfo("prompt_completion", PropertyInfo(Variant::STRING, "prompt_id"), PropertyInfo(Variant::STRING, "completion"), PropertyInfo(Variant::BOOL, "is_final")));
 }
 
-LlamaContext::LlamaContext() {
+LlamaContext::LlamaContext() :
+		sampling_params() {
 	batch = llama_batch_init(4096, 0, 1);
 
 	ctx_params = llama_context_default_params();
@@ -66,100 +80,57 @@ void LlamaContext::_ready() {
 		return;
 	}
 	UtilityFunctions::print(vformat("%s: Context initialized", __func__));
-}
-
-PackedStringArray LlamaContext::_get_configuration_warnings() const {
-	PackedStringArray warnings;
-	if (model == NULL) {
-		warnings.push_back("Model resource property not defined");
-	}
-	return warnings;
-}
-
-Variant LlamaContext::request_completion(const String &prompt) {
-	UtilityFunctions::print(vformat("%s: Requesting completion for prompt: %s", __func__, prompt));
-	if (task_id) {
-		WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
-	}
-	task_id = WorkerThreadPool::get_singleton()->add_task(Callable(this, "_fulfill_completion").bind(prompt));
-	return OK;
-}
-
-void LlamaContext::_fulfill_completion(const String &prompt) {
-	UtilityFunctions::print(vformat("%s: Fulfilling completion for prompt: %s", __func__, prompt));
-	std::vector<llama_token> tokens_list;
-	tokens_list = ::llama_tokenize(ctx, std::string(prompt.utf8().get_data()), true);
 
-	const int n_len = 128;
-	const int n_ctx = llama_n_ctx(ctx);
-	const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
-	if (n_kv_req > n_ctx) {
-		UtilityFunctions::printerr(vformat("%s: n_kv_req > n_ctx, the required KV cache size is not big enough\neither reduce n_len or increase n_ctx", __func__));
-		return;
-	}
-
-	for (size_t i = 0; i < tokens_list.size(); i++) {
-		llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
-	}
+	sampling_ctx = llama_sampling_init(sampling_params);
 
-	batch.logits[batch.n_tokens - 1] = true;
+	semaphore.instantiate();
+	mutex.instantiate();
+	worker_thread.instantiate();
 
-	llama_kv_cache_clear(ctx);
+	worker_thread->start(Callable(this, "_run_prompts"));
+}
 
-	int decode_res = llama_decode(ctx, batch);
-	if (decode_res != 0) {
-		UtilityFunctions::printerr(vformat("%s: Failed to decode prompt with error code: %d", __func__, decode_res));
-		return;
+PackedStringArray LlamaContext::_get_configuration_warnings() const {
+	PackedStringArray warnings;
+	if (model == NULL) {
+		warnings.push_back("Model resource property not defined");
 	}
+	return warnings;
+}
 
-	int n_cur = batch.n_tokens;
-	int n_decode = 0;
-	llama_model *llama_model = model->model;
-
-	while (n_cur <= n_len) {
-		// sample the next token
-		{
-			auto n_vocab = llama_n_vocab(llama_model);
-			auto *logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
-
-			std::vector<llama_token_data> candidates;
-			candidates.reserve(n_vocab);
-
-			for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-				candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-			}
-
-			llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-			// sample the most likely token
-			const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
-
-			// is it an end of stream?
-			if (new_token_id == llama_token_eos(llama_model) || n_cur == n_len) {
-				call_thread_safe("emit_signal", "completion_generated", "\n", true);
+int LlamaContext::prompt(const String &prompt, int max_new_tokens) {
+	mutex->lock();
+	int prompt_id = n_prompts++;
+	prompts.push_back(prompt);
+	mutex->unlock();
 
-				break;
-			}
+	semaphore->post();
 
-			call_thread_safe("emit_signal", "completion_generated", vformat("%s", llama_token_to_piece(ctx, new_token_id).c_str()), false);
+	UtilityFunctions::print(vformat("New prompt %d: %s", prompt_id, prompt));
 
-			// prepare the next batch
-			llama_batch_clear(batch);
+	return prompt_id;
+}
 
-			// push this new token for next evaluation
-			llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+void LlamaContext::_run_prompts() {
+	while (true) {
+		semaphore->wait();
 
-			n_decode += 1;
+		mutex->lock();
+		if (should_exit) {
+			mutex->unlock();
+			break;
		}
+		if (prompts.is_empty()) {
+			mutex->unlock();
+			continue;
 		}
+		String prompt = prompts.get(0);
+		prompts.remove_at(0);
+		mutex->unlock();
 
-		n_cur += 1;
+		UtilityFunctions::print(vformat("Running prompt %s", prompt));
 
-		// evaluate the current batch with the transformer model
-		int decode_res = llama_decode(ctx, batch);
-		if (decode_res != 0) {
-			UtilityFunctions::printerr(vformat("%s: Failed to decode batch with error code: %d", __func__, decode_res));
-			break;
-		}
+		OS::get_singleton()->delay_msec(2000);
 	}
 }
@@ -184,28 +155,52 @@ void LlamaContext::set_n_ctx(int n_ctx) {
 	ctx_params.n_ctx = n_ctx;
 }
 
-int LlamaContext::get_n_threads() {
-	return ctx_params.n_threads;
+float LlamaContext::get_temperature() {
+	return sampling_params.temp;
 }
-void LlamaContext::set_n_threads(int n_threads) {
-	ctx_params.n_threads = n_threads;
+void LlamaContext::set_temperature(float temperature) {
+	sampling_params.temp = temperature;
+}
+
+float LlamaContext::get_top_p() {
+	return sampling_params.top_p;
+}
+void LlamaContext::set_top_p(float top_p) {
+	sampling_params.top_p = top_p;
+}
+
+int LlamaContext::get_top_k() {
+	return sampling_params.top_k;
+}
+void LlamaContext::set_top_k(int top_k) {
+	sampling_params.top_k = top_k;
 }
 
-int LlamaContext::get_n_threads_batch() {
-	return ctx_params.n_threads_batch;
+float LlamaContext::get_presence_penalty() {
+	return sampling_params.penalty_present;
 }
-void LlamaContext::set_n_threads_batch(int n_threads_batch) {
-	ctx_params.n_threads_batch = n_threads_batch;
+void LlamaContext::set_presence_penalty(float presence_penalty) {
+	sampling_params.penalty_present = presence_penalty;
+}
+
+float LlamaContext::get_frequency_penalty() {
+	return sampling_params.penalty_freq;
+}
+void LlamaContext::set_frequency_penalty(float frequency_penalty) {
+	sampling_params.penalty_freq = frequency_penalty;
 }
 
 LlamaContext::~LlamaContext() {
+	llama_batch_free(batch);
+	llama_sampling_free(sampling_ctx);
 	if (ctx) {
 		llama_free(ctx);
 	}
 
-	llama_batch_free(batch);
-
-	if (task_id) {
-		WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
-	}
+	mutex->lock();
+	prompts.clear();
+	should_exit = true;
+	mutex->unlock();
+	semaphore->post();
+	worker_thread->wait_to_finish();
 }
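
For now the worker only drains the queue and sleeps; the generation code deleted from _fulfill_completion still has to be ported into _run_prompts. A rough sketch of how that port could look in place of the delay_msec(2000) placeholder (not code from this commit): it reuses the removed tokenize/decode steps but samples through the new sampling_ctx via llama_sampling_sample / llama_sampling_accept / llama_sampling_reset, the common-library helpers that pair with llama_sampling_init, and it assumes the queue is extended to carry prompt_id and max_new_tokens, which prompt() currently drops.

		// Hypothetical sketch only: runs inside the while (true) loop of _run_prompts,
		// assuming prompt_id and max_new_tokens were dequeued alongside the prompt text.
		std::vector<llama_token> tokens_list = ::llama_tokenize(ctx, std::string(prompt.utf8().get_data()), true);

		llama_batch_clear(batch);
		for (size_t i = 0; i < tokens_list.size(); i++) {
			llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
		}
		batch.logits[batch.n_tokens - 1] = true; // request logits for the last prompt token

		llama_kv_cache_clear(ctx);
		if (llama_decode(ctx, batch) != 0) {
			continue; // prompt decode failed; drop it and wait for the next one
		}

		int n_cur = batch.n_tokens;
		const int n_len = n_cur + max_new_tokens;
		while (n_cur <= n_len) {
			// sample with the configured temperature/top_p/top_k/penalties
			const llama_token new_token_id = llama_sampling_sample(sampling_ctx, ctx, nullptr);
			llama_sampling_accept(sampling_ctx, ctx, new_token_id, false);

			if (new_token_id == llama_token_eos(model->model) || n_cur == n_len) {
				call_thread_safe("emit_signal", "prompt_completion", vformat("%d", prompt_id), "\n", true);
				break;
			}
			call_thread_safe("emit_signal", "prompt_completion", vformat("%d", prompt_id), vformat("%s", llama_token_to_piece(ctx, new_token_id).c_str()), false);

			// feed the sampled token back in for the next decode step
			llama_batch_clear(batch);
			llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
			n_cur += 1;
			if (llama_decode(ctx, batch) != 0) {
				break;
			}
		}
		llama_sampling_reset(sampling_ctx); // clear penalty history between prompts

As in the old implementation, routing the signal through call_thread_safe keeps delivery on the main thread even though generation runs on the worker.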

src/llama_context.h (+34 -15)

@@ -2,19 +2,31 @@
 #define LLAMA_CONTEXT_H
 
 #include "llama.h"
+#include "common.h"
 #include "llama_model.h"
+#include <godot_cpp/classes/mutex.hpp>
 #include <godot_cpp/classes/node.hpp>
-
+#include <godot_cpp/classes/semaphore.hpp>
+#include <godot_cpp/classes/thread.hpp>
+#include <godot_cpp/templates/vector.hpp>
 namespace godot {
+
 class LlamaContext : public Node {
 	GDCLASS(LlamaContext, Node)
 
private:
 	Ref<LlamaModel> model;
-	llama_context *ctx = nullptr;
+	Ref<Thread> worker_thread;
+	Ref<Semaphore> semaphore;
+	Ref<Mutex> mutex;
+	bool should_exit = false;
 	llama_context_params ctx_params;
+	llama_sampling_params sampling_params;
+	llama_context *ctx = nullptr;
+	llama_sampling_context *sampling_ctx = nullptr;
 	llama_batch batch;
-	int task_id;
+	Vector<String> prompts;
+	int n_prompts = 0;
 
protected:
 	static void _bind_methods();
@@ -23,21 +35,28 @@ class LlamaContext : public Node {
 	void set_model(const Ref<LlamaModel> model);
 	Ref<LlamaModel> get_model();
 
-	Variant request_completion(const String &prompt);
-	void _fulfill_completion(const String &prompt);
+	int prompt(const String &prompt, int max_new_tokens);
+	void _run_prompts();
+
+	int get_seed();
+	void set_seed(int seed);
+	int get_n_ctx();
+	void set_n_ctx(int n_ctx);
 
-	int get_seed();
-	void set_seed(int seed);
-	int get_n_ctx();
-	void set_n_ctx(int n_ctx);
-	int get_n_threads();
-	void set_n_threads(int n_threads);
-	int get_n_threads_batch();
-	void set_n_threads_batch(int n_threads_batch);
+	float get_temperature();
+	void set_temperature(float temperature);
+	float get_top_p();
+	void set_top_p(float top_p);
+	int get_top_k();
+	void set_top_k(int top_k);
+	float get_presence_penalty();
+	void set_presence_penalty(float presence_penalty);
+	float get_frequency_penalty();
+	void set_frequency_penalty(float frequency_penalty);
 
-	virtual PackedStringArray _get_configuration_warnings() const override;
+	virtual PackedStringArray _get_configuration_warnings() const override;
 	virtual void _ready() override;
-	LlamaContext();
+	LlamaContext();
 	~LlamaContext();
 };
 } //namespace godot
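
With the header in this shape, the caller-facing contract becomes: set the sampling properties, call prompt(), and listen for prompt_completion, with one long-lived worker replacing the per-request WorkerThreadPool tasks. A minimal hypothetical C++ consumer, for illustration only (the ChatUI class, the "Llama" node path, and the handler are invented, not part of this repo):

// Hypothetical consumer of the refactored API; assumes a LlamaContext child node named "Llama".
#include "llama_context.h"
#include <godot_cpp/core/class_db.hpp>

using namespace godot;

class ChatUI : public Node {
	GDCLASS(ChatUI, Node)

protected:
	static void _bind_methods() {
		ClassDB::bind_method(D_METHOD("_on_prompt_completion", "prompt_id", "completion", "is_final"), &ChatUI::_on_prompt_completion);
	}

public:
	void _ready() override {
		LlamaContext *llama = get_node<LlamaContext>("Llama");
		llama->set_temperature(0.8f);
		llama->connect("prompt_completion", Callable(this, "_on_prompt_completion"));
		// returns a queue id immediately; generation happens on the worker thread
		int id = llama->prompt("Once upon a time", 128);
		UtilityFunctions::print(vformat("queued prompt %d", id));
	}

	void _on_prompt_completion(const String &prompt_id, const String &completion, bool is_final) {
		// append streamed text; is_final == true marks the end of this prompt's output
	}
};

The GDScript in main.gd above follows the same submit-and-await pattern from script.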
