#include "common.h"
#include "llama.h"
#include "llama_model.h"
+#include <algorithm>
#include <godot_cpp/classes/engine.hpp>
#include <godot_cpp/classes/os.hpp>
#include <godot_cpp/classes/worker_thread_pool.hpp>
#include <godot_cpp/core/class_db.hpp>
+#include <godot_cpp/variant/dictionary.hpp>
#include <godot_cpp/variant/utility_functions.hpp>

using namespace godot;
@@ -15,31 +17,41 @@ void LlamaContext::_bind_methods() {
	ClassDB::bind_method(D_METHOD("get_model"), &LlamaContext::get_model);
	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::OBJECT, "model", PROPERTY_HINT_RESOURCE_TYPE, "LlamaModel"), "set_model", "get_model");

-	ClassDB::bind_method(D_METHOD("get_seed"), &LlamaContext::get_seed);
-	ClassDB::bind_method(D_METHOD("set_seed", "seed"), &LlamaContext::set_seed);
-	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "seed"), "set_seed", "get_seed");
+	ClassDB::bind_method(D_METHOD("get_seed"), &LlamaContext::get_seed);
+	ClassDB::bind_method(D_METHOD("set_seed", "seed"), &LlamaContext::set_seed);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "seed"), "set_seed", "get_seed");

-	ClassDB::bind_method(D_METHOD("get_n_ctx"), &LlamaContext::get_n_ctx);
-	ClassDB::bind_method(D_METHOD("set_n_ctx", "n_ctx"), &LlamaContext::set_n_ctx);
-	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_ctx"), "set_n_ctx", "get_n_ctx");
+	ClassDB::bind_method(D_METHOD("get_temperature"), &LlamaContext::get_temperature);
+	ClassDB::bind_method(D_METHOD("set_temperature", "temperature"), &LlamaContext::set_temperature);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "temperature"), "set_temperature", "get_temperature");

-	ClassDB::bind_method(D_METHOD("get_n_threads"), &LlamaContext::get_n_threads);
-	ClassDB::bind_method(D_METHOD("set_n_threads", "n_threads"), &LlamaContext::set_n_threads);
-	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_threads"), "set_n_threads", "get_n_threads");
+	ClassDB::bind_method(D_METHOD("get_top_p"), &LlamaContext::get_top_p);
+	ClassDB::bind_method(D_METHOD("set_top_p", "top_p"), &LlamaContext::set_top_p);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "top_p"), "set_top_p", "get_top_p");

-	ClassDB::bind_method(D_METHOD("get_n_threads_batch"), &LlamaContext::get_n_threads_batch);
-	ClassDB::bind_method(D_METHOD("set_n_threads_batch", "n_threads_batch"), &LlamaContext::set_n_threads_batch);
-	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_threads_batch"), "set_n_threads_batch", "get_n_threads_batch");
+	ClassDB::bind_method(D_METHOD("get_frequency_penalty"), &LlamaContext::get_frequency_penalty);
+	ClassDB::bind_method(D_METHOD("set_frequency_penalty", "frequency_penalty"), &LlamaContext::set_frequency_penalty);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "frequency_penalty"), "set_frequency_penalty", "get_frequency_penalty");
+
+	ClassDB::bind_method(D_METHOD("get_presence_penalty"), &LlamaContext::get_presence_penalty);
+	ClassDB::bind_method(D_METHOD("set_presence_penalty", "presence_penalty"), &LlamaContext::set_presence_penalty);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "presence_penalty"), "set_presence_penalty", "get_presence_penalty");
+
+	ClassDB::bind_method(D_METHOD("get_n_ctx"), &LlamaContext::get_n_ctx);
+	ClassDB::bind_method(D_METHOD("set_n_ctx", "n_ctx"), &LlamaContext::set_n_ctx);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_ctx"), "set_n_ctx", "get_n_ctx");
+
+	ClassDB::bind_method(D_METHOD("get_n_len"), &LlamaContext::get_n_len);
+	ClassDB::bind_method(D_METHOD("set_n_len", "n_len"), &LlamaContext::set_n_len);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_len"), "set_n_len", "get_n_len");

	ClassDB::bind_method(D_METHOD("request_completion", "prompt"), &LlamaContext::request_completion);
-	ClassDB::bind_method(D_METHOD("_fulfill_completion", "prompt"), &LlamaContext::_fulfill_completion);
+	ClassDB::bind_method(D_METHOD("__thread_loop"), &LlamaContext::__thread_loop);

-	ADD_SIGNAL(MethodInfo("completion_generated", PropertyInfo(Variant::STRING, "completion"), PropertyInfo(Variant::BOOL, "is_final")));
+	ADD_SIGNAL(MethodInfo("completion_generated", PropertyInfo(Variant::DICTIONARY, "chunk")));
}

LlamaContext::LlamaContext() {
-	batch = llama_batch_init(4096, 0, 1);
-
	ctx_params = llama_context_default_params();
	ctx_params.seed = -1;
	ctx_params.n_ctx = 4096;
@@ -60,109 +72,186 @@ void LlamaContext::_ready() {
		return;
	}

+	mutex.instantiate();
+	semaphore.instantiate();
+	thread.instantiate();
+
+	llama_backend_init();
+	llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_DISABLED);
+
	ctx = llama_new_context_with_model(model->model, ctx_params);
	if (ctx == NULL) {
		UtilityFunctions::printerr(vformat("%s: Failed to initialize llama context, null ctx", __func__));
		return;
	}
+
+	sampling_ctx = llama_sampling_init(sampling_params);
+
	UtilityFunctions::print(vformat("%s: Context initialized", __func__));
-}

-PackedStringArray LlamaContext::_get_configuration_warnings() const {
-	PackedStringArray warnings;
-	if (model == NULL) {
-		warnings.push_back("Model resource property not defined");
-	}
-	return warnings;
+	thread->start(callable_mp(this, &LlamaContext::__thread_loop));
}

-Variant LlamaContext::request_completion(const String &prompt) {
-	UtilityFunctions::print(vformat("%s: Requesting completion for prompt: %s", __func__, prompt));
-	if (task_id) {
-		WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
-	}
-	task_id = WorkerThreadPool::get_singleton()->add_task(Callable(this, "_fulfill_completion").bind(prompt));
-	return OK;
-}
+void LlamaContext::__thread_loop() {
+	while (true) {
+		semaphore->wait();

-void LlamaContext::_fulfill_completion(const String &prompt) {
-	UtilityFunctions::print(vformat("%s: Fulfilling completion for prompt: %s", __func__, prompt));
-	std::vector<llama_token> tokens_list;
-	tokens_list = ::llama_tokenize(ctx, std::string(prompt.utf8().get_data()), true);
+		mutex->lock();
+		if (exit_thread) {
+			mutex->unlock();
+			break;
+		}
+		if (completion_requests.size() == 0) {
+			mutex->unlock();
+			continue;
+		}
+		completion_request req = completion_requests.get(0);
+		completion_requests.remove_at(0);
+		mutex->unlock();

-	const int n_len = 128;
-	const int n_ctx = llama_n_ctx(ctx);
-	const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
-	if (n_kv_req > n_ctx) {
-		UtilityFunctions::printerr(vformat("%s: n_kv_req > n_ctx, the required KV cache size is not big enough\neither reduce n_len or increase n_ctx", __func__));
-		return;
-	}
+		UtilityFunctions::print(vformat("%s: Running completion for prompt id: %d", __func__, req.id));

-	for (size_t i = 0; i < tokens_list.size(); i++) {
-		llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
-	}
+		std::vector<llama_token> request_tokens;
+		request_tokens = ::llama_tokenize(ctx, req.prompt.utf8().get_data(), true, true);

-	batch.logits[batch.n_tokens - 1] = true;
+		size_t shared_prefix_idx = 0;
+		auto diff = std::mismatch(context_tokens.begin(), context_tokens.end(), request_tokens.begin(), request_tokens.end());
+		if (diff.first != context_tokens.end()) {
+			shared_prefix_idx = std::distance(context_tokens.begin(), diff.first);
+		} else {
+			shared_prefix_idx = std::min(context_tokens.size(), request_tokens.size());
+		}

-	llama_kv_cache_clear(ctx);
+		bool rm_success = llama_kv_cache_seq_rm(ctx, -1, shared_prefix_idx, -1);
+		if (!rm_success) {
+			UtilityFunctions::printerr(vformat("%s: Failed to remove tokens from kv cache", __func__));
+			Dictionary response;
+			response["id"] = req.id;
+			response["error"] = "Failed to remove tokens from kv cache";
+			call_thread_safe("emit_signal", "completion_generated", response);
+			continue;
+		}
+		context_tokens.erase(context_tokens.begin() + shared_prefix_idx, context_tokens.end());
+		request_tokens.erase(request_tokens.begin(), request_tokens.begin() + shared_prefix_idx);

-	int decode_res = llama_decode(ctx, batch);
-	if (decode_res != 0) {
-		UtilityFunctions::printerr(vformat("%s: Failed to decode prompt with error code: %d", __func__, decode_res));
-		return;
-	}
+		uint batch_size = std::min(ctx_params.n_batch, (uint)request_tokens.size());
+
+		llama_batch batch = llama_batch_init(batch_size, 0, 1);
+
+		// chunk request_tokens into sequences of size batch_size
+		std::vector<std::vector<llama_token>> sequences;
+		for (size_t i = 0; i < request_tokens.size(); i += batch_size) {
+			sequences.push_back(std::vector<llama_token>(request_tokens.begin() + i, request_tokens.begin() + std::min(i + batch_size, request_tokens.size())));
+		}
+
+		printf("Request tokens: \n");
+		for (auto sequence : sequences) {
+			for (auto token : sequence) {
+				printf("%s", llama_token_to_piece(ctx, token).c_str());
+			}
+		}
+		printf("\n");

-	int n_cur = batch.n_tokens;
-	int n_decode = 0;
-	llama_model *llama_model = model->model;
+		int curr_token_pos = context_tokens.size();
+		bool decode_failed = false;

-	while (n_cur <= n_len) {
-		// sample the next token
-		{
-			auto n_vocab = llama_n_vocab(llama_model);
-			auto *logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+		for (size_t i = 0; i < sequences.size(); i++) {
+			llama_batch_clear(batch);

-			std::vector<llama_token_data> candidates;
-			candidates.reserve(n_vocab);
+			std::vector<llama_token> sequence = sequences[i];

-			for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-				candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+			for (size_t j = 0; j < sequence.size(); j++) {
+				llama_batch_add(batch, sequence[j], j + curr_token_pos, { 0 }, false);
+				curr_token_pos++;
			}

-			llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+			if (i == sequences.size() - 1) {
+				batch.logits[batch.n_tokens - 1] = true;
+			}

-			// sample the most likely token
-			const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+			if (llama_decode(ctx, batch) != 0) {
+				decode_failed = true;
+				break;
+			}
+		}

-			// is it an end of stream?
-			if (new_token_id == llama_token_eos(llama_model) || n_cur == n_len) {
-				call_thread_safe("emit_signal", "completion_generated", "\n", true);
+		if (decode_failed) {
+			Dictionary response;
+			response["id"] = req.id;
+			response["error"] = "llama_decode() failed";
+			call_thread_safe("emit_signal", "completion_generated", response);
+			continue;
+		}
+
+		context_tokens.insert(context_tokens.end(), request_tokens.begin(), request_tokens.end());
+
+		while (true) {
+			if (exit_thread) {
+				return;
+			}
+			llama_token new_token_id = llama_sampling_sample(sampling_ctx, ctx, NULL, batch.n_tokens - 1);
+			llama_sampling_accept(sampling_ctx, ctx, new_token_id, false);

+			Dictionary response;
+			response["id"] = req.id;
+
+			context_tokens.push_back(new_token_id);
+
+			if (llama_token_is_eog(model->model, new_token_id) || curr_token_pos == n_len) {
+				response["done"] = true;
+				call_thread_safe("emit_signal", "completion_generated", response);
				break;
			}

-			call_thread_safe("emit_signal", "completion_generated", vformat("%s", llama_token_to_piece(ctx, new_token_id).c_str()), false);
+			response["text"] = llama_token_to_piece(ctx, new_token_id).c_str();
+			response["done"] = false;
+			call_thread_safe("emit_signal", "completion_generated", response);

-			// prepare the next batch
			llama_batch_clear(batch);

-			// push this new token for next evaluation
-			llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+			llama_batch_add(batch, new_token_id, curr_token_pos, { 0 }, true);

-			n_decode += 1;
-		}
+			curr_token_pos++;

-		n_cur += 1;
+			if (llama_decode(ctx, batch) != 0) {
+				decode_failed = true;
+				break;
+			}
+		}

-		// evaluate the current batch with the transformer model
-		int decode_res = llama_decode(ctx, batch);
-		if (decode_res != 0) {
-			UtilityFunctions::printerr(vformat("%s: Failed to decode batch with error code: %d", __func__, decode_res));
-			break;
+		if (decode_failed) {
+			Dictionary response;
+			response["id"] = req.id;
+			response["error"] = "llama_decode() failed";
+			call_thread_safe("emit_signal", "completion_generated", response);
+			continue;
		}
	}
}

+PackedStringArray LlamaContext::_get_configuration_warnings() const {
+	PackedStringArray warnings;
+	if (model == NULL) {
+		warnings.push_back("Model resource property not defined");
+	}
+	return warnings;
+}
+
+int LlamaContext::request_completion(const String &prompt) {
+	int id = request_id++;
+
+	UtilityFunctions::print(vformat("%s: Requesting completion for prompt id: %d", __func__, id));
+
+	mutex->lock();
+	completion_request req = { id, prompt };
+	completion_requests.append(req);
+	mutex->unlock();
+
+	semaphore->post();
+
+	return id;
+}
+
void LlamaContext::set_model(const Ref<LlamaModel> p_model) {
	model = p_model;
}
@@ -184,28 +273,58 @@ void LlamaContext::set_n_ctx(int n_ctx) {
	ctx_params.n_ctx = n_ctx;
}

-int LlamaContext::get_n_threads() {
-	return ctx_params.n_threads;
+int LlamaContext::get_n_len() {
+	return n_len;
}
-void LlamaContext::set_n_threads(int n_threads) {
-	ctx_params.n_threads = n_threads;
+void LlamaContext::set_n_len(int n_len) {
+	this->n_len = n_len;
}

-int LlamaContext::get_n_threads_batch() {
-	return ctx_params.n_threads_batch;
+float LlamaContext::get_temperature() {
+	return sampling_params.temp;
}
-void LlamaContext::set_n_threads_batch(int n_threads_batch) {
-	ctx_params.n_threads_batch = n_threads_batch;
+void LlamaContext::set_temperature(float temperature) {
+	sampling_params.temp = temperature;
}

-LlamaContext::~LlamaContext() {
-	if (ctx) {
-		llama_free(ctx);
+float LlamaContext::get_top_p() {
+	return sampling_params.top_p;
+}
+void LlamaContext::set_top_p(float top_p) {
+	sampling_params.top_p = top_p;
+}
+
+float LlamaContext::get_frequency_penalty() {
+	return sampling_params.penalty_freq;
+}
+void LlamaContext::set_frequency_penalty(float frequency_penalty) {
+	sampling_params.penalty_freq = frequency_penalty;
+}
+
+float LlamaContext::get_presence_penalty() {
+	return sampling_params.penalty_present;
+}
+void LlamaContext::set_presence_penalty(float presence_penalty) {
+	sampling_params.penalty_present = presence_penalty;
+}
+
+void LlamaContext::_exit_tree() {
+	if (Engine::get_singleton()->is_editor_hint()) {
+		return;
	}

-	llama_batch_free(batch);
+	mutex->lock();
+	exit_thread = true;
+	mutex->unlock();
+
+	semaphore->post();

-	if (task_id) {
-		WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
+	thread->wait_to_finish();
+
+	if (ctx) {
+		llama_free(ctx);
	}
+
+	llama_sampling_free(sampling_ctx);
+	llama_backend_free();
}
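
Note (not part of the patch): the corresponding llama_context.h changes are not shown in this diff. As a rough, hypothetical sketch only — the member names and types below are inferred from how they are used in the code above, not taken from the real header — the class would need roughly this state for the file to compile:

#include "common.h"
#include "llama.h"
#include "llama_model.h"
#include <godot_cpp/classes/mutex.hpp>
#include <godot_cpp/classes/node.hpp>
#include <godot_cpp/classes/semaphore.hpp>
#include <godot_cpp/classes/thread.hpp>
#include <godot_cpp/templates/vector.hpp>
#include <vector>

using namespace godot;

// One queued prompt: request_completion() appends these, __thread_loop() drains them.
struct completion_request {
	int id;
	String prompt;
};

class LlamaContext : public Node {
	GDCLASS(LlamaContext, Node)

private:
	Ref<LlamaModel> model;

	llama_context *ctx = nullptr;
	llama_context_params ctx_params;
	llama_sampling_params sampling_params;
	llama_sampling_context *sampling_ctx = nullptr;

	int n_len; // generation cap compared against curr_token_pos; default not visible in this diff
	int request_id = 0; // id handed back by request_completion()
	std::vector<llama_token> context_tokens; // tokens currently resident in the KV cache

	Vector<completion_request> completion_requests; // pending prompts, guarded by mutex
	Ref<Mutex> mutex;
	Ref<Semaphore> semaphore;
	Ref<Thread> thread;
	bool exit_thread = false;

	// _bind_methods(), _ready(), _exit_tree(), __thread_loop(), request_completion(),
	// _get_configuration_warnings() and the bound getters/setters are declared here as well.
};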