#include "llama_context.h"
+#include "common.h"
#include "llama.h"
#include "llama_model.h"
#include <godot_cpp/classes/engine.hpp>
#include <godot_cpp/classes/os.hpp>
+#include <godot_cpp/classes/worker_thread_pool.hpp>
#include <godot_cpp/core/class_db.hpp>
#include <godot_cpp/variant/utility_functions.hpp>

using namespace godot;

-void LlamaContext::set_model(const Ref<LlamaModel> p_model) {
-	model = p_model;
-}
-
-Ref<LlamaModel> LlamaContext::get_model() {
-	return model;
+void LlamaContext::_bind_methods() {
+	ClassDB::bind_method(D_METHOD("set_model", "model"), &LlamaContext::set_model);
+	ClassDB::bind_method(D_METHOD("get_model"), &LlamaContext::get_model);
+	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::OBJECT, "model", PROPERTY_HINT_RESOURCE_TYPE, "LlamaModel"), "set_model", "get_model");
+	ClassDB::bind_method(D_METHOD("request_completion", "prompt"), &LlamaContext::request_completion);
+	ClassDB::bind_method(D_METHOD("_fulfill_completion", "prompt"), &LlamaContext::_fulfill_completion);
+	ADD_SIGNAL(MethodInfo("completion_generated", PropertyInfo(Variant::STRING, "completion"), PropertyInfo(Variant::BOOL, "is_final")));
}

void LlamaContext::_ready() {
@@ -40,14 +43,103 @@ void LlamaContext::_ready() {
	UtilityFunctions::print(vformat("%s: Context initialized", __func__));
}

+Variant LlamaContext::request_completion(const String &prompt) {
+	UtilityFunctions::print(vformat("%s: Requesting completion for prompt: %s", __func__, prompt));
+	if (task_id) {
+		WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
+	}
+	task_id = WorkerThreadPool::get_singleton()->add_task(Callable(this, "_fulfill_completion").bind(prompt));
+	return OK;
+}
+
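+// Worker-thread entry point: evaluates the prompt, then greedily samples up to n_len
+// tokens, streaming each decoded piece back via the completion_generated signal.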
+void LlamaContext::_fulfill_completion(const String &prompt) {
+	UtilityFunctions::print(vformat("%s: Fulfilling completion for prompt: %s", __func__, prompt));
+	std::vector<llama_token> tokens_list;
+	tokens_list = ::llama_tokenize(ctx, std::string(prompt.utf8().get_data()), true);
+
+	const int n_len = 128;
+	const int n_ctx = llama_n_ctx(ctx);
+	const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
+	if (n_kv_req > n_ctx) {
+		UtilityFunctions::printerr(vformat("%s: n_kv_req > n_ctx, the required KV cache size is not big enough\neither reduce n_len or increase n_ctx", __func__));
+		return;
+	}
+
+	for (size_t i = 0; i < tokens_list.size(); i++) {
+		llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+	}
+	batch.logits[batch.n_tokens - 1] = true;
+
+	int decode_res = llama_decode(ctx, batch);
+	if (decode_res != 0) {
+		UtilityFunctions::printerr(vformat("%s: Failed to decode prompt with error code: %d", __func__, decode_res));
+		return;
+	}
+
+	int n_cur = batch.n_tokens;
+	int n_decode = 0;
+	llama_model *llama_model = model->model;
+	while (n_cur <= n_len) {
+		// sample the next token
+		{
+			auto n_vocab = llama_n_vocab(llama_model);
+			auto *logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+
+			std::vector<llama_token_data> candidates;
+			candidates.reserve(n_vocab);
+
+			for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+				candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+			}
+
+			llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+			// sample the most likely token
+			const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+
+			// is it an end of stream?
+			if (new_token_id == llama_token_eos(llama_model) || n_cur == n_len) {
+				call_thread_safe("emit_signal", "completion_generated", "\n", true);
+
+				break;
+			}
+
+			call_thread_safe("emit_signal", "completion_generated", vformat("%s", llama_token_to_piece(ctx, new_token_id).c_str()), false);
+
+			// prepare the next batch
+			llama_batch_clear(batch);
+
+			// push this new token for next evaluation
+			llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+
+			n_decode += 1;
+		}
+
+		n_cur += 1;
+
+		// evaluate the current batch with the transformer model
+		int decode_res = llama_decode(ctx, batch);
+		if (decode_res != 0) {
+			UtilityFunctions::printerr(vformat("%s: Failed to decode batch with error code: %d", __func__, decode_res));
+			return;
+		}
+	}
+}
+
+void LlamaContext::set_model(const Ref<LlamaModel> p_model) {
+	model = p_model;
+}
+
+Ref<LlamaModel> LlamaContext::get_model() {
+	return model;
+}
+
LlamaContext::~LlamaContext() {
	if (ctx) {
		llama_free(ctx);
	}
-}
-
-void LlamaContext::_bind_methods() {
-	ClassDB::bind_method(D_METHOD("set_model", "model"), &LlamaContext::set_model);
-	ClassDB::bind_method(D_METHOD("get_model"), &LlamaContext::get_model);
-	ClassDB::add_property("LlamaContext", PropertyInfo(Variant::OBJECT, "model", PROPERTY_HINT_RESOURCE_TYPE, "LlamaModel"), "set_model", "get_model");
+	llama_batch_free(batch);
+	if (task_id) {
+		WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
+	}
}
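
For context, a minimal sketch of the `llama_context.h` declarations this diff appears to rely on. The header is not part of this hunk, so the base class, member types, and defaults below are inferred from the .cpp and should be read as assumptions, not the actual header:

```cpp
// Hypothetical llama_context.h outline (names inferred from the diff above).
#ifndef LLAMA_CONTEXT_H
#define LLAMA_CONTEXT_H

#include "llama.h"
#include "llama_model.h"

#include <godot_cpp/classes/node.hpp>

namespace godot {

class LlamaContext : public Node { // base class assumed; the real header may differ
	GDCLASS(LlamaContext, Node)

private:
	Ref<LlamaModel> model;
	llama_context *ctx = nullptr;
	llama_batch batch;   // created in _ready(), released with llama_batch_free() in the destructor
	int64_t task_id = 0; // WorkerThreadPool task handle; exact type is an assumption

protected:
	static void _bind_methods();

public:
	void set_model(const Ref<LlamaModel> p_model);
	Ref<LlamaModel> get_model();
	Variant request_completion(const String &prompt);
	void _fulfill_completion(const String &prompt);
	void _ready() override;
	~LlamaContext();
};

} // namespace godot

#endif
```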