
Commit e0e6291

finally fix fatal logic flaw

committed Jun 17, 2024 · 1 parent 154bdcb · commit e0e6291

9 files changed: +103 -41 lines changed
 

.gitignore (+1 -1)

@@ -49,7 +49,7 @@ compile_commands.json
 .vscode/*
 !.vscode/extensions.json
 
-zig-cache
+.zig-cache
 zig-out
 *.gguf

README.md (+60 -1)

@@ -13,5 +13,64 @@ Run large language models in [Godot](https://godotengine.org). Powered by [llama.
 ![GitHub last commit](https://img.shields.io/github/last-commit/hazelnutcloud/godot-llama-cpp)
 ![GitHub License](https://img.shields.io/github/license/hazelnutcloud/godot-llama-cpp)
 
-
 </div>
+
+## Overview
+
+This library aims to provide a high-level interface to run large language models in Godot, following Godot's node-based design principles.
+
+```gdscript
+@onready var llama_context = %LlamaContext
+
+var messages = [
+    { "sender": "system", "text": "You are a pirate chatbot who always responds in pirate speak!" },
+    { "sender": "user", "text": "Who are you?" }
+]
+var prompt = ChatFormatter.apply("llama3", messages)
+var completion_id = llama_context.request_completion(prompt)
+
+while (true):
+    var response = await llama_context.completion_generated
+    print(response["text"])
+
+    if response["done"]: break
+```
+
+## Features
+
+- Chat formatter for:
+  - [x] Llama3
+  - [x] Mistral
+  - [ ] More to come!
+- Compute backend builds:
+  - [x] Metal
+  - [x] Vulkan
+  - [ ] CUDA
+- Asynchronous completion generation
+- Support any language model that llama.cpp supports in GGUF format
+- GGUF files are Godot resources
+
+## Building & Installation
+
+1. Download zig v0.13.0 from https://ziglang.org/download/
+2. Clone the repository:
+   ```bash
+   git clone --recurse-submodules https://github.com/hazelnutcloud/godot-llama-cpp.git
+   ```
+3. Copy the `godot-llama-cpp` addon folder in `godot/addons` to your Godot project's `addons` folder.
+   ```bash
+   cp -r godot-llama-cpp/godot/addons/godot-llama-cpp <your_project>/addons
+   ```
+4. Build the extension and install it in your Godot project:
+   ```bash
+   cd godot-llama-cpp
+   zig build --prefix <your_project>/addons/godot-llama-cpp
+   ```
+5. Enable the plugin in your Godot project settings.
+6. Add the `LlamaContext` node to your scene.
+7. Run your Godot project.
+8. Enjoy!
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE.md) file for details.

build.zig (+16 -17)

@@ -18,16 +18,15 @@ pub fn build(b: *std.Build) !void {
         .optimize = optimize,
     });
     plugin.addCSourceFiles(.{ .files = try findFilesRecursive(b, "src/", &cfiles_exts) });
-    plugin.addIncludePath(.{ .path = "src/" });
-    plugin.addIncludePath(.{ .path = "godot_cpp/gdextension/" });
-    plugin.addIncludePath(.{ .path = "godot_cpp/include/" });
-    plugin.addIncludePath(.{ .path = "godot_cpp/gen/include" });
-    plugin.addIncludePath(.{ .path = "llama.cpp" });
-    plugin.addIncludePath(.{ .path = "llama.cpp/common" });
+    plugin.addIncludePath(.{ .src_path = .{ .owner = b, .sub_path = "src/" } });
+    plugin.addIncludePath(.{ .src_path = .{ .owner = b, .sub_path = "godot_cpp/gdextension/" } });
+    plugin.addIncludePath(.{ .src_path = .{ .owner = b, .sub_path = "godot_cpp/include/" } });
+    plugin.addIncludePath(.{ .src_path = .{ .owner = b, .sub_path = "godot_cpp/gen/include" } });
+    plugin.addIncludePath(.{ .src_path = .{ .owner = b, .sub_path = "llama.cpp" } });
+    plugin.addIncludePath(.{ .src_path = .{ .owner = b, .sub_path = "llama.cpp/common" } });
     plugin.linkLibrary(lib_llama_cpp);
     plugin.linkLibrary(lib_godot_cpp);
 
-    b.lib_dir = "./godot/addons/godot-llama-cpp/lib";
     b.installArtifact(plugin);
 }
 
@@ -50,7 +49,7 @@ fn build_lib_godot_cpp(params: BuildParams) !*std.Build.Step.Compile {
     b.build_root.handle.access("godot_cpp/gen", .{}) catch |e| {
         switch (e) {
             error.FileNotFound => {
-                _ = try std.ChildProcess.run(.{
+                _ = try std.process.Child.run(.{
                     .allocator = b.allocator,
                     .argv = &.{ "python", "binding_generator.py", "godot_cpp/gdextension/extension_api.json", "godot_cpp" },
                     .cwd_dir = b.build_root.handle,
@@ -60,9 +59,9 @@ fn build_lib_godot_cpp(params: BuildParams) !*std.Build.Step.Compile {
         }
     };
     lib_godot.linkLibCpp();
-    lib_godot.addIncludePath(.{ .path = "godot_cpp/gdextension/" });
-    lib_godot.addIncludePath(.{ .path = "godot_cpp/include/" });
-    lib_godot.addIncludePath(.{ .path = "godot_cpp/gen/include" });
+    lib_godot.addIncludePath(.{ .src_path = .{ .owner = b, .sub_path = "godot_cpp/gdextension/" } });
+    lib_godot.addIncludePath(.{ .src_path = .{ .owner = b, .sub_path = "godot_cpp/include/" } });
+    lib_godot.addIncludePath(.{ .src_path = .{ .owner = b, .sub_path = "godot_cpp/gen/include" } });
     const lib_godot_sources = try findFilesRecursive(b, "godot_cpp/src", &cfiles_exts);
     const lib_godot_gen_sources = try findFilesRecursive(b, "godot_cpp/gen/src", &cfiles_exts);
     lib_godot.addCSourceFiles(.{ .files = lib_godot_gen_sources, .flags = &.{ "-std=c++17", "-fno-exceptions" } });
@@ -77,9 +76,9 @@ fn build_lib_llama_cpp(params: BuildParams) !*std.Build.Step.Compile {
     const optimize = params.optimize;
     const zig_triple = try target.result.zigTriple(b.allocator);
 
-    const commit_hash = try std.ChildProcess.run(.{ .allocator = b.allocator, .argv = &.{ "git", "rev-parse", "HEAD" }, .cwd = b.pathFromRoot("llama.cpp") });
+    const commit_hash = try std.process.Child.run(.{ .allocator = b.allocator, .argv = &.{ "git", "rev-parse", "HEAD" }, .cwd = b.pathFromRoot("llama.cpp") });
     const zig_version = builtin.zig_version_string;
-    try b.build_root.handle.writeFile2(.{ .sub_path = "llama.cpp/common/build-info.cpp", .data = b.fmt(
+    try b.build_root.handle.writeFile(.{ .sub_path = "llama.cpp/common/build-info.cpp", .data = b.fmt(
         \\int LLAMA_BUILD_NUMBER = {};
         \\char const *LLAMA_COMMIT = "{s}";
         \\char const *LLAMA_COMPILER = "Zig {s}";
@@ -108,13 +107,13 @@ fn build_lib_llama_cpp(params: BuildParams) !*std.Build.Step.Compile {
     const expand_metal = b.addExecutable(.{
         .name = "expand_metal",
         .target = target,
-        .root_source_file = .{ .path = "tools/expand_metal.zig" },
+        .root_source_file = .{ .src_path = .{ .owner = b, .sub_path = "tools/expand_metal.zig" } },
     });
     var run_expand_metal = b.addRunArtifact(expand_metal);
     run_expand_metal.addArg("--metal-file");
-    run_expand_metal.addFileArg(.{ .path = "llama.cpp/ggml-metal.metal" });
+    run_expand_metal.addFileArg(.{ .src_path = .{ .owner = b, .sub_path = "llama.cpp/ggml-metal.metal" } });
    run_expand_metal.addArg("--common-file");
-    run_expand_metal.addFileArg(.{ .path = "llama.cpp/ggml-common.h" });
+    run_expand_metal.addFileArg(.{ .src_path = .{ .owner = b, .sub_path = "llama.cpp/ggml-common.h" } });
     run_expand_metal.addArg("--output-file");
     const metal_expanded = run_expand_metal.addOutputFileArg("ggml-metal.metal");
     const install_metal = b.addInstallFileWithDir(metal_expanded, .lib, "ggml-metal.metal");
@@ -173,7 +172,7 @@ const ObjBuilder = struct {
         const obj = self.b.addObject(.{ .name = params.name, .target = self.target, .optimize = self.optimize });
         obj.addCSourceFiles(.{ .files = params.sources, .flags = self.flags.items });
         for (self.include_paths) |path| {
-            obj.addIncludePath(.{ .path = path });
+            obj.addIncludePath(.{ .src_path = .{ .owner = self.b, .sub_path = path } });
         }
         obj.linkLibC();
         obj.linkLibCpp();
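Note: the build.zig churn tracks the std.Build API of Zig 0.13.0, which the updated README now asks for. The `.path` variant of `std.Build.LazyPath` is gone in favor of `.src_path` (usually spelled via the `b.path("...")` helper), `std.ChildProcess` is now `std.process.Child`, and `Dir.writeFile2` became `writeFile`. Dropping the `b.lib_dir` override pairs with the README's new `zig build --prefix <your_project>/addons/godot-llama-cpp` install step.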

godot/addons/godot-llama-cpp/plugin.gdextension (+1 -1)

@@ -5,7 +5,7 @@ compatibility_minimum = "4.2"
 
 [libraries]
 
-macos.debug = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp-aarch64-macos-none-ReleaseSafe.dylib"
+macos.debug = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp-aarch64-macos-none-Debug.dylib"
 macos.release = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp-aarch64-macos-none-ReleaseSafe.dylib"
 windows.debug.x86_32 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.windows.template_debug.x86_32.dll"
 windows.release.x86_32 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.windows.template_release.x86_32.dll"

godot/examples/simple/simple.gd (+1 -6)

@@ -9,10 +9,7 @@ func _on_text_edit_submit(input: String) -> void:
 	handle_input(input)
 
 func handle_input(input: String) -> void:
-	#var messages = [{ "sender": "system", "text": "You are a pirate chatbot who always responds in pirate speak!" }]
-
-	#var messages = [{ "sender": "system", "text": "You are a helpful chatbot assistant!" }]
-	var messages = []
+	var messages = [{ "sender": "system", "text": "You are a pirate chatbot who always responds in pirate speak!" }]
 	messages.append_array(messages_container.get_children().filter(func(msg: Message): return msg.include_in_prompt).map(
 		func(msg: Message) -> Dictionary:
 			return { "text": msg.text, "sender": msg.sender }
@@ -35,8 +32,6 @@ func handle_input(input: String) -> void:
 	ai_message.completion_id = completion_id
 	ai_message.pending = true
 	ai_message.grab_focus()
-
-
 
 func _on_llama_context_completion_generated(chunk: Dictionary) -> void:
 	var completion_id = chunk.id

godot/examples/simple/simple.tscn (+2 -3)

@@ -4,7 +4,7 @@
 [ext_resource type="Script" path="res://examples/simple/simple.gd" id="1_sruc3"]
 [ext_resource type="PackedScene" uid="uid://t862t0v8ht2q" path="res://examples/simple/message.tscn" id="2_7iip7"]
 [ext_resource type="Script" path="res://examples/simple/TextEdit.gd" id="2_7usqw"]
-[ext_resource type="LlamaModel" path="res://models/meta-llama-3-8b-instruct.Q5_K_M.gguf" id="5_qov1l"]
+[ext_resource type="LlamaModel" path="res://models/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf" id="5_yssjj"]
 
 [node name="Node" type="Node"]
 script = ExtResource("1_sruc3")
@@ -68,8 +68,7 @@ icon = ExtResource("1_gjsev")
 expand_icon = true
 
 [node name="LlamaContext" type="LlamaContext" parent="."]
-model = ExtResource("5_qov1l")
-temperature = 0.9
+model = ExtResource("5_yssjj")
 unique_name_in_owner = true
 
 [connection signal="submit" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/TextEdit" to="." method="_on_text_edit_submit"]

llama.cpp (submodule pointer updated; no textual diff shown)

src/llama_context.cpp (+20 -10)

@@ -162,9 +162,10 @@ void LlamaContext::__thread_loop() {
 
 		for (size_t j = 0; j < sequence.size(); j++) {
 			llama_batch_add(batch, sequence[j], j + curr_token_pos, { 0 }, false);
-			curr_token_pos++;
 		}
 
+		curr_token_pos += sequence.size();
+
 		if (i == sequences.size() - 1) {
 			batch.logits[batch.n_tokens - 1] = true;
 		}
@@ -175,6 +176,10 @@
 
 	}
 
+	printf("Request tokens: %d\n", request_tokens.size());
+	printf("Batch tokens: %d\n", batch.n_tokens);
+	printf("Current token pos: %d\n", curr_token_pos);
+
 	if (decode_failed) {
 		Dictionary response;
 		response["id"] = req.id;
@@ -197,7 +202,10 @@
 
 		context_tokens.push_back(new_token_id);
 
-		if (llama_token_is_eog(model->model, new_token_id) || curr_token_pos == n_len) {
+		bool eog = llama_token_is_eog(model->model, new_token_id);
+		bool curr_eq_n_len = curr_token_pos == n_len;
+
+		if (eog || curr_eq_n_len) {
 			response["done"] = true;
 			call_thread_safe("emit_signal", "completion_generated", response);
 			break;
@@ -219,6 +227,8 @@
 		}
 	}
 
+	llama_sampling_reset(sampling_ctx);
+
 	if (decode_failed) {
 		Dictionary response;
 		response["id"] = req.id;
@@ -281,31 +291,31 @@ void LlamaContext::set_n_len(int n_len) {
 }
 
 float LlamaContext::get_temperature() {
-    return sampling_params.temp;
+	return sampling_params.temp;
 }
 void LlamaContext::set_temperature(float temperature) {
-    sampling_params.temp = temperature;
+	sampling_params.temp = temperature;
 }
 
 float LlamaContext::get_top_p() {
-    return sampling_params.top_p;
+	return sampling_params.top_p;
 }
 void LlamaContext::set_top_p(float top_p) {
-    sampling_params.top_p = top_p;
+	sampling_params.top_p = top_p;
 }
 
 float LlamaContext::get_frequency_penalty() {
-    return sampling_params.penalty_freq;
+	return sampling_params.penalty_freq;
 }
 void LlamaContext::set_frequency_penalty(float frequency_penalty) {
-    sampling_params.penalty_freq = frequency_penalty;
+	sampling_params.penalty_freq = frequency_penalty;
 }
 
 float LlamaContext::get_presence_penalty() {
-    return sampling_params.penalty_present;
+	return sampling_params.penalty_present;
 }
 void LlamaContext::set_presence_penalty(float presence_penalty) {
-    sampling_params.penalty_present = presence_penalty;
+	sampling_params.penalty_present = presence_penalty;
 }
 
 void LlamaContext::_exit_tree() {
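The "fatal logic flaw" named in the commit message is the first hunk above: each token's batch position was computed as `j + curr_token_pos` while `curr_token_pos` was *also* incremented on every iteration, so positions grew by two per token (0, 2, 4, ...) and later sequences were offset even further. Advancing the counter once per sequence, after the inner loop, keeps positions contiguous. A minimal standalone sketch of the corrected accounting (plain C++; `sequences` is a hypothetical stand-in for the tokenized prompt chunks, no llama.cpp calls involved):

```cpp
#include <cstdio>
#include <vector>

int main() {
	// Stand-in for the tokenized prompt sequences assembled in __thread_loop().
	std::vector<std::vector<int>> sequences = { { 10, 11, 12 }, { 20, 21 } };
	size_t curr_token_pos = 0;

	for (size_t i = 0; i < sequences.size(); i++) {
		const std::vector<int> &sequence = sequences[i];
		for (size_t j = 0; j < sequence.size(); j++) {
			// The buggy version also ran curr_token_pos++ here, so the
			// effective position (j + curr_token_pos) grew by two per token:
			// 0, 2, 4, ... instead of the contiguous 0, 1, 2, ...
			printf("token %d -> pos %zu\n", sequence[j], j + curr_token_pos);
		}
		curr_token_pos += sequence.size(); // fixed: advance once per consumed token
	}
	return 0;
}
```

Of the remaining hunks, the `eog`/`curr_eq_n_len` split just unpacks the stop condition for readability, `llama_sampling_reset(sampling_ctx)` presumably clears sampler state (e.g. repetition-penalty history) between requests, the three `printf` calls look like debug logging left in with the fix, and the getter/setter hunk appears to be a whitespace-only reindent.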
