From bc4b614c489d60e6f04e7032d1120639dc6c88b2 Mon Sep 17 00:00:00 2001 From: Hasan Mukhlis Date: Fri, 10 May 2024 12:35:45 +0800 Subject: [PATCH 1/6] refactoring --- build.zig | 324 +++++------------- godot/.gitattributes | 2 + godot/.gitignore | 2 + .../autoloads/llama-backend.gd | 10 - godot/addons/godot-llama-cpp/plugin.cfg | 2 +- .../{godot-llama-cpp.gd => plugin.gd} | 4 +- ...ama-cpp.gdextension => plugin.gdextension} | 2 +- godot/autoloads/llama.tscn | 6 - godot/examples/simple/TextEdit.gd | 17 + godot/examples/simple/form.gd | 6 + godot/examples/simple/message.tscn | 10 + godot/examples/simple/simple.gd | 21 ++ godot/examples/simple/simple.tscn | 77 +++++ godot/icon.svg | 1 + godot/icon.svg.import | 37 ++ godot/main.gd | 27 -- godot/main.tscn | 103 ------ godot/project.godot | 28 +- godot_cpp | 2 +- llama.cpp | 2 +- src/llama_backend.cpp | 19 - src/llama_backend.h | 19 - src/llama_context.cpp | 168 +++------ src/llama_context.h | 39 ++- src/llama_model.cpp | 9 +- src/llama_model_loader.cpp | 5 +- src/register_types.cpp | 2 - 27 files changed, 347 insertions(+), 597 deletions(-) create mode 100644 godot/.gitattributes create mode 100644 godot/.gitignore delete mode 100644 godot/addons/godot-llama-cpp/autoloads/llama-backend.gd rename godot/addons/godot-llama-cpp/{godot-llama-cpp.gd => plugin.gd} (50%) rename godot/addons/godot-llama-cpp/{godot-llama-cpp.gdextension => plugin.gdextension} (97%) delete mode 100644 godot/autoloads/llama.tscn create mode 100644 godot/examples/simple/TextEdit.gd create mode 100644 godot/examples/simple/form.gd create mode 100644 godot/examples/simple/message.tscn create mode 100644 godot/examples/simple/simple.gd create mode 100644 godot/examples/simple/simple.tscn create mode 100644 godot/icon.svg create mode 100644 godot/icon.svg.import delete mode 100644 godot/main.gd delete mode 100644 godot/main.tscn delete mode 100644 src/llama_backend.cpp delete mode 100644 src/llama_backend.h diff --git a/build.zig b/build.zig index 711165b..165b932 100644 --- a/build.zig +++ b/build.zig @@ -7,11 +7,41 @@ const extension_name = "godot-llama-cpp"; pub fn build(b: *std.Build) !void { const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); - const zig_triple = try target.result.linuxTriple(b.allocator); + const triple = try target.result.linuxTriple(b.allocator); - var objs = std.ArrayList(*std.Build.Step.Compile).init(b.allocator); + const lib_godot_cpp = try build_lib_godot_cpp(.{ .b = b, .target = target, .optimize = optimize }); + const lib_llama_cpp = try build_lib_llama_cpp(.{ .b = b, .target = target, .optimize = optimize }); + + const plugin = b.addSharedLibrary(.{ + .name = b.fmt("{s}-{s}-{s}", .{ extension_name, triple, @tagName(optimize) }), + .target = target, + .optimize = optimize, + }); + plugin.addCSourceFiles(.{ .files = try findFilesRecursive(b, "src/", &cfiles_exts) }); + plugin.addIncludePath(.{ .path = "src/" }); + plugin.addIncludePath(.{ .path = "godot_cpp/gdextension/" }); + plugin.addIncludePath(.{ .path = "godot_cpp/include/" }); + plugin.addIncludePath(.{ .path = "godot_cpp/gen/include" }); + plugin.addIncludePath(.{ .path = "llama.cpp" }); + plugin.addIncludePath(.{ .path = "llama.cpp/common" }); + plugin.linkLibrary(lib_llama_cpp); + plugin.linkLibrary(lib_godot_cpp); + + b.lib_dir = "./godot/addons/godot-llama-cpp/lib"; + b.installArtifact(plugin); +} + +const BuildParams = struct { + b: *std.Build, + target: std.Build.ResolvedTarget, + optimize: std.builtin.OptimizeMode, +}; + +fn 
build_lib_godot_cpp(params: BuildParams) !*std.Build.Step.Compile { + const b = params.b; + const target = params.target; + const optimize = params.optimize; - // godot-cpp const lib_godot = b.addStaticLibrary(.{ .name = "godot-cpp", .target = target, @@ -26,9 +56,7 @@ pub fn build(b: *std.Build) !void { .cwd_dir = b.build_root.handle, }); }, - else => { - return; - }, + else => {}, } }; lib_godot.linkLibCpp(); @@ -40,7 +68,15 @@ pub fn build(b: *std.Build) !void { lib_godot.addCSourceFiles(.{ .files = lib_godot_gen_sources, .flags = &.{ "-std=c++17", "-fno-exceptions" } }); lib_godot.addCSourceFiles(.{ .files = lib_godot_sources, .flags = &.{ "-std=c++17", "-fno-exceptions" } }); - // llama.cpp + return lib_godot; +} + +fn build_lib_llama_cpp(params: BuildParams) !*std.Build.Step.Compile { + const b = params.b; + const target = params.target; + const optimize = params.optimize; + const zig_triple = try target.result.zigTriple(b.allocator); + const commit_hash = try std.ChildProcess.run(.{ .allocator = b.allocator, .argv = &.{ "git", "rev-parse", "HEAD" }, .cwd = b.pathFromRoot("llama.cpp") }); const zig_version = builtin.zig_version_string; try b.build_root.handle.writeFile2(.{ .sub_path = "llama.cpp/common/build-info.cpp", .data = b.fmt( @@ -51,256 +87,64 @@ pub fn build(b: *std.Build) !void { \\ , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, zig_triple }) }); - var flags = std.ArrayList([]const u8).init(b.allocator); - if (target.result.abi != .msvc) try flags.append("-D_GNU_SOURCE"); - if (target.result.os.tag == .macos) try flags.appendSlice(&.{ - "-D_DARWIN_C_SOURCE", - "-DGGML_USE_METAL", - "-DGGML_USE_ACCELERATE", - "-DACCELERATE_USE_LAPACK", - "-DACCELERATE_LAPACK_ILP64", - }) else try flags.append("-DGGML_USE_VULKAN"); - try flags.append("-D_XOPEN_SOURCE=600"); - - var cflags = std.ArrayList([]const u8).init(b.allocator); - try cflags.append("-std=c11"); - try cflags.appendSlice(flags.items); - var cxxflags = std.ArrayList([]const u8).init(b.allocator); - try cxxflags.append("-std=c++11"); - try cxxflags.appendSlice(flags.items); - - const include_paths = [_][]const u8{ "llama.cpp", "llama.cpp/common" }; - const llama = buildObj(.{ - .b = b, - .name = "llama", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/llama.cpp"}, - .include_paths = &include_paths, - .link_lib_cpp = true, - .link_lib_c = false, - .flags = cxxflags.items, - }); - const ggml = buildObj(.{ - .b = b, - .name = "ggml", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/ggml.c"}, - .include_paths = &include_paths, - .link_lib_c = true, - .link_lib_cpp = false, - .flags = cflags.items, - }); - const common = buildObj(.{ - .b = b, - .name = "common", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/common/common.cpp"}, - .include_paths = &include_paths, - .link_lib_cpp = true, - .link_lib_c = false, - .flags = cxxflags.items, - }); - const console = buildObj(.{ - .b = b, - .name = "console", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/common/console.cpp"}, - .include_paths = &include_paths, - .link_lib_cpp = true, - .link_lib_c = false, - .flags = cxxflags.items, - }); - const sampling = buildObj(.{ - .b = b, - .name = "sampling", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/common/sampling.cpp"}, - .include_paths = &include_paths, - .link_lib_cpp = true, - .link_lib_c = false, - .flags = cxxflags.items, - }); - const grammar_parser = buildObj(.{ - .b = b, - .name = 
"grammar_parser", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/common/grammar-parser.cpp"}, - .include_paths = &include_paths, - .link_lib_cpp = true, - .link_lib_c = false, - .flags = cxxflags.items, - }); - const build_info = buildObj(.{ - .b = b, - .name = "build_info", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/common/build-info.cpp"}, - .include_paths = &include_paths, - .link_lib_cpp = true, - .link_lib_c = false, - .flags = cxxflags.items, - }); - const ggml_alloc = buildObj(.{ - .b = b, - .name = "ggml_alloc", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/ggml-alloc.c"}, - .include_paths = &include_paths, - .link_lib_c = true, - .link_lib_cpp = false, - .flags = cflags.items, - }); - const ggml_backend = buildObj(.{ - .b = b, - .name = "ggml_backend", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/ggml-backend.c"}, - .include_paths = &include_paths, - .link_lib_c = true, - .link_lib_cpp = false, - .flags = cflags.items, - }); - const ggml_quants = buildObj(.{ - .b = b, - .name = "ggml_quants", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/ggml-quants.c"}, - .include_paths = &include_paths, - .link_lib_c = true, - .link_lib_cpp = false, - .flags = cflags.items, - }); - const unicode = buildObj(.{ - .b = b, - .name = "unicode", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/unicode.cpp"}, - .include_paths = &include_paths, - .link_lib_c = false, - .link_lib_cpp = true, - .flags = cxxflags.items, + var objs = std.ArrayList(*std.Build.Step.Compile).init(b.allocator); + var objBuilder = ObjBuilder.init(.{ .b = b, .target = target, .optimize = optimize, .include_paths = &.{ + "llama.cpp", + "llama.cpp/common", + } }); + + try objs.appendSlice(&.{ + objBuilder.build(.{ .name = "ggml", .sources = &.{"llama.cpp/ggml.c"} }), + objBuilder.build(.{ .name = "sgemm", .sources = &.{"llama.cpp/sgemm.cpp"} }), + objBuilder.build(.{ .name = "ggml_alloc", .sources = &.{"llama.cpp/ggml-alloc.c"} }), + objBuilder.build(.{ .name = "ggml_backend", .sources = &.{"llama.cpp/ggml-backend.c"} }), + objBuilder.build(.{ .name = "ggml_quants", .sources = &.{"llama.cpp/ggml-quants.c"} }), + objBuilder.build(.{ .name = "llama", .sources = &.{"llama.cpp/llama.cpp"} }), + objBuilder.build(.{ .name = "unicode", .sources = &.{"llama.cpp/unicode.cpp"} }), + objBuilder.build(.{ .name = "unicode_data", .sources = &.{"llama.cpp/unicode-data.cpp"} }), + objBuilder.build(.{ .name = "common", .sources = &.{"llama.cpp/common/common.cpp"} }), + objBuilder.build(.{ .name = "console", .sources = &.{"llama.cpp/common/console.cpp"} }), + objBuilder.build(.{ .name = "sampling", .sources = &.{"llama.cpp/common/sampling.cpp"} }), + objBuilder.build(.{ .name = "grammar_parser", .sources = &.{"llama.cpp/common/grammar-parser.cpp"} }), + objBuilder.build(.{ .name = "json_schema_to_grammar", .sources = &.{"llama.cpp/common/json-schema-to-grammar.cpp"} }), + objBuilder.build(.{ .name = "build_info", .sources = &.{"llama.cpp/common/build-info.cpp"} }), }); - try objs.appendSlice(&.{ llama, ggml, common, console, sampling, grammar_parser, build_info, ggml_alloc, ggml_backend, ggml_quants, unicode }); - - if (target.result.os.tag == .macos) { - const ggml_metal = buildObj(.{ - .b = b, - .name = "ggml_metal", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/ggml-metal.m"}, - .include_paths = &include_paths, - .link_lib_c = true, - .link_lib_cpp = false, - .flags = 
cflags.items, - }); - const airCommand = b.addSystemCommand(&.{ "xcrun", "-sdk", "macosx", "metal", "-O3", "-c" }); - airCommand.addFileArg(.{ .path = "llama.cpp/ggml-metal.metal" }); - airCommand.addArg("-o"); - const air = airCommand.addOutputFileArg("ggml-metal.air"); - const libCommand = b.addSystemCommand(&.{ "xcrun", "-sdk", "macosx", "metallib" }); - libCommand.addFileArg(air); - libCommand.addArg("-o"); - const lib = libCommand.addOutputFileArg("default.metallib"); - const libInstall = b.addInstallLibFile(lib, "default.metallib"); - b.getInstallStep().dependOn(&libInstall.step); - try objs.append(ggml_metal); - } else { - const ggml_vulkan = buildObj(.{ - .b = b, - .name = "ggml_vulkan", - .target = target, - .optimize = optimize, - .sources = &.{"llama.cpp/ggml-vulkan.cpp"}, - .include_paths = &include_paths, - .link_lib_cpp = true, - .link_lib_c = false, - .flags = cxxflags.items, - }); - try objs.append(ggml_vulkan); - } + const lib_llama_cpp = b.addStaticLibrary(.{ .name = "llama.cpp", .target = target, .optimize = optimize }); - const extension = b.addSharedLibrary(.{ .name = b.fmt("{s}-{s}-{s}", .{ extension_name, zig_triple, @tagName(optimize) }), .target = target, .optimize = optimize }); - const sources = try findFilesRecursive(b, "src", &cfiles_exts); - extension.addCSourceFiles(.{ .files = sources, .flags = &.{ "-std=c++17", "-fno-exceptions" } }); - extension.addIncludePath(.{ .path = "src" }); - extension.addIncludePath(.{ .path = "godot_cpp/include/" }); - extension.addIncludePath(.{ .path = "godot_cpp/gdextension/" }); - extension.addIncludePath(.{ .path = "godot_cpp/gen/include/" }); - extension.addIncludePath(.{ .path = "llama.cpp" }); - extension.addIncludePath(.{ .path = "llama.cpp/common" }); for (objs.items) |obj| { - extension.addObject(obj); - } - extension.linkLibC(); - extension.linkLibCpp(); - if (target.result.os.tag == .macos) { - extension.linkFramework("Metal"); - extension.linkFramework("MetalKit"); - extension.linkFramework("Foundation"); - extension.linkFramework("Accelerate"); - // b.installFile("llama.cpp/ggml-metal.metal", b.pathJoin(&.{ std.fs.path.basename(b.lib_dir), "ggml-metal.metal" })); - // b.installFile("llama.cpp/ggml-common.h", b.pathJoin(&.{ std.fs.path.basename(b.lib_dir), "ggml-common.h" })); - } else { - if (target.result.os.tag == .windows) { - const vk_path = b.graph.env_map.get("VK_SDK_PATH") orelse @panic("VK_SDK_PATH not set"); - extension.addLibraryPath(.{ .path = b.pathJoin(&.{ vk_path, "Lib" }) }); - extension.linkSystemLibrary("vulkan-1"); - } else { - extension.linkSystemLibrary("vulkan"); - } + lib_llama_cpp.addObject(obj); } - extension.linkLibrary(lib_godot); - b.installArtifact(extension); + return lib_llama_cpp; } -const BuildObjectParams = struct { +const ObjBuilder = struct { b: *std.Build, - name: []const u8, target: std.Build.ResolvedTarget, optimize: std.builtin.OptimizeMode, - sources: []const []const u8, include_paths: []const []const u8, - link_lib_c: bool, - link_lib_cpp: bool, - flags: []const []const u8, -}; -fn buildObj(params: BuildObjectParams) *std.Build.Step.Compile { - const obj = params.b.addObject(.{ - .name = params.name, - .target = params.target, - .optimize = params.optimize, - }); - if (params.target.result.os.tag == .windows) { - const vk_path = params.b.graph.env_map.get("VK_SDK_PATH") orelse @panic("VK_SDK_PATH not set"); - obj.addIncludePath(.{ .path = params.b.pathJoin(&.{ vk_path, "include" }) }); - } - for (params.include_paths) |path| { - obj.addIncludePath(.{ .path = path }); + fn 
init(params: struct { b: *std.Build, target: std.Build.ResolvedTarget, optimize: std.builtin.OptimizeMode, include_paths: []const []const u8 }) ObjBuilder { + return ObjBuilder{ + .b = params.b, + .target = params.target, + .optimize = params.optimize, + .include_paths = params.include_paths, + }; } - if (params.link_lib_c) { + + fn build(self: *ObjBuilder, params: struct { name: []const u8, sources: []const []const u8 }) *std.Build.Step.Compile { + const obj = self.b.addObject(.{ .name = params.name, .target = self.target, .optimize = self.optimize }); + obj.addCSourceFiles(.{ .files = params.sources }); + for (self.include_paths) |path| { + obj.addIncludePath(.{ .path = path }); + } obj.linkLibC(); - } - if (params.link_lib_cpp) { obj.linkLibCpp(); + return obj; } - obj.addCSourceFiles(.{ .files = params.sources, .flags = params.flags }); - return obj; -} +}; fn findFilesRecursive(b: *std.Build, dir_name: []const u8, exts: []const []const u8) ![][]const u8 { var sources = std.ArrayList([]const u8).init(b.allocator); diff --git a/godot/.gitattributes b/godot/.gitattributes new file mode 100644 index 0000000..8ad74f7 --- /dev/null +++ b/godot/.gitattributes @@ -0,0 +1,2 @@ +# Normalize EOL for all files that Git considers text files. +* text=auto eol=lf diff --git a/godot/.gitignore b/godot/.gitignore new file mode 100644 index 0000000..4709183 --- /dev/null +++ b/godot/.gitignore @@ -0,0 +1,2 @@ +# Godot 4+ specific ignores +.godot/ diff --git a/godot/addons/godot-llama-cpp/autoloads/llama-backend.gd b/godot/addons/godot-llama-cpp/autoloads/llama-backend.gd deleted file mode 100644 index 83bb52e..0000000 --- a/godot/addons/godot-llama-cpp/autoloads/llama-backend.gd +++ /dev/null @@ -1,10 +0,0 @@ -# This script will be autoloaded by the editor plugin -extends Node - -var backend: LlamaBackend = LlamaBackend.new() - -func _enter_tree() -> void: - backend.init() - -func _exit_tree() -> void: - backend.deinit() diff --git a/godot/addons/godot-llama-cpp/plugin.cfg b/godot/addons/godot-llama-cpp/plugin.cfg index 36e0b68..6335055 100644 --- a/godot/addons/godot-llama-cpp/plugin.cfg +++ b/godot/addons/godot-llama-cpp/plugin.cfg @@ -4,4 +4,4 @@ name="godot-llama-cpp" description="Run large language models in Godot. Powered by llama.cpp." author="hazelnutcloud" version="0.0.1" -script="godot-llama-cpp.gd" +script="plugin.gd" diff --git a/godot/addons/godot-llama-cpp/godot-llama-cpp.gd b/godot/addons/godot-llama-cpp/plugin.gd similarity index 50% rename from godot/addons/godot-llama-cpp/godot-llama-cpp.gd rename to godot/addons/godot-llama-cpp/plugin.gd index ca63dbc..5cee76e 100644 --- a/godot/addons/godot-llama-cpp/godot-llama-cpp.gd +++ b/godot/addons/godot-llama-cpp/plugin.gd @@ -4,9 +4,9 @@ extends EditorPlugin func _enter_tree(): # Initialization of the plugin goes here. - add_autoload_singleton("__LlamaBackend", "res://addons/godot-llama-cpp/autoloads/llama-backend.gd") + pass func _exit_tree(): # Clean-up of the plugin goes here. 
- remove_autoload_singleton("__LlamaBackend") + pass diff --git a/godot/addons/godot-llama-cpp/godot-llama-cpp.gdextension b/godot/addons/godot-llama-cpp/plugin.gdextension similarity index 97% rename from godot/addons/godot-llama-cpp/godot-llama-cpp.gdextension rename to godot/addons/godot-llama-cpp/plugin.gdextension index 4171072..75c8d09 100644 --- a/godot/addons/godot-llama-cpp/godot-llama-cpp.gdextension +++ b/godot/addons/godot-llama-cpp/plugin.gdextension @@ -6,7 +6,7 @@ compatibility_minimum = "4.2" [libraries] macos.debug = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp-aarch64-macos-none-Debug.dylib" -macos.release = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp-aarch64-macos-none-ReleaseFast.dylib" +macos.release = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp-aarch64-macos-none-ReleaseSafe.dylib" windows.debug.x86_32 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.windows.template_debug.x86_32.dll" windows.release.x86_32 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.windows.template_release.x86_32.dll" windows.debug.x86_64 = "res://addons/godot-llama-cpp/lib/godot-llama-cpp-x86_64-windows-gnu-Debug.dll" diff --git a/godot/autoloads/llama.tscn b/godot/autoloads/llama.tscn deleted file mode 100644 index 9115818..0000000 --- a/godot/autoloads/llama.tscn +++ /dev/null @@ -1,6 +0,0 @@ -[gd_scene load_steps=2 format=3 uid="uid://bxobxniygk7jm"] - -[ext_resource type="LlamaModel" path="res://models/OGNO-7B-Q4_K_M.gguf" id="1_vd8h8"] - -[node name="LlamaContext" type="LlamaContext"] -model = ExtResource("1_vd8h8") diff --git a/godot/examples/simple/TextEdit.gd b/godot/examples/simple/TextEdit.gd new file mode 100644 index 0000000..037a2ba --- /dev/null +++ b/godot/examples/simple/TextEdit.gd @@ -0,0 +1,17 @@ +extends TextEdit + +signal submit(input: String) + +func _gui_input(event: InputEvent) -> void: + if event is InputEventKey: + var keycode = event.get_keycode_with_modifiers() + if keycode == KEY_ENTER and event.is_pressed(): + handle_submit() + accept_event() + if keycode == KEY_ENTER | KEY_MASK_SHIFT and event.is_pressed(): + insert_text_at_caret("\n") + +func handle_submit() -> void: + submit.emit(text) + text = "" + diff --git a/godot/examples/simple/form.gd b/godot/examples/simple/form.gd new file mode 100644 index 0000000..092aa73 --- /dev/null +++ b/godot/examples/simple/form.gd @@ -0,0 +1,6 @@ +extends HBoxContainer + +@onready var text_edit = %TextEdit + +func _on_button_pressed() -> void: + text_edit.handle_submit() diff --git a/godot/examples/simple/message.tscn b/godot/examples/simple/message.tscn new file mode 100644 index 0000000..1c9d3c4 --- /dev/null +++ b/godot/examples/simple/message.tscn @@ -0,0 +1,10 @@ +[gd_scene format=3 uid="uid://t862t0v8ht2q"] + +[node name="RichTextLabel" type="RichTextLabel"] +offset_right = 40.0 +offset_bottom = 40.0 +size_flags_horizontal = 3 +focus_mode = 2 +fit_content = true +scroll_active = false +selection_enabled = true diff --git a/godot/examples/simple/simple.gd b/godot/examples/simple/simple.gd new file mode 100644 index 0000000..9739a30 --- /dev/null +++ b/godot/examples/simple/simple.gd @@ -0,0 +1,21 @@ +extends Node + +@onready var messages_container = %MessagesContainer +@onready var llama_context = %LlamaContext + +var message = preload("res://examples/simple/message.tscn") + +func _on_text_edit_submit(input: String) -> void: + handle_input(input) + +func handle_input(input: String) -> void: + var new_message = message.instantiate() + new_message.text = input + 
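+	# Show the user's message in the chat immediately; the model's reply
+	# streams back asynchronously through the completion_generated signal.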
messages_container.add_child(new_message) + + var id = llama_context.request_completion(input) + print("request id: ", id) + + var chunk = await llama_context.completion_generated + print('new chunk: ', chunk) + diff --git a/godot/examples/simple/simple.tscn b/godot/examples/simple/simple.tscn new file mode 100644 index 0000000..d9ca0e9 --- /dev/null +++ b/godot/examples/simple/simple.tscn @@ -0,0 +1,77 @@ +[gd_scene load_steps=6 format=3 uid="uid://c55kb4qvg6geq"] + +[ext_resource type="LlamaModel" path="res://models/Phi-3-mini-128k-instruct.Q5_K_M.gguf" id="1_ff70a"] +[ext_resource type="Texture2D" uid="uid://dplw232htshgc" path="res://addons/godot-llama-cpp/assets/godot-llama-cpp-1024x1024.svg" id="1_gjsev"] +[ext_resource type="Script" path="res://examples/simple/simple.gd" id="1_sruc3"] +[ext_resource type="Script" path="res://examples/simple/TextEdit.gd" id="2_7usqw"] +[ext_resource type="Script" path="res://examples/simple/form.gd" id="2_p1ih5"] + +[node name="Node" type="Node"] +script = ExtResource("1_sruc3") + +[node name="Panel" type="Panel" parent="."] +anchors_preset = 15 +anchor_right = 1.0 +anchor_bottom = 1.0 +grow_horizontal = 2 +grow_vertical = 2 + +[node name="MarginContainer" type="MarginContainer" parent="Panel"] +layout_mode = 1 +anchors_preset = 15 +anchor_right = 1.0 +anchor_bottom = 1.0 +grow_horizontal = 2 +grow_vertical = 2 +theme_override_constants/margin_left = 10 +theme_override_constants/margin_top = 10 +theme_override_constants/margin_right = 10 +theme_override_constants/margin_bottom = 10 + +[node name="VBoxContainer" type="VBoxContainer" parent="Panel/MarginContainer"] +layout_mode = 2 + +[node name="ScrollContainer" type="ScrollContainer" parent="Panel/MarginContainer/VBoxContainer"] +layout_mode = 2 +size_flags_vertical = 3 + +[node name="MessagesContainer" type="VBoxContainer" parent="Panel/MarginContainer/VBoxContainer/ScrollContainer"] +unique_name_in_owner = true +layout_mode = 2 +size_flags_horizontal = 3 +size_flags_vertical = 3 + +[node name="RichTextLabel" type="RichTextLabel" parent="Panel/MarginContainer/VBoxContainer/ScrollContainer/MessagesContainer"] +layout_mode = 2 +focus_mode = 2 +text = "How can I help you?" +fit_content = true +scroll_active = false +selection_enabled = true + +[node name="HBoxContainer" type="HBoxContainer" parent="Panel/MarginContainer/VBoxContainer"] +layout_mode = 2 +script = ExtResource("2_p1ih5") + +[node name="TextEdit" type="TextEdit" parent="Panel/MarginContainer/VBoxContainer/HBoxContainer"] +unique_name_in_owner = true +custom_minimum_size = Vector2(2.08165e-12, 100) +layout_mode = 2 +size_flags_horizontal = 3 +placeholder_text = "Ask me anything..." +wrap_mode = 1 +script = ExtResource("2_7usqw") + +[node name="Button" type="Button" parent="Panel/MarginContainer/VBoxContainer/HBoxContainer"] +custom_minimum_size = Vector2(100, 2.08165e-12) +layout_mode = 2 +icon = ExtResource("1_gjsev") +expand_icon = true + +[node name="LlamaContext" type="LlamaContext" parent="."] +model = ExtResource("1_ff70a") +unique_name_in_owner = true + +[connection signal="submit" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/TextEdit" to="." 
method="_on_text_edit_submit"] +[connection signal="pressed" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/Button" to="Panel/MarginContainer/VBoxContainer/HBoxContainer" method="_on_button_pressed"] +[connection signal="pressed" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/Button" to="Panel/MarginContainer/VBoxContainer/HBoxContainer/Button" method="_on_pressed"] diff --git a/godot/icon.svg b/godot/icon.svg new file mode 100644 index 0000000..3fe4f4a --- /dev/null +++ b/godot/icon.svg @@ -0,0 +1 @@ + diff --git a/godot/icon.svg.import b/godot/icon.svg.import new file mode 100644 index 0000000..5a5ae61 --- /dev/null +++ b/godot/icon.svg.import @@ -0,0 +1,37 @@ +[remap] + +importer="texture" +type="CompressedTexture2D" +uid="uid://beeg0oqle7bnk" +path="res://.godot/imported/icon.svg-218a8f2b3041327d8a5756f3a245f83b.ctex" +metadata={ +"vram_texture": false +} + +[deps] + +source_file="res://icon.svg" +dest_files=["res://.godot/imported/icon.svg-218a8f2b3041327d8a5756f3a245f83b.ctex"] + +[params] + +compress/mode=0 +compress/high_quality=false +compress/lossy_quality=0.7 +compress/hdr_compression=1 +compress/normal_map=0 +compress/channel_pack=0 +mipmaps/generate=false +mipmaps/limit=-1 +roughness/mode=0 +roughness/src_normal="" +process/fix_alpha_border=true +process/premult_alpha=false +process/normal_map_invert_y=false +process/hdr_as_srgb=false +process/hdr_clamp_exposure=false +process/size_limit=0 +detect_3d/compress_to=1 +svg/scale=1.0 +editor/scale_with_editor_scale=false +editor/convert_colors_with_editor_theme=false diff --git a/godot/main.gd b/godot/main.gd deleted file mode 100644 index 6c258e3..0000000 --- a/godot/main.gd +++ /dev/null @@ -1,27 +0,0 @@ -extends Node - -@onready var input: TextEdit = %Input -@onready var submit_button: Button = %SubmitButton -@onready var output: Label = %Output - -func _on_button_pressed(): - handle_submit() - -func handle_submit(): - print(input.text) - Llama.request_completion(input.text) - - input.clear() - input.editable = false - submit_button.disabled = true - output.text = "..." 
- - var completion = await Llama.completion_generated - output.text = "" - while !completion[1]: - print(completion[0]) - output.text += completion[0] - completion = await Llama.completion_generated - - input.editable = true - submit_button.disabled = false diff --git a/godot/main.tscn b/godot/main.tscn deleted file mode 100644 index 7bda86d..0000000 --- a/godot/main.tscn +++ /dev/null @@ -1,103 +0,0 @@ -[gd_scene load_steps=4 format=3 uid="uid://7oo8yj56scb1"] - -[ext_resource type="Texture2D" uid="uid://dplw232htshgc" path="res://addons/godot-llama-cpp/assets/godot-llama-cpp-1024x1024.svg" id="1_ojdoj"] -[ext_resource type="Script" path="res://main.gd" id="1_vvrqe"] - -[sub_resource type="StyleBoxFlat" id="StyleBoxFlat_3e37a"] -corner_radius_top_left = 5 -corner_radius_top_right = 5 -corner_radius_bottom_right = 5 -corner_radius_bottom_left = 5 - -[node name="Main" type="Node"] -script = ExtResource("1_vvrqe") - -[node name="Background" type="ColorRect" parent="."] -anchors_preset = 15 -anchor_right = 1.0 -anchor_bottom = 1.0 -grow_horizontal = 2 -grow_vertical = 2 -color = Color(0.980392, 0.952941, 0.929412, 1) - -[node name="CenterContainer" type="CenterContainer" parent="."] -anchors_preset = 8 -anchor_left = 0.5 -anchor_top = 0.5 -anchor_right = 0.5 -anchor_bottom = 0.5 -offset_left = -400.0 -offset_top = -479.0 -offset_right = 400.0 -offset_bottom = 479.0 -grow_horizontal = 2 -grow_vertical = 2 - -[node name="VBoxContainer" type="VBoxContainer" parent="CenterContainer"] -custom_minimum_size = Vector2(500, 0) -layout_mode = 2 -theme_override_constants/separation = 10 -alignment = 1 - -[node name="Name" type="Label" parent="CenterContainer/VBoxContainer"] -layout_mode = 2 -theme_override_colors/font_color = Color(0.101961, 0.0823529, 0.0627451, 1) -theme_override_font_sizes/font_size = 32 -text = "godot-llama-cpp" -horizontal_alignment = 1 - -[node name="MarginContainer" type="MarginContainer" parent="CenterContainer/VBoxContainer"] -layout_mode = 2 -theme_override_constants/margin_left = 100 -theme_override_constants/margin_right = 100 - -[node name="TextureRect" type="TextureRect" parent="CenterContainer/VBoxContainer/MarginContainer"] -layout_mode = 2 -texture = ExtResource("1_ojdoj") -expand_mode = 4 - -[node name="ScrollContainer" type="ScrollContainer" parent="CenterContainer/VBoxContainer"] -custom_minimum_size = Vector2(2.08165e-12, 150) -layout_mode = 2 -horizontal_scroll_mode = 0 - -[node name="Panel" type="PanelContainer" parent="CenterContainer/VBoxContainer/ScrollContainer"] -custom_minimum_size = Vector2(2.08165e-12, 2.08165e-12) -layout_mode = 2 -size_flags_horizontal = 3 -size_flags_vertical = 3 -theme_override_styles/panel = SubResource("StyleBoxFlat_3e37a") - -[node name="MarginContainer" type="MarginContainer" parent="CenterContainer/VBoxContainer/ScrollContainer/Panel"] -layout_mode = 2 -theme_override_constants/margin_left = 20 -theme_override_constants/margin_right = 20 - -[node name="Output" type="Label" parent="CenterContainer/VBoxContainer/ScrollContainer/Panel/MarginContainer"] -unique_name_in_owner = true -custom_minimum_size = Vector2(200, 2.08165e-12) -layout_mode = 2 -theme_override_colors/font_color = Color(0.101961, 0.0823529, 0.0627451, 1) -text = "Ask me anything!" 
-autowrap_mode = 3 - -[node name="Form" type="HBoxContainer" parent="CenterContainer/VBoxContainer"] -custom_minimum_size = Vector2(500, 60) -layout_mode = 2 -size_flags_horizontal = 4 -alignment = 1 - -[node name="Input" type="TextEdit" parent="CenterContainer/VBoxContainer/Form"] -unique_name_in_owner = true -layout_mode = 2 -size_flags_horizontal = 3 -size_flags_stretch_ratio = 3.0 -placeholder_text = "Why do cows moo?" - -[node name="SubmitButton" type="Button" parent="CenterContainer/VBoxContainer/Form"] -unique_name_in_owner = true -layout_mode = 2 -size_flags_horizontal = 3 -text = "Submit" - -[connection signal="pressed" from="CenterContainer/VBoxContainer/Form/SubmitButton" to="." method="_on_button_pressed"] diff --git a/godot/project.godot b/godot/project.godot index e5ff82f..640ca57 100644 --- a/godot/project.godot +++ b/godot/project.godot @@ -11,35 +11,13 @@ config_version=5 [application] config/name="godot-llama-cpp" -run/main_scene="res://main.tscn" config/features=PackedStringArray("4.2", "Forward Plus") -config/icon="res://addons/godot-llama-cpp/assets/godot-llama-cpp-1024x1024.svg" - -[autoload] - -__LlamaBackend="*res://addons/godot-llama-cpp/autoloads/llama-backend.gd" -Llama="*res://autoloads/llama.tscn" - -[display] - -window/size/viewport_width=1280 -window/size/viewport_height=720 +config/icon="res://icon.svg" [editor_plugins] enabled=PackedStringArray("res://addons/godot-llama-cpp/plugin.cfg") -[input] - -submit_form={ -"deadzone": 0.5, -"events": [Object(InputEventKey,"resource_local_to_scene":false,"resource_name":"","device":-1,"window_id":0,"alt_pressed":false,"shift_pressed":false,"ctrl_pressed":false,"meta_pressed":false,"pressed":false,"keycode":0,"physical_keycode":4194309,"key_label":0,"unicode":0,"echo":false,"script":null) -] -} - -[rendering] +[gui] -anti_aliasing/quality/msaa_2d=3 -anti_aliasing/quality/msaa_3d=3 -anti_aliasing/quality/screen_space_aa=1 -anti_aliasing/quality/use_taa=true +theme/default_theme_scale=2.0 diff --git a/godot_cpp b/godot_cpp index 51c752c..b28098e 160000 --- a/godot_cpp +++ b/godot_cpp @@ -1 +1 @@ -Subproject commit 51c752c46b44769d3b6c661526c364a18ea64781 +Subproject commit b28098e76b84e8831b8ac68d490f4bca44678b2a diff --git a/llama.cpp b/llama.cpp index 4755afd..8c570c9 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 4755afd1cbd40d93c017e5b98c39796f52345314 +Subproject commit 8c570c9496212073079476651c7517c02581101f diff --git a/src/llama_backend.cpp b/src/llama_backend.cpp deleted file mode 100644 index 0e2c0a5..0000000 --- a/src/llama_backend.cpp +++ /dev/null @@ -1,19 +0,0 @@ -#include "llama.h" -#include "llama_backend.h" -#include - -using namespace godot; - -void LlamaBackend::init() { - llama_backend_init(); - llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_DISABLED); -} - -void LlamaBackend::deinit() { - llama_backend_free(); -} - -void LlamaBackend::_bind_methods() { - ClassDB::bind_method(D_METHOD("init"), &LlamaBackend::init); - ClassDB::bind_method(D_METHOD("deinit"), &LlamaBackend::deinit); -} \ No newline at end of file diff --git a/src/llama_backend.h b/src/llama_backend.h deleted file mode 100644 index 8b0628f..0000000 --- a/src/llama_backend.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef LLAMA_BACKEND_H -#define LLAMA_BACKEND_H - -#include - -namespace godot { -class LlamaBackend : public RefCounted { - GDCLASS(LlamaBackend, RefCounted) - -protected: - static void _bind_methods(); - -public: - void init(); - void deinit(); -}; -} //namespace godot - -#endif \ No newline at end of 
file diff --git a/src/llama_context.cpp b/src/llama_context.cpp index 78ad255..6f4b397 100644 --- a/src/llama_context.cpp +++ b/src/llama_context.cpp @@ -7,6 +7,7 @@ #include #include #include +#include using namespace godot; @@ -15,31 +16,21 @@ void LlamaContext::_bind_methods() { ClassDB::bind_method(D_METHOD("get_model"), &LlamaContext::get_model); ClassDB::add_property("LlamaContext", PropertyInfo(Variant::OBJECT, "model", PROPERTY_HINT_RESOURCE_TYPE, "LlamaModel"), "set_model", "get_model"); - ClassDB::bind_method(D_METHOD("get_seed"), &LlamaContext::get_seed); - ClassDB::bind_method(D_METHOD("set_seed", "seed"), &LlamaContext::set_seed); - ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "seed"), "set_seed", "get_seed"); + ClassDB::bind_method(D_METHOD("get_seed"), &LlamaContext::get_seed); + ClassDB::bind_method(D_METHOD("set_seed", "seed"), &LlamaContext::set_seed); + ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "seed"), "set_seed", "get_seed"); - ClassDB::bind_method(D_METHOD("get_n_ctx"), &LlamaContext::get_n_ctx); - ClassDB::bind_method(D_METHOD("set_n_ctx", "n_ctx"), &LlamaContext::set_n_ctx); - ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_ctx"), "set_n_ctx", "get_n_ctx"); - - ClassDB::bind_method(D_METHOD("get_n_threads"), &LlamaContext::get_n_threads); - ClassDB::bind_method(D_METHOD("set_n_threads", "n_threads"), &LlamaContext::set_n_threads); - ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_threads"), "set_n_threads", "get_n_threads"); - - ClassDB::bind_method(D_METHOD("get_n_threads_batch"), &LlamaContext::get_n_threads_batch); - ClassDB::bind_method(D_METHOD("set_n_threads_batch", "n_threads_batch"), &LlamaContext::set_n_threads_batch); - ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_threads_batch"), "set_n_threads_batch", "get_n_threads_batch"); + ClassDB::bind_method(D_METHOD("get_n_ctx"), &LlamaContext::get_n_ctx); + ClassDB::bind_method(D_METHOD("set_n_ctx", "n_ctx"), &LlamaContext::set_n_ctx); + ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_ctx"), "set_n_ctx", "get_n_ctx"); ClassDB::bind_method(D_METHOD("request_completion", "prompt"), &LlamaContext::request_completion); - ClassDB::bind_method(D_METHOD("_fulfill_completion", "prompt"), &LlamaContext::_fulfill_completion); + ClassDB::bind_method(D_METHOD("__thread_loop"), &LlamaContext::__thread_loop); - ADD_SIGNAL(MethodInfo("completion_generated", PropertyInfo(Variant::STRING, "completion"), PropertyInfo(Variant::BOOL, "is_final"))); + ADD_SIGNAL(MethodInfo("completion_generated", PropertyInfo(Variant::DICTIONARY, "chunk"))); } LlamaContext::LlamaContext() { - batch = llama_batch_init(4096, 0, 1); - ctx_params = llama_context_default_params(); ctx_params.seed = -1; ctx_params.n_ctx = 4096; @@ -60,107 +51,66 @@ void LlamaContext::_ready() { return; } + mutex.instantiate(); + semaphore.instantiate(); + thread.instantiate(); + + llama_backend_init(); + llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_DISABLED); + ctx = llama_new_context_with_model(model->model, ctx_params); if (ctx == NULL) { UtilityFunctions::printerr(vformat("%s: Failed to initialize llama context, null ctx", __func__)); return; } UtilityFunctions::print(vformat("%s: Context initialized", __func__)); -} -PackedStringArray LlamaContext::_get_configuration_warnings() const { - PackedStringArray warnings; - if (model == NULL) { - warnings.push_back("Model resource property not defined"); - } - return warnings; + 
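+	// Inference now runs on a dedicated godot::Thread rather than the
+	// WorkerThreadPool: requests queue under `mutex`, the worker wakes via
+	// `semaphore`, and chunks are emitted back to the main thread with
+	// call_deferred("emit_signal", ...).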
thread->start(callable_mp(this, &LlamaContext::__thread_loop)); } -Variant LlamaContext::request_completion(const String &prompt) { - UtilityFunctions::print(vformat("%s: Requesting completion for prompt: %s", __func__, prompt)); - if (task_id) { - WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id); - } - task_id = WorkerThreadPool::get_singleton()->add_task(Callable(this, "_fulfill_completion").bind(prompt)); - return OK; -} +void LlamaContext::__thread_loop() { + while (true) { + semaphore->wait(); -void LlamaContext::_fulfill_completion(const String &prompt) { - UtilityFunctions::print(vformat("%s: Fulfilling completion for prompt: %s", __func__, prompt)); - std::vector tokens_list; - tokens_list = ::llama_tokenize(ctx, std::string(prompt.utf8().get_data()), true); + mutex->lock(); + if (completion_requests.size() == 0) { + mutex->unlock(); + continue; + } + completion_request req = completion_requests.get(0); + completion_requests.remove_at(0); + mutex->unlock(); - const int n_len = 128; - const int n_ctx = llama_n_ctx(ctx); - const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); - if (n_kv_req > n_ctx) { - UtilityFunctions::printerr(vformat("%s: n_kv_req > n_ctx, the required KV cache size is not big enough\neither reduce n_len or increase n_ctx", __func__)); - return; - } + UtilityFunctions::print(vformat("%s: Running completion for prompt id: %d", __func__, req.id)); - for (size_t i = 0; i < tokens_list.size(); i++) { - llama_batch_add(batch, tokens_list[i], i, { 0 }, false); + Dictionary chunk; + chunk["id"] = req.id; + chunk["text"] = "Hello, world!"; + call_deferred("emit_signal", "completion_generated", chunk); } +} - batch.logits[batch.n_tokens - 1] = true; - - llama_kv_cache_clear(ctx); - - int decode_res = llama_decode(ctx, batch); - if (decode_res != 0) { - UtilityFunctions::printerr(vformat("%s: Failed to decode prompt with error code: %d", __func__, decode_res)); - return; +PackedStringArray LlamaContext::_get_configuration_warnings() const { + PackedStringArray warnings; + if (model == NULL) { + warnings.push_back("Model resource property not defined"); } + return warnings; +} - int n_cur = batch.n_tokens; - int n_decode = 0; - llama_model *llama_model = model->model; - - while (n_cur <= n_len) { - // sample the next token - { - auto n_vocab = llama_n_vocab(llama_model); - auto *logits = llama_get_logits_ith(ctx, batch.n_tokens - 1); - - std::vector candidates; - candidates.reserve(n_vocab); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f }); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - // sample the most likely token - const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); - - // is it an end of stream? 
- if (new_token_id == llama_token_eos(llama_model) || n_cur == n_len) { - call_thread_safe("emit_signal", "completion_generated", "\n", true); - - break; - } - - call_thread_safe("emit_signal", "completion_generated", vformat("%s", llama_token_to_piece(ctx, new_token_id).c_str()), false); - - // prepare the next batch - llama_batch_clear(batch); +int LlamaContext::request_completion(const String &prompt) { + int id = request_id++; - // push this new token for next evaluation - llama_batch_add(batch, new_token_id, n_cur, { 0 }, true); + UtilityFunctions::print(vformat("%s: Requesting completion for prompt id: %d", __func__, id)); - n_decode += 1; - } + mutex->lock(); + completion_request req = { id, prompt }; + completion_requests.append(req); + mutex->unlock(); - n_cur += 1; + semaphore->post(); - // evaluate the current batch with the transformer model - int decode_res = llama_decode(ctx, batch); - if (decode_res != 0) { - UtilityFunctions::printerr(vformat("%s: Failed to decode batch with error code: %d", __func__, decode_res)); - break; - } - } + return id; } void LlamaContext::set_model(const Ref p_model) { @@ -184,28 +134,10 @@ void LlamaContext::set_n_ctx(int n_ctx) { ctx_params.n_ctx = n_ctx; } -int LlamaContext::get_n_threads() { - return ctx_params.n_threads; -} -void LlamaContext::set_n_threads(int n_threads) { - ctx_params.n_threads = n_threads; -} - -int LlamaContext::get_n_threads_batch() { - return ctx_params.n_threads_batch; -} -void LlamaContext::set_n_threads_batch(int n_threads_batch) { - ctx_params.n_threads_batch = n_threads_batch; -} - LlamaContext::~LlamaContext() { if (ctx) { llama_free(ctx); } - llama_batch_free(batch); - - if (task_id) { - WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id); - } + llama_backend_free(); } \ No newline at end of file diff --git a/src/llama_context.h b/src/llama_context.h index 5db3789..1f987f7 100644 --- a/src/llama_context.h +++ b/src/llama_context.h @@ -3,9 +3,18 @@ #include "llama.h" #include "llama_model.h" +#include #include - +#include +#include +#include namespace godot { + +struct completion_request { + int id; + String prompt; +}; + class LlamaContext : public Node { GDCLASS(LlamaContext, Node) @@ -13,8 +22,12 @@ class LlamaContext : public Node { Ref model; llama_context *ctx = nullptr; llama_context_params ctx_params; - llama_batch batch; - int task_id; + int request_id = 0; + Vector completion_requests; + + Ref thread; + Ref semaphore; + Ref mutex; protected: static void _bind_methods(); @@ -23,21 +36,17 @@ class LlamaContext : public Node { void set_model(const Ref model); Ref get_model(); - Variant request_completion(const String &prompt); - void _fulfill_completion(const String &prompt); + int request_completion(const String &prompt); + void __thread_loop(); - int get_seed(); - void set_seed(int seed); - int get_n_ctx(); - void set_n_ctx(int n_ctx); - int get_n_threads(); - void set_n_threads(int n_threads); - int get_n_threads_batch(); - void set_n_threads_batch(int n_threads_batch); + int get_seed(); + void set_seed(int seed); + int get_n_ctx(); + void set_n_ctx(int n_ctx); - virtual PackedStringArray _get_configuration_warnings() const override; + virtual PackedStringArray _get_configuration_warnings() const override; virtual void _ready() override; - LlamaContext(); + LlamaContext(); ~LlamaContext(); }; } //namespace godot diff --git a/src/llama_model.cpp b/src/llama_model.cpp index 92cb9db..96eea4f 100644 --- a/src/llama_model.cpp +++ b/src/llama_model.cpp @@ -1,5 +1,6 @@ #include "llama_model.h" 
#include "llama.h" +#include #include #include @@ -22,14 +23,16 @@ void LlamaModel::load_model(const String &path) { llama_free_model(model); } - model = llama_load_model_from_file(path.utf8().get_data(), model_params); + String absPath = ProjectSettings::get_singleton()->globalize_path(path); + + model = llama_load_model_from_file(absPath.utf8().get_data(), model_params); if (model == NULL) { - UtilityFunctions::printerr(vformat("%s: Unable to load model from %s", __func__, path)); + UtilityFunctions::printerr(vformat("%s: Unable to load model from %s", __func__, absPath)); return; } - UtilityFunctions::print(vformat("%s: Model loaded from %s", __func__, path)); + UtilityFunctions::print(vformat("%s: Model loaded from %s", __func__, absPath)); } int LlamaModel::get_n_gpu_layers() { diff --git a/src/llama_model_loader.cpp b/src/llama_model_loader.cpp index 77b8a94..940f1c3 100644 --- a/src/llama_model_loader.cpp +++ b/src/llama_model_loader.cpp @@ -2,7 +2,6 @@ #include "llama_model.h" #include #include -#include #include using namespace godot; @@ -24,9 +23,7 @@ Variant godot::LlamaModelLoader::_load(const String &path, const String &origina return { model }; } - String absPath = ProjectSettings::get_singleton()->globalize_path(path); - - model->load_model(absPath); + model->load_model(path); return { model }; } diff --git a/src/register_types.cpp b/src/register_types.cpp index 0cf4cf0..17ec90b 100644 --- a/src/register_types.cpp +++ b/src/register_types.cpp @@ -7,7 +7,6 @@ #include "llama_model.h" #include "llama_model_loader.h" #include "llama_context.h" -#include "llama_backend.h" using namespace godot; @@ -24,7 +23,6 @@ void initialize_types(ModuleInitializationLevel p_level) ClassDB::register_class(); ClassDB::register_class(); - ClassDB::register_class(); } void uninitialize_types(ModuleInitializationLevel p_level) { From 6960f3b22b0ce1a3353a92b024925930a11768d7 Mon Sep 17 00:00:00 2001 From: Hasan Mukhlis Date: Fri, 10 May 2024 20:24:03 +0800 Subject: [PATCH 2/6] working inference --- godot/examples/simple/TextEdit.gd | 5 +- godot/examples/simple/simple.gd | 7 +- godot/examples/simple/simple.tscn | 14 +-- godot/project.godot | 1 + src/llama_context.cpp | 148 ++++++++++++++++++++++++++++-- src/llama_context.h | 10 +- 6 files changed, 164 insertions(+), 21 deletions(-) diff --git a/godot/examples/simple/TextEdit.gd b/godot/examples/simple/TextEdit.gd index 037a2ba..b8d4bc0 100644 --- a/godot/examples/simple/TextEdit.gd +++ b/godot/examples/simple/TextEdit.gd @@ -10,8 +10,11 @@ func _gui_input(event: InputEvent) -> void: accept_event() if keycode == KEY_ENTER | KEY_MASK_SHIFT and event.is_pressed(): insert_text_at_caret("\n") + accept_event() + +func _on_button_pressed() -> void: + handle_submit() func handle_submit() -> void: submit.emit(text) text = "" - diff --git a/godot/examples/simple/simple.gd b/godot/examples/simple/simple.gd index 9739a30..a1f7872 100644 --- a/godot/examples/simple/simple.gd +++ b/godot/examples/simple/simple.gd @@ -16,6 +16,7 @@ func handle_input(input: String) -> void: var id = llama_context.request_completion(input) print("request id: ", id) - var chunk = await llama_context.completion_generated - print('new chunk: ', chunk) - + + +func _on_llama_context_completion_generated(chunk: Dictionary) -> void: + print("new chunk: ", chunk) diff --git a/godot/examples/simple/simple.tscn b/godot/examples/simple/simple.tscn index d9ca0e9..f3e654b 100644 --- a/godot/examples/simple/simple.tscn +++ b/godot/examples/simple/simple.tscn @@ -3,8 +3,8 @@ [ext_resource 
type="LlamaModel" path="res://models/Phi-3-mini-128k-instruct.Q5_K_M.gguf" id="1_ff70a"] [ext_resource type="Texture2D" uid="uid://dplw232htshgc" path="res://addons/godot-llama-cpp/assets/godot-llama-cpp-1024x1024.svg" id="1_gjsev"] [ext_resource type="Script" path="res://examples/simple/simple.gd" id="1_sruc3"] +[ext_resource type="PackedScene" uid="uid://t862t0v8ht2q" path="res://examples/simple/message.tscn" id="2_7iip7"] [ext_resource type="Script" path="res://examples/simple/TextEdit.gd" id="2_7usqw"] -[ext_resource type="Script" path="res://examples/simple/form.gd" id="2_p1ih5"] [node name="Node" type="Node"] script = ExtResource("1_sruc3") @@ -41,20 +41,14 @@ layout_mode = 2 size_flags_horizontal = 3 size_flags_vertical = 3 -[node name="RichTextLabel" type="RichTextLabel" parent="Panel/MarginContainer/VBoxContainer/ScrollContainer/MessagesContainer"] +[node name="RichTextLabel2" parent="Panel/MarginContainer/VBoxContainer/ScrollContainer/MessagesContainer" instance=ExtResource("2_7iip7")] layout_mode = 2 -focus_mode = 2 text = "How can I help you?" -fit_content = true -scroll_active = false -selection_enabled = true [node name="HBoxContainer" type="HBoxContainer" parent="Panel/MarginContainer/VBoxContainer"] layout_mode = 2 -script = ExtResource("2_p1ih5") [node name="TextEdit" type="TextEdit" parent="Panel/MarginContainer/VBoxContainer/HBoxContainer"] -unique_name_in_owner = true custom_minimum_size = Vector2(2.08165e-12, 100) layout_mode = 2 size_flags_horizontal = 3 @@ -73,5 +67,5 @@ model = ExtResource("1_ff70a") unique_name_in_owner = true [connection signal="submit" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/TextEdit" to="." method="_on_text_edit_submit"] -[connection signal="pressed" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/Button" to="Panel/MarginContainer/VBoxContainer/HBoxContainer" method="_on_button_pressed"] -[connection signal="pressed" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/Button" to="Panel/MarginContainer/VBoxContainer/HBoxContainer/Button" method="_on_pressed"] +[connection signal="pressed" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/Button" to="Panel/MarginContainer/VBoxContainer/HBoxContainer/TextEdit" method="_on_button_pressed"] +[connection signal="completion_generated" from="LlamaContext" to="." 
method="_on_llama_context_completion_generated"] diff --git a/godot/project.godot b/godot/project.godot index 640ca57..4730d03 100644 --- a/godot/project.godot +++ b/godot/project.godot @@ -11,6 +11,7 @@ config_version=5 [application] config/name="godot-llama-cpp" +run/main_scene="res://examples/simple/simple.tscn" config/features=PackedStringArray("4.2", "Forward Plus") config/icon="res://icon.svg" diff --git a/src/llama_context.cpp b/src/llama_context.cpp index 6f4b397..4d64442 100644 --- a/src/llama_context.cpp +++ b/src/llama_context.cpp @@ -2,12 +2,13 @@ #include "common.h" #include "llama.h" #include "llama_model.h" +#include #include #include #include #include -#include #include +#include using namespace godot; @@ -24,6 +25,10 @@ void LlamaContext::_bind_methods() { ClassDB::bind_method(D_METHOD("set_n_ctx", "n_ctx"), &LlamaContext::set_n_ctx); ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_ctx"), "set_n_ctx", "get_n_ctx"); + ClassDB::bind_method(D_METHOD("get_n_len"), &LlamaContext::get_n_len); + ClassDB::bind_method(D_METHOD("set_n_len", "n_len"), &LlamaContext::set_n_len); + ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_len"), "set_n_len", "get_n_len"); + ClassDB::bind_method(D_METHOD("request_completion", "prompt"), &LlamaContext::request_completion); ClassDB::bind_method(D_METHOD("__thread_loop"), &LlamaContext::__thread_loop); @@ -63,6 +68,9 @@ void LlamaContext::_ready() { UtilityFunctions::printerr(vformat("%s: Failed to initialize llama context, null ctx", __func__)); return; } + + sampling_ctx = llama_sampling_init(sampling_params); + UtilityFunctions::print(vformat("%s: Context initialized", __func__)); thread->start(callable_mp(this, &LlamaContext::__thread_loop)); @@ -73,6 +81,10 @@ void LlamaContext::__thread_loop() { semaphore->wait(); mutex->lock(); + if (exit_thread) { + mutex->unlock(); + break; + } if (completion_requests.size() == 0) { mutex->unlock(); continue; @@ -83,10 +95,115 @@ void LlamaContext::__thread_loop() { UtilityFunctions::print(vformat("%s: Running completion for prompt id: %d", __func__, req.id)); - Dictionary chunk; - chunk["id"] = req.id; - chunk["text"] = "Hello, world!"; - call_deferred("emit_signal", "completion_generated", chunk); + std::vector request_tokens; + request_tokens = ::llama_tokenize(ctx, req.prompt.utf8().get_data(), true); + + size_t shared_prefix_idx = 0; + auto diff = std::mismatch(context_tokens.begin(), context_tokens.end(), request_tokens.begin(), request_tokens.end()); + if (diff.first != context_tokens.end()) { + shared_prefix_idx = std::distance(context_tokens.begin(), diff.first); + } else { + shared_prefix_idx = std::min(context_tokens.size(), request_tokens.size()); + } + + bool rm_success = llama_kv_cache_seq_rm(ctx, 0, shared_prefix_idx, -1); + if (!rm_success) { + UtilityFunctions::printerr(vformat("%s: Failed to remove tokens from kv cache", __func__)); + Dictionary response; + response["id"] = req.id; + response["error"] = "Failed to remove tokens from kv cache"; + call_deferred("emit_signal", "completion_generated", response); + continue; + } + context_tokens.erase(context_tokens.begin() + shared_prefix_idx, context_tokens.end()); + request_tokens.erase(request_tokens.begin(), request_tokens.begin() + shared_prefix_idx); + + uint batch_size = std::min(ctx_params.n_batch, (uint)request_tokens.size()); + + llama_batch batch = llama_batch_init(batch_size, 0, 1); + + // chunk request_tokens into sequences of size batch_size + std::vector> sequences; + for (size_t i = 0; i 
< request_tokens.size(); i += batch_size) { + sequences.push_back(std::vector(request_tokens.begin() + i, request_tokens.begin() + std::min(i + batch_size, request_tokens.size()))); + } + + int curr_token_pos = context_tokens.size(); + bool decode_failed = false; + + for (size_t i = 0; i < sequences.size(); i++) { + llama_batch_clear(batch); + + std::vector sequence = sequences[i]; + + for (size_t j = 0; j < sequence.size(); j++) { + llama_batch_add(batch, sequence[j], j + curr_token_pos, { 0 }, false); + curr_token_pos++; + } + + if (i == sequences.size() - 1) { + batch.logits[batch.n_tokens - 1] = true; + } + + if (llama_decode(ctx, batch) != 0) { + decode_failed = true; + break; + } + } + + if (decode_failed) { + Dictionary response; + response["id"] = req.id; + response["error"] = "llama_decode() failed"; + call_deferred("emit_signal", "completion_generated", response); + continue; + } + + context_tokens.insert(context_tokens.end(), request_tokens.begin(), request_tokens.end()); + + while (true) { + if (exit_thread) { + return; + } + llama_token new_token_id = llama_sampling_sample(sampling_ctx, ctx, NULL, batch.n_tokens - 1); + llama_sampling_accept(sampling_ctx, ctx, new_token_id, true); + + Dictionary response; + response["id"] = req.id; + + if (llama_token_is_eog(model->model, new_token_id) || curr_token_pos == n_len) { + response["done"] = true; + call_deferred("emit_signal", "completion_generated", response); + break; + } + + context_tokens.push_back(new_token_id); + + response["text"] = llama_token_to_piece(ctx, new_token_id).c_str(); + response["done"] = false; + call_deferred("emit_signal", "completion_generated", response); + + llama_batch_clear(batch); + + llama_batch_add(batch, new_token_id, curr_token_pos, { 0 }, true); + + curr_token_pos++; + + if (llama_decode(ctx, batch) != 0) { + decode_failed = true; + break; + } + } + + if (decode_failed) { + Dictionary response; + response["id"] = req.id; + response["error"] = "llama_decode() failed"; + call_deferred("emit_signal", "completion_generated", response); + continue; + } + + llama_sampling_reset(sampling_ctx); } } @@ -134,7 +251,26 @@ void LlamaContext::set_n_ctx(int n_ctx) { ctx_params.n_ctx = n_ctx; } -LlamaContext::~LlamaContext() { +int LlamaContext::get_n_len() { + return n_len; +} +void LlamaContext::set_n_len(int n_len) { + this->n_len = n_len; +} + +void LlamaContext::_exit_tree() { + if (Engine::get_singleton()->is_editor_hint()) { + return; + } + + mutex->lock(); + exit_thread = true; + mutex->unlock(); + + semaphore->post(); + + thread->wait_to_finish(); + if (ctx) { llama_free(ctx); } diff --git a/src/llama_context.h b/src/llama_context.h index 1f987f7..309f8f1 100644 --- a/src/llama_context.h +++ b/src/llama_context.h @@ -2,6 +2,7 @@ #define LLAMA_CONTEXT_H #include "llama.h" +#include "common.h" #include "llama_model.h" #include #include @@ -21,13 +22,18 @@ class LlamaContext : public Node { private: Ref model; llama_context *ctx = nullptr; + llama_sampling_context *sampling_ctx = nullptr; llama_context_params ctx_params; + llama_sampling_params sampling_params; + int n_len = 1024; int request_id = 0; Vector completion_requests; Ref thread; Ref semaphore; Ref mutex; + std::vector context_tokens; + bool exit_thread = false; protected: static void _bind_methods(); @@ -43,11 +49,13 @@ class LlamaContext : public Node { void set_seed(int seed); int get_n_ctx(); void set_n_ctx(int n_ctx); + int get_n_len(); + void set_n_len(int n_len); virtual PackedStringArray _get_configuration_warnings() const override; 
virtual void _ready() override; + virtual void _exit_tree() override; LlamaContext(); - ~LlamaContext(); }; } //namespace godot From 13eb0adcf479cecfe3bc9c5a07e1657874de1126 Mon Sep 17 00:00:00 2001 From: Hasan Mukhlis Date: Sat, 11 May 2024 20:18:04 +0800 Subject: [PATCH 3/6] add metal build --- build.zig | 55 +++++++++++++++++++++++------ tools/expand_metal.zig | 79 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 tools/expand_metal.zig diff --git a/build.zig b/build.zig index 165b932..aadce33 100644 --- a/build.zig +++ b/build.zig @@ -84,14 +84,44 @@ fn build_lib_llama_cpp(params: BuildParams) !*std.Build.Step.Compile { \\char const *LLAMA_COMMIT = "{s}"; \\char const *LLAMA_COMPILER = "Zig {s}"; \\char const *LLAMA_BUILD_TARGET = "{s}"; - \\ , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, zig_triple }) }); + const lib_llama_cpp = b.addStaticLibrary(.{ .name = "llama.cpp", .target = target, .optimize = optimize }); + var objs = std.ArrayList(*std.Build.Step.Compile).init(b.allocator); - var objBuilder = ObjBuilder.init(.{ .b = b, .target = target, .optimize = optimize, .include_paths = &.{ - "llama.cpp", - "llama.cpp/common", - } }); + var objBuilder = ObjBuilder.init(.{ + .b = b, + .target = target, + .optimize = optimize, + .include_paths = &.{ "llama.cpp", "llama.cpp/common" }, + }); + + switch (target.result.os.tag) { + .macos => { + try objBuilder.flags.append("-DGGML_USE_METAL"); + try objs.append(objBuilder.build(.{ .name = "ggml_metal", .sources = &.{"llama.cpp/ggml-metal.m"} })); + + lib_llama_cpp.linkFramework("Foundation"); + lib_llama_cpp.linkFramework("Metal"); + lib_llama_cpp.linkFramework("MetalKit"); + + const expand_metal = b.addExecutable(.{ + .name = "expand_metal", + .target = target, + .root_source_file = .{ .path = "tools/expand_metal.zig" }, + }); + var run_expand_metal = b.addRunArtifact(expand_metal); + run_expand_metal.addArg("--metal-file"); + run_expand_metal.addFileArg(.{ .path = "llama.cpp/ggml-metal.metal" }); + run_expand_metal.addArg("--common-file"); + run_expand_metal.addFileArg(.{ .path = "llama.cpp/ggml-common.h" }); + run_expand_metal.addArg("--output-file"); + const metal_expanded = run_expand_metal.addOutputFileArg("ggml-metal.metal"); + const install_metal = b.addInstallFileWithDir(metal_expanded, .lib, "ggml-metal.metal"); + lib_llama_cpp.step.dependOn(&install_metal.step); + }, + else => {}, + } try objs.appendSlice(&.{ objBuilder.build(.{ .name = "ggml", .sources = &.{"llama.cpp/ggml.c"} }), @@ -110,8 +140,6 @@ fn build_lib_llama_cpp(params: BuildParams) !*std.Build.Step.Compile { objBuilder.build(.{ .name = "build_info", .sources = &.{"llama.cpp/common/build-info.cpp"} }), }); - const lib_llama_cpp = b.addStaticLibrary(.{ .name = "llama.cpp", .target = target, .optimize = optimize }); - for (objs.items) |obj| { lib_llama_cpp.addObject(obj); } @@ -124,19 +152,26 @@ const ObjBuilder = struct { target: std.Build.ResolvedTarget, optimize: std.builtin.OptimizeMode, include_paths: []const []const u8, - - fn init(params: struct { b: *std.Build, target: std.Build.ResolvedTarget, optimize: std.builtin.OptimizeMode, include_paths: []const []const u8 }) ObjBuilder { + flags: std.ArrayList([]const u8), + + fn init(params: struct { + b: *std.Build, + target: std.Build.ResolvedTarget, + optimize: std.builtin.OptimizeMode, + include_paths: []const []const u8, + }) ObjBuilder { return ObjBuilder{ .b = params.b, .target = params.target, .optimize = params.optimize, 
             .include_paths = params.include_paths,
+            .flags = std.ArrayList([]const u8).init(params.b.allocator),
         };
     }

     fn build(self: *ObjBuilder, params: struct { name: []const u8, sources: []const []const u8 }) *std.Build.Step.Compile {
         const obj = self.b.addObject(.{ .name = params.name, .target = self.target, .optimize = self.optimize });
-        obj.addCSourceFiles(.{ .files = params.sources });
+        obj.addCSourceFiles(.{ .files = params.sources, .flags = self.flags.items });
         for (self.include_paths) |path| {
             obj.addIncludePath(.{ .path = path });
         }
diff --git a/tools/expand_metal.zig b/tools/expand_metal.zig
new file mode 100644
index 0000000..1e4e9f4
--- /dev/null
+++ b/tools/expand_metal.zig
@@ -0,0 +1,79 @@
+const std = @import("std");
+
+const usage =
+    \\Usage: ./expand_metal [options]
+    \\
+    \\Options:
+    \\  --metal-file ggml-metal.metal
+    \\  --common-file ggml-common.h
+    \\  --output-file ggml-metal-embed.metal
+    \\
+;
+
+pub fn main() !void {
+    var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+    defer arena_state.deinit();
+    const arena = arena_state.allocator();
+
+    const args = try std.process.argsAlloc(arena);
+
+    var opt_metal_file_path: ?[]const u8 = null;
+    var opt_common_file_path: ?[]const u8 = null;
+    var opt_output_file_path: ?[]const u8 = null;
+
+    {
+        var i: usize = 1;
+        while (i < args.len) : (i += 1) {
+            const arg = args[i];
+            if (std.mem.eql(u8, "-h", arg) or std.mem.eql(u8, "--help", arg)) {
+                try std.io.getStdOut().writeAll(usage);
+                return std.process.cleanExit();
+            } else if (std.mem.eql(u8, "--metal-file", arg)) {
+                i += 1;
+                if (i >= args.len) std.debug.panic("expected arg after '{s}'", .{arg});
+                if (opt_metal_file_path != null) std.debug.panic("duplicated {s} argument", .{arg});
+                opt_metal_file_path = args[i];
+            } else if (std.mem.eql(u8, "--common-file", arg)) {
+                i += 1;
+                if (i >= args.len) std.debug.panic("expected arg after '{s}'", .{arg});
+                if (opt_common_file_path != null) std.debug.panic("duplicated {s} argument", .{arg});
+                opt_common_file_path = args[i];
+            } else if (std.mem.eql(u8, "--output-file", arg)) {
+                i += 1;
+                if (i >= args.len) std.debug.panic("expected arg after '{s}'", .{arg});
+                if (opt_output_file_path != null) std.debug.panic("duplicated {s} argument", .{arg});
+                opt_output_file_path = args[i];
+            } else {
+                std.debug.panic("unrecognized arg: '{s}'", .{arg});
+            }
+        }
+    }
+
+    const metal_file_path = opt_metal_file_path orelse std.debug.panic("missing --metal-file", .{});
+    const common_file_path = opt_common_file_path orelse std.debug.panic("missing --common-file", .{});
+    const output_file_path = opt_output_file_path orelse std.debug.panic("missing --output-file", .{});
+
+    const cwd = std.fs.cwd();
+
+    var metal_file = try cwd.openFile(metal_file_path, .{});
+    defer metal_file.close();
+
+    var common_file = try cwd.openFile(common_file_path, .{});
+    defer common_file.close();
+
+    const metal_size = (try metal_file.stat()).size;
+    const metal_contents = try arena.alloc(u8, metal_size);
+    defer arena.free(metal_contents);
+    _ = try metal_file.readAll(metal_contents);
+
+    const common_size = (try common_file.stat()).size;
+    const common_contents = try arena.alloc(u8, common_size);
+    defer arena.free(common_contents);
+    _ = try common_file.readAll(common_contents);
+
+    const output = try std.mem.replaceOwned(u8, arena, metal_contents, "#include \"ggml-common.h\"", common_contents);
+    defer arena.free(output);
+
+    const output_file = try cwd.createFile(output_file_path, .{});
+    try output_file.writeAll(output);
+}
From f73ef8c5e4289c629ff07f2393ce3c9dce0461d3 Mon Sep 17 00:00:00 2001
From: Hasan Mukhlis
Date: Sat, 11 May 2024 21:50:57 +0800
Subject: [PATCH 4/6] update godot scenes

---
 godot/examples/simple/message.gd   | 17 ++++++++++++++
 godot/examples/simple/message.tscn | 37 ++++++++++++++++++++++++++----
 godot/examples/simple/simple.gd    | 34 ++++++++++++++++++++------
 godot/examples/simple/simple.tscn  |  5 ++++
 4 files changed, 80 insertions(+), 13 deletions(-)
 create mode 100644 godot/examples/simple/message.gd

diff --git a/godot/examples/simple/message.gd b/godot/examples/simple/message.gd
new file mode 100644
index 0000000..0b32c6f
--- /dev/null
+++ b/godot/examples/simple/message.gd
@@ -0,0 +1,17 @@
+class_name Message
+extends Node
+
+@onready var text_container = %Text
+@onready var icon = %Panel
+@export_enum("user", "assistant") var sender: String
+
+var completion_id: int = -1
+var pending: bool = false
+var errored: bool = false
+
+func set_text(new_text: String):
+	text_container.text = new_text
+
+func append_text(new_text: String):
+	text_container.text += new_text
+
diff --git a/godot/examples/simple/message.tscn b/godot/examples/simple/message.tscn
index 1c9d3c4..36b68e7 100644
--- a/godot/examples/simple/message.tscn
+++ b/godot/examples/simple/message.tscn
@@ -1,10 +1,37 @@
-[gd_scene format=3 uid="uid://t862t0v8ht2q"]
+[gd_scene load_steps=5 format=3 uid="uid://t862t0v8ht2q"]

-[node name="RichTextLabel" type="RichTextLabel"]
-offset_right = 40.0
-offset_bottom = 40.0
+[ext_resource type="Script" path="res://examples/simple/message.gd" id="1_pko33"]
+[ext_resource type="Texture2D" uid="uid://dplw232htshgc" path="res://addons/godot-llama-cpp/assets/godot-llama-cpp-1024x1024.svg" id="2_dvc7y"]
+
+[sub_resource type="StyleBoxTexture" id="StyleBoxTexture_t8bgj"]
+texture = ExtResource("2_dvc7y")
+
+[sub_resource type="Theme" id="Theme_bw3pb"]
+Panel/styles/panel = SubResource("StyleBoxTexture_t8bgj")
+
+[node name="RichTextLabel" type="HBoxContainer"]
+anchors_preset = 15
+anchor_right = 1.0
+anchor_bottom = 1.0
+grow_horizontal = 2
+grow_vertical = 2
+size_flags_horizontal = 3
+theme_override_constants/separation = 20
+script = ExtResource("1_pko33")
+sender = "assistant"
+
+[node name="Panel" type="Panel" parent="."]
+unique_name_in_owner = true
+custom_minimum_size = Vector2(80, 80)
+layout_mode = 2
+size_flags_vertical = 0
+theme = SubResource("Theme_bw3pb")
+
+[node name="Text" type="RichTextLabel" parent="."]
+unique_name_in_owner = true
+layout_mode = 2
 size_flags_horizontal = 3
 focus_mode = 2
+text = "..."
 fit_content = true
-scroll_active = false
 selection_enabled = true
diff --git a/godot/examples/simple/simple.gd b/godot/examples/simple/simple.gd
index a1f7872..8c4766a 100644
--- a/godot/examples/simple/simple.gd
+++ b/godot/examples/simple/simple.gd
@@ -1,22 +1,40 @@
 extends Node

+const message = preload("res://examples/simple/message.tscn")
+
 @onready var messages_container = %MessagesContainer
 @onready var llama_context = %LlamaContext

-var message = preload("res://examples/simple/message.tscn")
-
 func _on_text_edit_submit(input: String) -> void:
 	handle_input(input)

 func handle_input(input: String) -> void:
-	var new_message = message.instantiate()
-	new_message.text = input
-	messages_container.add_child(new_message)
+	var completion_id = llama_context.request_completion(input)
+
+	var user_message: Message = message.instantiate()
+	messages_container.add_child(user_message)
+	user_message.set_text(input)
+	user_message.sender = "user"
+	user_message.completion_id = completion_id

-	var id = llama_context.request_completion(input)
-	print("request id: ", id)
+	var ai_message: Message = message.instantiate()
+	messages_container.add_child(ai_message)
+	ai_message.sender = "assistant"
+	ai_message.completion_id = completion_id
+	ai_message.pending = true

 func _on_llama_context_completion_generated(chunk: Dictionary) -> void:
-	print("new chunk: ", chunk)
+	var completion_id = chunk.id
+	for message: Message in messages_container.get_children():
+		if message.completion_id != completion_id or message.sender != "assistant":
+			continue
+		if chunk.has("error"):
+			message.errored = true
+		elif chunk.has("text"):
+			if message.pending:
+				message.pending = false
+				message.set_text(chunk["text"])
+			else:
+				message.append_text(chunk["text"])
diff --git a/godot/examples/simple/simple.tscn b/godot/examples/simple/simple.tscn
index f3e654b..edc0bca 100644
--- a/godot/examples/simple/simple.tscn
+++ b/godot/examples/simple/simple.tscn
@@ -40,9 +40,12 @@ unique_name_in_owner = true
 layout_mode = 2
 size_flags_horizontal = 3
 size_flags_vertical = 3
+theme_override_constants/separation = 30

 [node name="RichTextLabel2" parent="Panel/MarginContainer/VBoxContainer/ScrollContainer/MessagesContainer" instance=ExtResource("2_7iip7")]
 layout_mode = 2
+
+[node name="Text" parent="Panel/MarginContainer/VBoxContainer/ScrollContainer/MessagesContainer/RichTextLabel2" index="1"]
 text = "How can I help you?"

 [node name="HBoxContainer" type="HBoxContainer" parent="Panel/MarginContainer/VBoxContainer"]
@@ -69,3 +72,5 @@ unique_name_in_owner = true
 [connection signal="submit" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/TextEdit" to="." method="_on_text_edit_submit"]
 [connection signal="pressed" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/Button" to="Panel/MarginContainer/VBoxContainer/HBoxContainer/TextEdit" method="_on_button_pressed"]
 [connection signal="completion_generated" from="LlamaContext" to="." method="_on_llama_context_completion_generated"]
+
+[editable path="Panel/MarginContainer/VBoxContainer/ScrollContainer/MessagesContainer/RichTextLabel2"]
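Patch 4's Message scene gives each chat bubble a small API: set_text() and append_text() write through to the %Text RichTextLabel, while sender, completion_id, pending, and errored let simple.gd route streamed chunks to the right bubble. One ordering detail matters when reusing it: text_container is an @onready reference, so a bubble must enter the tree via add_child() before set_text() is called, exactly as handle_input() does above. A hypothetical helper following the same pattern (names are illustrative, not part of the patch):

    const MessageScene = preload("res://examples/simple/message.tscn")

    func spawn_bubble(container: Node, from: String, body: String) -> Message:
        var bubble: Message = MessageScene.instantiate()
        container.add_child(bubble)  # must precede set_text(): %Text resolves on ready
        bubble.sender = from
        bubble.set_text(body)
        return bubble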
method="_on_llama_context_completion_generated"] + +[editable path="Panel/MarginContainer/VBoxContainer/ScrollContainer/MessagesContainer/RichTextLabel2"] From ad808a16dc268877ab6648e34118563b9c55f3b9 Mon Sep 17 00:00:00 2001 From: Hasan Mukhlis Date: Sun, 12 May 2024 12:46:30 +0800 Subject: [PATCH 5/6] add chat formatters --- .../godot-llama-cpp/chat/chat_formatter.gd | 39 +++++++++++++++++++ godot/examples/simple/message.gd | 6 +++ godot/examples/simple/simple.gd | 26 +++++++++---- godot/examples/simple/simple.tscn | 6 ++- llama.cpp | 2 +- src/llama_context.cpp | 4 +- 6 files changed, 70 insertions(+), 13 deletions(-) create mode 100644 godot/addons/godot-llama-cpp/chat/chat_formatter.gd diff --git a/godot/addons/godot-llama-cpp/chat/chat_formatter.gd b/godot/addons/godot-llama-cpp/chat/chat_formatter.gd new file mode 100644 index 0000000..e083cdd --- /dev/null +++ b/godot/addons/godot-llama-cpp/chat/chat_formatter.gd @@ -0,0 +1,39 @@ +class_name ChatFormatter + +static func apply(format: String, messages: Array) -> String: + match format: + "llama3": + return format_llama3(messages) + "phi3": + return format_phi3(messages) + _: + printerr("Unknown chat format: ", format) + return "" + +static func format_llama3(messages: Array) -> String: + var res = "<|begin_of_text|>" + + for i in range(messages.size()): + match messages[i]: + {"text": var text, "sender": var sender}: + res += """<|start_header_id|>%s<|end_header_id|> + +%s<|eot_id|> +""" % [sender, text] + _: + printerr("Invalid message at index ", i) + + res += "<|start_header_id|>assistant<|end_header_id|>\n\n" + return res + +static func format_phi3(messages: Array) -> String: + var res = "" + + for i in range(messages.size()): + match messages[i]: + {"text": var text, "sender": var sender}: + res +="<|%s|>\n%s<|end|>\n" % [sender, text] + _: + printerr("Invalid message at index ", i) + res += "<|assistant|>\n" + return res diff --git a/godot/examples/simple/message.gd b/godot/examples/simple/message.gd index 0b32c6f..63b4f31 100644 --- a/godot/examples/simple/message.gd +++ b/godot/examples/simple/message.gd @@ -4,6 +4,12 @@ extends Node @onready var text_container = %Text @onready var icon = %Panel @export_enum("user", "assistant") var sender: String +@export var include_in_prompt: bool = true +var text: + get: + return text_container.text + set(value): + text_container.text = value var completion_id: int = -1 var pending: bool = false diff --git a/godot/examples/simple/simple.gd b/godot/examples/simple/simple.gd index 8c4766a..56d14bc 100644 --- a/godot/examples/simple/simple.gd +++ b/godot/examples/simple/simple.gd @@ -9,7 +9,16 @@ func _on_text_edit_submit(input: String) -> void: handle_input(input) func handle_input(input: String) -> void: - var completion_id = llama_context.request_completion(input) + var messages = [{ "sender": "system", "text": "You are a helpful assistant" }] + messages.append_array(messages_container.get_children().filter(func(msg: Message): return msg.include_in_prompt).map( + func(msg: Message) -> Dictionary: + return { "text": msg.text, "sender": msg.sender } + )) + messages.append({"text": input, "sender": "user"}) + var prompt = ChatFormatter.apply("phi3", messages) + print("prompt: ", prompt) + + var completion_id = llama_context.request_completion(prompt) var user_message: Message = message.instantiate() messages_container.add_child(user_message) @@ -22,19 +31,20 @@ func handle_input(input: String) -> void: ai_message.sender = "assistant" ai_message.completion_id = completion_id 
 	ai_message.pending = true
+	ai_message.grab_focus()

 func _on_llama_context_completion_generated(chunk: Dictionary) -> void:
 	var completion_id = chunk.id
-	for message: Message in messages_container.get_children():
-		if message.completion_id != completion_id or message.sender != "assistant":
+	for msg: Message in messages_container.get_children():
+		if msg.completion_id != completion_id or msg.sender != "assistant":
 			continue
 		if chunk.has("error"):
-			message.errored = true
+			msg.errored = true
 		elif chunk.has("text"):
-			if message.pending:
-				message.pending = false
-				message.set_text(chunk["text"])
+			if msg.pending:
+				msg.pending = false
+				msg.set_text(chunk["text"])
 			else:
-				message.append_text(chunk["text"])
+				msg.append_text(chunk["text"])
diff --git a/godot/examples/simple/simple.tscn b/godot/examples/simple/simple.tscn
index edc0bca..1ef1c19 100644
--- a/godot/examples/simple/simple.tscn
+++ b/godot/examples/simple/simple.tscn
@@ -1,10 +1,10 @@
 [gd_scene load_steps=6 format=3 uid="uid://c55kb4qvg6geq"]

-[ext_resource type="LlamaModel" path="res://models/Phi-3-mini-128k-instruct.Q5_K_M.gguf" id="1_ff70a"]
 [ext_resource type="Texture2D" uid="uid://dplw232htshgc" path="res://addons/godot-llama-cpp/assets/godot-llama-cpp-1024x1024.svg" id="1_gjsev"]
 [ext_resource type="Script" path="res://examples/simple/simple.gd" id="1_sruc3"]
 [ext_resource type="PackedScene" uid="uid://t862t0v8ht2q" path="res://examples/simple/message.tscn" id="2_7iip7"]
 [ext_resource type="Script" path="res://examples/simple/TextEdit.gd" id="2_7usqw"]
+[ext_resource type="LlamaModel" path="res://models/Phi-3-mini-128k-instruct.Q5_K_M.gguf" id="5_qpeda"]

 [node name="Node" type="Node"]
 script = ExtResource("1_sruc3")
@@ -34,6 +34,7 @@ layout_mode = 2
 [node name="ScrollContainer" type="ScrollContainer" parent="Panel/MarginContainer/VBoxContainer"]
 layout_mode = 2
 size_flags_vertical = 3
+follow_focus = true

 [node name="MessagesContainer" type="VBoxContainer" parent="Panel/MarginContainer/VBoxContainer/ScrollContainer"]
 unique_name_in_owner = true
@@ -44,6 +45,7 @@ theme_override_constants/separation = 30

 [node name="RichTextLabel2" parent="Panel/MarginContainer/VBoxContainer/ScrollContainer/MessagesContainer" instance=ExtResource("2_7iip7")]
 layout_mode = 2
+include_in_prompt = false

 [node name="Text" parent="Panel/MarginContainer/VBoxContainer/ScrollContainer/MessagesContainer/RichTextLabel2" index="1"]
 text = "How can I help you?"
@@ -66,7 +68,7 @@ icon = ExtResource("1_gjsev")
 expand_icon = true

 [node name="LlamaContext" type="LlamaContext" parent="."]
-model = ExtResource("1_ff70a")
+model = ExtResource("5_qpeda")
 unique_name_in_owner = true

 [connection signal="submit" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/TextEdit" to="." method="_on_text_edit_submit"]
diff --git a/llama.cpp b/llama.cpp
index 8c570c9..b228aba 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit 8c570c9496212073079476651c7517c02581101f
+Subproject commit b228aba91ac2cd9eb90e9d423ba1d0d20e0117e2
diff --git a/src/llama_context.cpp b/src/llama_context.cpp
index 4d64442..f448552 100644
--- a/src/llama_context.cpp
+++ b/src/llama_context.cpp
@@ -96,7 +96,7 @@ void LlamaContext::__thread_loop() {
 	UtilityFunctions::print(vformat("%s: Running completion for prompt id: %d", __func__, req.id));

 	std::vector<llama_token> request_tokens;
-	request_tokens = ::llama_tokenize(ctx, req.prompt.utf8().get_data(), true);
+	request_tokens = ::llama_tokenize(ctx, req.prompt.utf8().get_data(), true, true);

 	size_t shared_prefix_idx = 0;
 	auto diff = std::mismatch(context_tokens.begin(), context_tokens.end(), request_tokens.begin(), request_tokens.end());
@@ -166,7 +166,7 @@ void LlamaContext::__thread_loop() {
 		return;
 	}
 	llama_token new_token_id = llama_sampling_sample(sampling_ctx, ctx, NULL, batch.n_tokens - 1);
-	llama_sampling_accept(sampling_ctx, ctx, new_token_id, true);
+	llama_sampling_accept(sampling_ctx, ctx, new_token_id, false);

 	Dictionary response;
 	response["id"] = req.id;
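Patch 5 moves prompt construction out of the engine code: simple.gd now collects the visible Message nodes into an Array of {"text", "sender"} dictionaries and hands them to ChatFormatter.apply(), so each model family's control tokens live in one GDScript file. For the llama3 branch shown above, a call like this sketch (message contents invented for the example):

    var messages = [
        {"sender": "system", "text": "You are a helpful assistant"},
        {"sender": "user", "text": "Hi!"},
    ]
    print(ChatFormatter.apply("llama3", messages))

would print, per format_llama3 above:

    <|begin_of_text|><|start_header_id|>system<|end_header_id|>

    You are a helpful assistant<|eot_id|>
    <|start_header_id|>user<|end_header_id|>

    Hi!<|eot_id|>
    <|start_header_id|>assistant<|end_header_id|>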
method="_on_text_edit_submit"] diff --git a/llama.cpp b/llama.cpp index 8c570c9..b228aba 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit 8c570c9496212073079476651c7517c02581101f +Subproject commit b228aba91ac2cd9eb90e9d423ba1d0d20e0117e2 diff --git a/src/llama_context.cpp b/src/llama_context.cpp index 4d64442..f448552 100644 --- a/src/llama_context.cpp +++ b/src/llama_context.cpp @@ -96,7 +96,7 @@ void LlamaContext::__thread_loop() { UtilityFunctions::print(vformat("%s: Running completion for prompt id: %d", __func__, req.id)); std::vector request_tokens; - request_tokens = ::llama_tokenize(ctx, req.prompt.utf8().get_data(), true); + request_tokens = ::llama_tokenize(ctx, req.prompt.utf8().get_data(), true, true); size_t shared_prefix_idx = 0; auto diff = std::mismatch(context_tokens.begin(), context_tokens.end(), request_tokens.begin(), request_tokens.end()); @@ -166,7 +166,7 @@ void LlamaContext::__thread_loop() { return; } llama_token new_token_id = llama_sampling_sample(sampling_ctx, ctx, NULL, batch.n_tokens - 1); - llama_sampling_accept(sampling_ctx, ctx, new_token_id, true); + llama_sampling_accept(sampling_ctx, ctx, new_token_id, false); Dictionary response; response["id"] = req.id; From 17f069cee2a04f00932bdb01dee0ec05e29b4f75 Mon Sep 17 00:00:00 2001 From: Hasan Mukhlis Date: Thu, 16 May 2024 18:04:55 +0800 Subject: [PATCH 6/6] expose some sampling params --- .../godot-llama-cpp/chat/chat_formatter.gd | 21 +++++- .../addons/godot-llama-cpp/plugin.gdextension | 2 +- godot/examples/simple/simple.gd | 7 +- godot/examples/simple/simple.tscn | 5 +- llama.cpp | 2 +- src/llama_context.cpp | 71 ++++++++++++++++--- src/llama_context.h | 8 +++ 7 files changed, 98 insertions(+), 18 deletions(-) diff --git a/godot/addons/godot-llama-cpp/chat/chat_formatter.gd b/godot/addons/godot-llama-cpp/chat/chat_formatter.gd index e083cdd..84d8369 100644 --- a/godot/addons/godot-llama-cpp/chat/chat_formatter.gd +++ b/godot/addons/godot-llama-cpp/chat/chat_formatter.gd @@ -6,12 +6,14 @@ static func apply(format: String, messages: Array) -> String: return format_llama3(messages) "phi3": return format_phi3(messages) + "mistral": + return format_mistral(messages) _: printerr("Unknown chat format: ", format) return "" static func format_llama3(messages: Array) -> String: - var res = "<|begin_of_text|>" + var res = "" for i in range(messages.size()): match messages[i]: @@ -27,7 +29,7 @@ static func format_llama3(messages: Array) -> String: return res static func format_phi3(messages: Array) -> String: - var res = "" + var res = "" for i in range(messages.size()): match messages[i]: @@ -37,3 +39,18 @@ static func format_phi3(messages: Array) -> String: printerr("Invalid message at index ", i) res += "<|assistant|>\n" return res + +static func format_mistral(messages: Array) -> String: + var res = "" + + for i in range(messages.size()): + match messages[i]: + {"text": var text, "sender": var sender}: + if sender == "user": + res += "[INST] %s [/INST]" % text + else: + res += "%s" + _: + printerr("Invalid message at index ", i) + + return res diff --git a/godot/addons/godot-llama-cpp/plugin.gdextension b/godot/addons/godot-llama-cpp/plugin.gdextension index 75c8d09..9f2561c 100644 --- a/godot/addons/godot-llama-cpp/plugin.gdextension +++ b/godot/addons/godot-llama-cpp/plugin.gdextension @@ -5,7 +5,7 @@ compatibility_minimum = "4.2" [libraries] -macos.debug = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp-aarch64-macos-none-Debug.dylib" +macos.debug = 
"res://addons/godot-llama-cpp/lib/libgodot-llama-cpp-aarch64-macos-none-ReleaseSafe.dylib" macos.release = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp-aarch64-macos-none-ReleaseSafe.dylib" windows.debug.x86_32 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.windows.template_debug.x86_32.dll" windows.release.x86_32 = "res://addons/godot-llama-cpp/lib/libgodot-llama-cpp.windows.template_release.x86_32.dll" diff --git a/godot/examples/simple/simple.gd b/godot/examples/simple/simple.gd index 56d14bc..4519f5d 100644 --- a/godot/examples/simple/simple.gd +++ b/godot/examples/simple/simple.gd @@ -9,13 +9,16 @@ func _on_text_edit_submit(input: String) -> void: handle_input(input) func handle_input(input: String) -> void: - var messages = [{ "sender": "system", "text": "You are a helpful assistant" }] + #var messages = [{ "sender": "system", "text": "You are a pirate chatbot who always responds in pirate speak!" }] + + #var messages = [{ "sender": "system", "text": "You are a helpful chatbot assistant!" }] + var messages = [] messages.append_array(messages_container.get_children().filter(func(msg: Message): return msg.include_in_prompt).map( func(msg: Message) -> Dictionary: return { "text": msg.text, "sender": msg.sender } )) messages.append({"text": input, "sender": "user"}) - var prompt = ChatFormatter.apply("phi3", messages) + var prompt = ChatFormatter.apply("llama3", messages) print("prompt: ", prompt) var completion_id = llama_context.request_completion(prompt) diff --git a/godot/examples/simple/simple.tscn b/godot/examples/simple/simple.tscn index 1ef1c19..50b193d 100644 --- a/godot/examples/simple/simple.tscn +++ b/godot/examples/simple/simple.tscn @@ -4,7 +4,7 @@ [ext_resource type="Script" path="res://examples/simple/simple.gd" id="1_sruc3"] [ext_resource type="PackedScene" uid="uid://t862t0v8ht2q" path="res://examples/simple/message.tscn" id="2_7iip7"] [ext_resource type="Script" path="res://examples/simple/TextEdit.gd" id="2_7usqw"] -[ext_resource type="LlamaModel" path="res://models/Phi-3-mini-128k-instruct.Q5_K_M.gguf" id="5_qpeda"] +[ext_resource type="LlamaModel" path="res://models/meta-llama-3-8b-instruct.Q5_K_M.gguf" id="5_qov1l"] [node name="Node" type="Node"] script = ExtResource("1_sruc3") @@ -68,7 +68,8 @@ icon = ExtResource("1_gjsev") expand_icon = true [node name="LlamaContext" type="LlamaContext" parent="."] -model = ExtResource("5_qpeda") +model = ExtResource("5_qov1l") +temperature = 0.9 unique_name_in_owner = true [connection signal="submit" from="Panel/MarginContainer/VBoxContainer/HBoxContainer/TextEdit" to="." 
method="_on_text_edit_submit"] diff --git a/llama.cpp b/llama.cpp index b228aba..9afdffe 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit b228aba91ac2cd9eb90e9d423ba1d0d20e0117e2 +Subproject commit 9afdffe70ebf3166d429b4434783bb0b7f97bdeb diff --git a/src/llama_context.cpp b/src/llama_context.cpp index f448552..0fcfc2f 100644 --- a/src/llama_context.cpp +++ b/src/llama_context.cpp @@ -21,6 +21,22 @@ void LlamaContext::_bind_methods() { ClassDB::bind_method(D_METHOD("set_seed", "seed"), &LlamaContext::set_seed); ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "seed"), "set_seed", "get_seed"); + ClassDB::bind_method(D_METHOD("get_temperature"), &LlamaContext::get_temperature); + ClassDB::bind_method(D_METHOD("set_temperature", "temperature"), &LlamaContext::set_temperature); + ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "temperature"), "set_temperature", "get_temperature"); + + ClassDB::bind_method(D_METHOD("get_top_p"), &LlamaContext::get_top_p); + ClassDB::bind_method(D_METHOD("set_top_p", "top_p"), &LlamaContext::set_top_p); + ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "top_p"), "set_top_p", "get_top_p"); + + ClassDB::bind_method(D_METHOD("get_frequency_penalty"), &LlamaContext::get_frequency_penalty); + ClassDB::bind_method(D_METHOD("set_frequency_penalty", "frequency_penalty"), &LlamaContext::set_frequency_penalty); + ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "frequency_penalty"), "set_frequency_penalty", "get_frequency_penalty"); + + ClassDB::bind_method(D_METHOD("get_presence_penalty"), &LlamaContext::get_presence_penalty); + ClassDB::bind_method(D_METHOD("set_presence_penalty", "presence_penalty"), &LlamaContext::set_presence_penalty); + ClassDB::add_property("LlamaContext", PropertyInfo(Variant::FLOAT, "presence_penalty"), "set_presence_penalty", "get_presence_penalty"); + ClassDB::bind_method(D_METHOD("get_n_ctx"), &LlamaContext::get_n_ctx); ClassDB::bind_method(D_METHOD("set_n_ctx", "n_ctx"), &LlamaContext::set_n_ctx); ClassDB::add_property("LlamaContext", PropertyInfo(Variant::INT, "n_ctx"), "set_n_ctx", "get_n_ctx"); @@ -106,13 +122,13 @@ void LlamaContext::__thread_loop() { shared_prefix_idx = std::min(context_tokens.size(), request_tokens.size()); } - bool rm_success = llama_kv_cache_seq_rm(ctx, 0, shared_prefix_idx, -1); + bool rm_success = llama_kv_cache_seq_rm(ctx, -1, shared_prefix_idx, -1); if (!rm_success) { UtilityFunctions::printerr(vformat("%s: Failed to remove tokens from kv cache", __func__)); Dictionary response; response["id"] = req.id; response["error"] = "Failed to remove tokens from kv cache"; - call_deferred("emit_signal", "completion_generated", response); + call_thread_safe("emit_signal", "completion_generated", response); continue; } context_tokens.erase(context_tokens.begin() + shared_prefix_idx, context_tokens.end()); @@ -128,6 +144,14 @@ void LlamaContext::__thread_loop() { sequences.push_back(std::vector(request_tokens.begin() + i, request_tokens.begin() + std::min(i + batch_size, request_tokens.size()))); } + printf("Request tokens: \n"); + for (auto sequence : sequences) { + for (auto token : sequence) { + printf("%s", llama_token_to_piece(ctx, token).c_str()); + } + } + printf("\n"); + int curr_token_pos = context_tokens.size(); bool decode_failed = false; @@ -155,7 +179,7 @@ void LlamaContext::__thread_loop() { Dictionary response; response["id"] = req.id; response["error"] = "llama_decode() failed"; - call_deferred("emit_signal", 
"completion_generated", response); + call_thread_safe("emit_signal", "completion_generated", response); continue; } @@ -171,17 +195,17 @@ void LlamaContext::__thread_loop() { Dictionary response; response["id"] = req.id; + context_tokens.push_back(new_token_id); + if (llama_token_is_eog(model->model, new_token_id) || curr_token_pos == n_len) { response["done"] = true; - call_deferred("emit_signal", "completion_generated", response); + call_thread_safe("emit_signal", "completion_generated", response); break; } - context_tokens.push_back(new_token_id); - response["text"] = llama_token_to_piece(ctx, new_token_id).c_str(); response["done"] = false; - call_deferred("emit_signal", "completion_generated", response); + call_thread_safe("emit_signal", "completion_generated", response); llama_batch_clear(batch); @@ -199,11 +223,9 @@ void LlamaContext::__thread_loop() { Dictionary response; response["id"] = req.id; response["error"] = "llama_decode() failed"; - call_deferred("emit_signal", "completion_generated", response); + call_thread_safe("emit_signal", "completion_generated", response); continue; } - - llama_sampling_reset(sampling_ctx); } } @@ -258,6 +280,34 @@ void LlamaContext::set_n_len(int n_len) { this->n_len = n_len; } +float LlamaContext::get_temperature() { + return sampling_params.temp; +} +void LlamaContext::set_temperature(float temperature) { + sampling_params.temp = temperature; +} + +float LlamaContext::get_top_p() { + return sampling_params.top_p; +} +void LlamaContext::set_top_p(float top_p) { + sampling_params.top_p = top_p; +} + +float LlamaContext::get_frequency_penalty() { + return sampling_params.penalty_freq; +} +void LlamaContext::set_frequency_penalty(float frequency_penalty) { + sampling_params.penalty_freq = frequency_penalty; +} + +float LlamaContext::get_presence_penalty() { + return sampling_params.penalty_present; +} +void LlamaContext::set_presence_penalty(float presence_penalty) { + sampling_params.penalty_present = presence_penalty; +} + void LlamaContext::_exit_tree() { if (Engine::get_singleton()->is_editor_hint()) { return; @@ -275,5 +325,6 @@ void LlamaContext::_exit_tree() { llama_free(ctx); } + llama_sampling_free(sampling_ctx); llama_backend_free(); } \ No newline at end of file diff --git a/src/llama_context.h b/src/llama_context.h index 309f8f1..07bfd3e 100644 --- a/src/llama_context.h +++ b/src/llama_context.h @@ -51,6 +51,14 @@ class LlamaContext : public Node { void set_n_ctx(int n_ctx); int get_n_len(); void set_n_len(int n_len); + float get_temperature(); + void set_temperature(float temperature); + float get_top_p(); + void set_top_p(float top_p); + float get_frequency_penalty(); + void set_frequency_penalty(float frequency_penalty); + float get_presence_penalty(); + void set_presence_penalty(float presence_penalty); virtual PackedStringArray _get_configuration_warnings() const override; virtual void _ready() override;