Commit d5d1781
Committed Mar 6, 2024

Fix LlamaContext initialization and batch handling

1 parent 34d83c2 · commit d5d1781
2 files changed: +11 −6 lines changed
 

src/llama_context.cpp (+11 −5)
@@ -30,6 +30,7 @@ void LlamaContext::_ready() {
 		return;
 	}
 
+	ctx_params.seed = -1;
 	ctx_params.n_ctx = 2048;
 	int32_t n_threads = OS::get_singleton()->get_processor_count();
 	ctx_params.n_threads = n_threads;
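The new ctx_params.seed = -1; stores an all-ones value in llama.cpp's unsigned seed field, which the llama.cpp of this period (LLAMA_DEFAULT_SEED) interprets as "pick a random seed per context". A natural follow-up, sketched below purely as an assumption and not part of this commit, would be exposing the seed as a property so completions can be reproduced; set_seed/get_seed are hypothetical names:

// Hypothetical accessors (not in this commit): let scripts pin the RNG seed
// for reproducible sampling, keeping -1 as "randomize".
void LlamaContext::set_seed(const int p_seed) {
	ctx_params.seed = p_seed; // takes effect when the context is created
}

int LlamaContext::get_seed() const {
	return (int)ctx_params.seed;
}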
@@ -45,9 +46,9 @@ void LlamaContext::_ready() {
 
 Variant LlamaContext::request_completion(const String &prompt) {
 	UtilityFunctions::print(vformat("%s: Requesting completion for prompt: %s", __func__, prompt));
-	if (task_id) {
-		WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
-	}
+	if (task_id) {
+		WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
+	}
 	task_id = WorkerThreadPool::get_singleton()->add_task(Callable(this, "_fulfill_completion").bind(prompt));
 	return OK;
 }
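The replaced lines above are textually identical in this extraction, so the change appears to be whitespace-only; the logic is unchanged: before queuing a new completion, any in-flight task is joined so only one _fulfill_completion runs at a time. A minimal standalone sketch of that wait-then-enqueue pattern, assuming Godot 4's WorkerThreadPool API and a zero sentinel for "no task yet":

#include <godot_cpp/classes/worker_thread_pool.hpp>

using namespace godot;

// Sketch (not from the repo): serialize background jobs through the pool.
// Godot's WorkerThreadPool hands out positive task ids, so 0 can serve as a
// "nothing in flight" sentinel.
static int64_t pending_task = 0;

void run_serialized(const Callable &p_job) {
	WorkerThreadPool *pool = WorkerThreadPool::get_singleton();
	if (pending_task != 0) {
		pool->wait_for_task_completion(pending_task); // join the previous job
	}
	pending_task = pool->add_task(p_job);
}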
@@ -65,9 +66,12 @@ void LlamaContext::_fulfill_completion(const String &prompt) {
 		return;
 	}
 
+	llama_batch batch = llama_batch_init(tokens_list.size(), 0, 1);
+
 	for (size_t i = 0; i < tokens_list.size(); i++) {
 		llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
 	}
+
 	batch.logits[batch.n_tokens - 1] = true;
 
 	int decode_res = llama_decode(ctx, batch);
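This hunk creates the batch inside the request and sizes it to the tokenized prompt rather than relying on the fixed 512-token member it replaces. For context: in llama_batch_init(n_tokens, embd, n_seq_max), n_tokens is the capacity, embd == 0 means the batch carries token ids rather than embeddings, and n_seq_max caps the sequence ids per token; every init must be paired with llama_batch_free. A compact sketch of that lifecycle, assuming the llama.cpp of this commit's era and its common-helper llama_batch_add:

#include "common.h" // llama_batch_add lives in llama.cpp's common helpers
#include "llama.h"

#include <vector>

// Sketch of the init/fill/decode/free lifecycle the hunk follows.
void decode_prompt(llama_context *ctx, const std::vector<llama_token> &tokens) {
	llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
	for (size_t i = 0; i < tokens.size(); i++) {
		// arguments: token id, position, sequence ids, keep logits?
		llama_batch_add(batch, tokens[i], i, { 0 }, false);
	}
	batch.logits[batch.n_tokens - 1] = true; // only the last token needs logits
	llama_decode(ctx, batch);
	llama_batch_free(batch); // pairs with llama_batch_init
}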
@@ -79,6 +83,7 @@ void LlamaContext::_fulfill_completion(const String &prompt) {
 	int n_cur = batch.n_tokens;
 	int n_decode = 0;
 	llama_model *llama_model = model->model;
+
 	while (n_cur <= n_len) {
 		// sample the next token
 		{
@@ -121,9 +126,11 @@ void LlamaContext::_fulfill_completion(const String &prompt) {
 		int decode_res = llama_decode(ctx, batch);
 		if (decode_res != 0) {
 			UtilityFunctions::printerr(vformat("%s: Failed to decode batch with error code: %d", __func__, decode_res));
-			return;
+			break;
 		}
 	}
+
+	llama_batch_free(batch);
 }
 
 void LlamaContext::set_model(const Ref<LlamaModel> p_model) {
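Switching return to break is what makes the new llama_batch_free(batch) at the end of _fulfill_completion run even when a decode fails, closing the leak on the error path. An alternative worth noting, shown only as a sketch and not something this commit does, is an RAII guard that frees the batch on every exit path, which would let the error branch keep its early return:

#include "llama.h"

// Sketch (not in this commit): tie the batch's lifetime to a scope so early
// returns cannot leak it.
struct BatchGuard {
	llama_batch batch;

	BatchGuard(int32_t n_tokens, int32_t embd, int32_t n_seq_max) :
			batch(llama_batch_init(n_tokens, embd, n_seq_max)) {}
	~BatchGuard() { llama_batch_free(batch); }

	BatchGuard(const BatchGuard &) = delete;
	BatchGuard &operator=(const BatchGuard &) = delete;
};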
@@ -138,7 +145,6 @@ LlamaContext::~LlamaContext() {
 	if (ctx) {
 		llama_free(ctx);
 	}
-	llama_batch_free(batch);
 	if (task_id) {
 		WorkerThreadPool::get_singleton()->wait_for_task_completion(task_id);
 	}

src/llama_context.h (−1)
@@ -13,7 +13,6 @@ class LlamaContext : public Node {
 	Ref<LlamaModel> model;
 	llama_context *ctx = nullptr;
 	llama_context_params ctx_params = llama_context_default_params();
-	llama_batch batch = llama_batch_init(512, 0, 1);
 	int task_id;
 
 protected:
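With the batch now created per request in _fulfill_completion, the header drops the member that eagerly allocated 512 token slots whenever a LlamaContext was constructed, and the matching llama_batch_free leaves the destructor (see the .cpp hunks above). The resulting member layout, reconstructed from this hunk:

Ref<LlamaModel> model;
llama_context *ctx = nullptr;
llama_context_params ctx_params = llama_context_default_params();
int task_id; // still uninitialized here; initializing to 0 would make the
             // if (task_id) sentinel check in request_completion reliable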
