@@ -2,7 +2,12 @@
 import os.path
 
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    pipeline,
+)
 
 
 class LLama3(object):
@@ -30,14 +35,17 @@ def __init__(self, accelerate_engine: str = "cuda", debug: bool = False) -> None
             bnb_4bit_use_double_quant=True,
             bnb_4bit_compute_dtype=torch.bfloat16,
         )
-        self.model = AutoModelForCausalLM.from_pretrained(
+        self.model_4bit = AutoModelForCausalLM.from_pretrained(
             self.model_id, quantization_config=bnb_config, device_map="auto"
         )
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_id, add_special_tokens=True
         )
         self.tokenizer.pad_token = self.tokenizer.eos_token
         self.tokenizer.padding_side = "right"
+        self.pipe = pipeline(
+            "text-generation", model=self.model_4bit, tokenizer=self.tokenizer
+        )
 
     def load_prompt(self) -> list[dict[str, str]]:
         # Get Hakase Project Path
@@ -55,10 +63,10 @@ def generate_instruction(self, instruction: str) -> None:
 
     def generate_text(self, instruction: str) -> str:
         self.generate_instruction(instruction=instruction)
-        prompt = self.tokenizer.apply_chat_template(
+        prompt = self.pipe.tokenizer.apply_chat_template(
             self.prompt, tokenize=False, add_generation_prompt=True
         )
-        outputs = self.model.generate(
+        outputs = self.pipe(
             prompt,
             do_sample=True,
             temperature=0.4,
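
For context, a minimal standalone sketch of the pattern this change adopts: a 4-bit quantized causal LM wrapped in a transformers text-generation pipeline, with the chat prompt built via apply_chat_template. The model id and generation parameters below are illustrative placeholders, not values taken from this repository.

# Sketch of the pipeline-based generation path introduced above.
# NOTE: model id and generation settings are placeholder assumptions.
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder model id

# 4-bit quantization config, mirroring the bnb_config built in __init__
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Wrap model and tokenizer in a single text-generation pipeline
pipe = pipeline("text-generation", model=model_4bit, tokenizer=tokenizer)

# Build the chat-formatted prompt string, then let the pipeline tokenize,
# generate, and decode in one call
messages = [{"role": "user", "content": "Say hello in one sentence."}]
prompt = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
outputs = pipe(prompt, do_sample=True, temperature=0.4, max_new_tokens=64)
print(outputs[0]["generated_text"])

One practical effect of routing generation through the pipeline is that tokenization and decoding are handled inside the pipeline call, and pipe.tokenizer is guaranteed to be the same tokenizer the model is served with.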