@@ -2,7 +2,12 @@
 import os.path
 
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    pipeline,
+)
 
 
 class LLama3(object):
@@ -30,14 +35,17 @@ def __init__(self, accelerate_engine: str = "cuda", debug: bool = False) -> None
             bnb_4bit_use_double_quant=True,
             bnb_4bit_compute_dtype=torch.bfloat16,
         )
-        self.model = AutoModelForCausalLM.from_pretrained(
+        self.model_4bit = AutoModelForCausalLM.from_pretrained(
             self.model_id, quantization_config=bnb_config, device_map="auto"
         )
         self.tokenizer = AutoTokenizer.from_pretrained(
             self.model_id, add_special_tokens=True
         )
         self.tokenizer.pad_token = self.tokenizer.eos_token
         self.tokenizer.padding_side = "right"
+        self.pipe = pipeline(
+            "text-generation", model=self.model_4bit, tokenizer=self.tokenizer
+        )
 
     def load_prompt(self) -> list[dict[str, str]]:
         # Get Hakase Project Path
@@ -55,10 +63,10 @@ def generate_instruction(self, instruction: str) -> None:
 
     def generate_text(self, instruction: str) -> str:
         self.generate_instruction(instruction=instruction)
-        prompt = self.tokenizer.apply_chat_template(
+        prompt = self.pipe.tokenizer.apply_chat_template(
             self.prompt, tokenize=False, add_generation_prompt=True
         )
-        outputs = self.model.generate(
+        outputs = self.pipe(
             prompt,
             do_sample=True,
             temperature=0.4,
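
For context, a minimal standalone sketch of the pattern this change adopts: a 4-bit quantized causal LM wrapped in a transformers text-generation pipeline, with the chat prompt built via apply_chat_template. The model id and generation parameters below are illustrative placeholders, not values taken from this repository.

# Sketch of the pipeline-based generation path introduced above.
# NOTE: model id and generation settings are placeholder assumptions.
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder model id

# 4-bit quantization config, mirroring the bnb_config built in __init__
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Wrap model and tokenizer in a single text-generation pipeline
pipe = pipeline("text-generation", model=model_4bit, tokenizer=tokenizer)

# Build the chat-formatted prompt string, then let the pipeline tokenize,
# generate, and decode in one call
messages = [{"role": "user", "content": "Say hello in one sentence."}]
prompt = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
outputs = pipe(prompt, do_sample=True, temperature=0.4, max_new_tokens=64)
print(outputs[0]["generated_text"])

One practical effect of routing generation through the pipeline is that tokenization and decoding are handled inside the pipeline call, and pipe.tokenizer is guaranteed to be the same tokenizer the model is served with.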