import gc
import threading

from llama_cpp import Llama

class LlmBackend:
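    """Singleton wrapper around llama_cpp.Llama.

    Serializes model loading, unloading and generation behind a single lock and
    implements the Saiga chat prompt format on top of the raw token API.
    """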
    
    # Default system prompt (in Russian): "You are a Russian-language automatic
    # assistant. You answer the user's requests as accurately as possible, using Russian."
    SYSTEM_PROMPT = "Ты — русскоязычный автоматический ассистент. Ты максимально точно отвечаешь на запросы пользователя, используя русский язык."

    # Special token ids used by the Saiga prompt format.
    SYSTEM_TOKEN = 1788
    USER_TOKEN = 1404
    BOT_TOKEN = 9225
    LINEBREAK_TOKEN = 13

    ROLE_TOKENS = {
        "user": USER_TOKEN,
        "bot": BOT_TOKEN,
        "system": SYSTEM_TOKEN
    }

    _instance = None
    _model = None
    _lock = threading.Lock()
    
    def __new__(cls):
        # Lazily create the single shared instance (thread-safe).
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
        return cls._instance
    
    
    def is_model_loaded(self):
        return self._model is not None
    
    def load_model(self, model_path, context_size=2000, enable_gpu=True, gpu_layer_number=35, n_gqa=8, chat_format='llama-2'):
        # Release any previously loaded model before loading a new one.
        if self._model is not None:
            self.unload_model()

        with self._lock:
            self._model = Llama(
                model_path=model_path,
                chat_format=chat_format,
                n_ctx=context_size,
                n_parts=1,
                #n_batch=100,
                logits_all=True,
                #n_threads=12,
                verbose=True,
                # Offload layers to the GPU only when requested; 0 keeps inference on the CPU.
                n_gpu_layers=gpu_layer_number if enable_gpu else 0,
                n_gqa=n_gqa  # must be set for 70B models
            )
            return self._model
        
    def set_system_prompt(self, prompt):
        with self._lock:
            self.SYSTEM_PROMPT = prompt
        
    def unload_model(self):
        with self._lock:
            if self._model is not None:
                # Dropping the last reference lets llama_cpp release the model's
                # resources; collect immediately so the memory is freed right away.
                self._model = None
                gc.collect()
            
    def generate_tokens(self, generator):
        print('generate_tokens called')
        with self._lock:
            print('generate_tokens started')
            try:
                for token in generator:            
                    if token == self._model.token_eos():
                        print('End generating')
                        yield b''  # End of chunk
                        break
                        
                    # detokenize() returns raw UTF-8 bytes; decoding is left to the caller.
                    yield self._model.detokenize([token])
            except Exception as e:
                print('generator exception')
                print(e)
                yield b''  # End of chunk
                
    def create_chat_completion(self, messages, stream=True):
        print('create_chat_completion called')
        with self._lock:
            print('create_chat_completion started')
            try:
                return self._model.create_chat_completion(messages=messages, stream=stream)
            except Exception as e:
                print('create_chat_completion exception')
                print(e)
                return None
                
    
    def get_message_tokens(self, role, content):
        # tokenize() prepends BOS, so the role token and a linebreak are inserted
        # right after it; the message is terminated with EOS (Saiga message layout).
        message_tokens = self._model.tokenize(content.encode("utf-8"))
        message_tokens.insert(1, self.ROLE_TOKENS[role])
        message_tokens.insert(2, self.LINEBREAK_TOKEN)
        message_tokens.append(self._model.token_eos())
        return message_tokens

    def get_system_tokens(self):
        return self.get_message_tokens(role="system", content=self.SYSTEM_PROMPT)
    
    def create_chat_generator_for_saiga(self, messages, parameters):
        print('create_chat_generator_for_saiga called')
        with self._lock:
            tokens = self.get_system_tokens()
            for message in messages:
                message_tokens = self.get_message_tokens(role=message.get("from"), content=message.get("content", ""))
                tokens.extend(message_tokens)
            
            tokens.extend([self._model.token_bos(), self.BOT_TOKEN, self.LINEBREAK_TOKEN])
            generator = self._model.generate(
                tokens,
                top_k=parameters['top_k'],
                top_p=parameters['top_p'],
                temp=parameters['temperature'],
                repeat_penalty=parameters['repetition_penalty']
            )
            return generator
        
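
# --- Usage sketch (illustrative only) ---------------------------------------
# A minimal example of how this singleton backend might be driven end to end:
# load a local model, build a Saiga-style prompt, stream the reply, then unload.
# The model path, message text and sampling parameters below are hypothetical
# placeholders, not values taken from this repository.
if __name__ == "__main__":
    backend = LlmBackend()
    backend.load_model(
        model_path="path/to/model.bin",  # hypothetical path
        context_size=2000,
        enable_gpu=False,
    )

    messages = [{"from": "user", "content": "Привет! Кто ты?"}]
    parameters = {
        "top_k": 30,
        "top_p": 0.9,
        "temperature": 0.2,
        "repetition_penalty": 1.1,
    }

    generator = backend.create_chat_generator_for_saiga(messages, parameters)
    for chunk in backend.generate_tokens(generator):
        # Chunks are raw UTF-8 bytes; an empty chunk marks the end of the stream.
        print(chunk.decode("utf-8", errors="ignore"), end="", flush=True)
    print()

    backend.unload_model()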