muryshev committed on
Commit a4dc558 · 1 Parent(s): 1f831d0

Another bunch of fixes

Files changed (2)
  1. app.py +18 -38
  2. llm_backend.py +53 -15
app.py CHANGED
@@ -11,12 +11,16 @@ from apscheduler.schedulers.background import BackgroundScheduler
from datetime import datetime, timedelta
from llm_backend import LlmBackend
import json
+import log
+import sys

llm = LlmBackend()
_lock = threading.Lock()

SYSTEM_PROMPT = os.environ.get('SYSTEM_PROMPT') or "Ты — русскоязычный автоматический ассистент. Ты максимально точно и отвечаешь на запросы пользователя, используя русский язык."
-CONTEXT_SIZE = os.environ.get('CONTEXT_SIZE') or 500
+CONTEXT_SIZE = os.environ.get('CONTEXT_SIZE') or 500
+HF_CACHE_DIR = os.environ.get('HF_CACHE_DIR') or '/root/.cache'
+USE_SYSTEM_PROMPT = os.environ.get('USE_SYSTEM_PROMPT') or False
ENABLE_GPU = os.environ.get('ENABLE_GPU') or False
GPU_LAYERS = os.environ.get('GPU_LAYERS') or 0
N_GQA = os.environ.get('N_GQA') or None #must be set to 8 for 70b models
@@ -24,9 +28,12 @@ CHAT_FORMAT = os.environ.get('CHAT_FORMAT') or 'llama-2'

# Create a lock object
lock = threading.Lock()
-app = Flask(__name__)
-# Configure Flask logging
+app = Flask('llm_api')

+app.logger.handlers.clear()
+handler = logging.StreamHandler(sys.stdout)
+handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+app.logger.addHandler(handler)
app.logger.setLevel(logging.DEBUG)

# Variable to store the last request time
@@ -51,7 +58,7 @@ if os.path.isdir('/data'):

model = None

-MODEL_PATH = snapshot_download(repo_id=repo_name, allow_patterns=model_name) + '/' + model_name
+MODEL_PATH = snapshot_download(repo_id=repo_name, allow_patterns=model_name, cache_dir=HF_CACHE_DIR) + '/' + model_name
app.logger.info('Model path: ' + MODEL_PATH)

DATASET_REPO_URL = "https://huggingface.co/datasets/muryshev/saiga-chat"
@@ -81,25 +88,6 @@ app.logger.info("hfh: "+huggingface_hub.__version__)
# commit_url = repo.push_to_hub()
# app.logger.info(commit_url)

-def generate_tokens(model, generator):
-    global stop_generation
-    app.logger.info('generate_tokens started')
-    with lock:
-        try:
-            for token in generator:
-                if token == model.token_eos() or stop_generation:
-                    stop_generation = False
-                    app.logger.info('End generating')
-                    yield b'' # End of chunk
-                    break
-
-                token_str = model.detokenize([token])#.decode("utf-8", errors="ignore")
-                yield token_str
-        except Exception as e:
-            app.logger.info('generator exception')
-            app.logger.info(e)
-            yield b'' # End of chunk
-
@app.route('/change_context_size', methods=['GET'])
def handler_change_context_size():
    global stop_generation, model
@@ -142,12 +130,7 @@ def generate_and_log_tokens(user_request, generator):
@app.route('/', methods=['POST'])
def generate_response():

-    app.logger.info('generate_response')
-    with _lock:
-        if not llm.is_model_loaded():
-            app.logger.info('model loading')
-            init_model()
-
+    app.logger.info('generate_response called')
    data = request.get_json()
    app.logger.info(data)
    messages = data.get("messages", [])
@@ -165,12 +148,9 @@ def generate_response():
        'return_full_text': parameters.get("return_full_text", False)
    }

-    generator = llm.create_chat_generator_for_saiga(messages=messages, parameters=p)
+    generator = llm.create_chat_generator_for_saiga(messages=messages, parameters=p, use_system_prompt=USE_SYSTEM_PROMPT)
    app.logger.info('Generator created')

-
-
-
    # Use Response to stream tokens
    return Response(generate_and_log_tokens(user_request='1', generator=generator), content_type='text/plain', status=200, direct_passthrough=True)

@@ -182,7 +162,6 @@ def check_last_request_time():
    global last_request_time
    current_time = datetime.now()
    if (current_time - last_request_time).total_seconds() > 300: # 5 minutes in seconds
-        # Perform the action (e.g., set a variable)
        llm.unload_model()
        app.logger.info(f"Model unloaded at {current_time}")
    else:
@@ -190,10 +169,11 @@


if __name__ == "__main__":
-    scheduler = BackgroundScheduler()
-    scheduler.add_job(check_last_request_time, trigger='interval', minutes=1)
-    scheduler.start()

    init_model()

-    app.run(host="0.0.0.0", port=7860, debug=True, threaded=True)
+    app.run(host="0.0.0.0", port=7860, debug=True, threaded=True)
+
+    scheduler = BackgroundScheduler()
+    scheduler.add_job(check_last_request_time, trigger='interval', minutes=1)
+    scheduler.start()
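
For reference, a minimal client sketch (not part of the commit) for the streaming endpoint changed above. The request shape — a "messages" list with "from"/"content" entries plus a "parameters" dict — and the host/port follow the handler and the app.run call in this diff; the prompt text and sampling values are placeholder assumptions, and the server may read additional parameter keys not visible in these hunks.

# Client sketch: POST a chat request and print the streamed tokens as they arrive.
# Host/port match app.run(); payload keys mirror what generate_response() reads.
import requests  # assumed available; any HTTP client with streaming support works

payload = {
    "messages": [
        {"from": "user", "content": "Привет! Расскажи анекдот."}  # placeholder prompt
    ],
    "parameters": {
        "temperature": 0.2,            # placeholder sampling values
        "repetition_penalty": 1.1,
        "return_full_text": False,
    },
}

with requests.post("http://localhost:7860/", json=payload, stream=True) as resp:
    # The server streams plain-text chunks from generate_and_log_tokens();
    # an empty chunk marks the end of generation.
    for chunk in resp.iter_content(chunk_size=None):
        if chunk:
            print(chunk.decode("utf-8", errors="ignore"), end="", flush=True)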
llm_backend.py CHANGED
@@ -1,7 +1,11 @@
from llama_cpp import Llama
import gc
import threading
+import logging
+import sys

+log = logging.getLogger('llm_api.backend')
+
class LlmBackend:

    SYSTEM_PROMPT = "Ты — русскоязычный автоматический ассистент. Ты максимально точно и отвечаешь на запросы пользователя, используя русский язык."
@@ -18,6 +22,7 @@ class LlmBackend:

    _instance = None
    _model = None
+    _model_params = None
    _lock = threading.Lock()

    def __new__(cls):
@@ -30,6 +35,14 @@ class LlmBackend:
        return self._model is not None

    def load_model(self, model_path, context_size=2000, enable_gpu=True, gpu_layer_number=35, n_gqa=8, chat_format='llama-2'):
+        log.info('load_model - started')
+        self._model_params = {}
+        self._model_params['model_path'] = model_path
+        self._model_params['context_size'] = context_size
+        self._model_params['enable_gpu'] = enable_gpu
+        self._model_params['gpu_layer_number'] = gpu_layer_number
+        self._model_params['n_gqa'] = n_gqa
+        self._model_params['chat_format'] = chat_format

        if self._model is not None:
            self.unload_model()
@@ -44,10 +57,11 @@ class LlmBackend:
                #n_batch=100,
                logits_all=True,
                #n_threads=12,
-                verbose=True,
+                verbose=False,
                n_gpu_layers=gpu_layer_number,
                n_gqa=n_gqa #must be set for 70b models
            )
+            log.info('load_model - finished')
            return self._model
        else:
            self._model = Llama(
@@ -58,9 +72,10 @@ class LlmBackend:
                #n_batch=100,
                logits_all=True,
                #n_threads=12,
-                verbose=True,
+                verbose=False,
                n_gqa=n_gqa #must be set for 70b models
            )
+            log.info('load_model - finished')
            return self._model

    def set_system_prompt(self, prompt):
@@ -68,54 +83,71 @@ class LlmBackend:
        self.SYSTEM_PROMPT = prompt

    def unload_model(self):
+        log.info('unload_model - started')
        with self._lock:
            if self._model is not None:
                del self._model
-
+        log.info('unload_model - finished')
+
+    def ensure_model_is_loaded(self):
+        log.info('ensure_model_is_loaded - started')
+        if not self.is_model_loaded():
+            log.info('ensure_model_is_loaded - model reloading')
+            if self._model_params is not None:
+                self.load_model(**self._model_params)
+            else:
+                log.info('ensure_model_is_loaded - No model config found. Reloading can not be done.')
+        log.info('ensure_model_is_loaded - finished')
+
    def generate_tokens(self, generator):
-        print('generate_tokens called')
+        log.info('generate_tokens - started')
        with self._lock:
-            print('generate_tokens started')
+            self.ensure_model_is_loaded()
+
            try:
                for token in generator:
                    if token == self._model.token_eos():
-                        print('End generating')
+                        log.info('generate_tokens - finished')
                        yield b'' # End of chunk
                        break

                    token_str = self._model.detokenize([token])#.decode("utf-8", errors="ignore")
                    yield token_str
            except Exception as e:
-                print('generator exception')
-                print(e)
+                log.error('generate_tokens - error')
+                log.error(e)
                yield b'' # End of chunk

    def create_chat_completion(self, messages, stream=True):
-        print('create_chat_completion called')
+        log.info('create_chat_completion called')
        with self._lock:
-            print('create_chat_completion started')
+            log.info('create_chat_completion started')
            try:
                return self._model.create_chat_completion(messages=messages, stream=stream)
            except Exception as e:
-                print('create_chat_completion exception')
-                print(e)
+                log.error('create_chat_completion - error')
+                log.error(e)
                return None


    def get_message_tokens(self, role, content):
+        log.info('get_message_tokens - started')
+        self.ensure_model_is_loaded()
        message_tokens = self._model.tokenize(content.encode("utf-8"))
        message_tokens.insert(1, self.ROLE_TOKENS[role])
        message_tokens.insert(2, self.LINEBREAK_TOKEN)
        message_tokens.append(self._model.token_eos())
+        log.info('get_message_tokens - finished')
        return message_tokens

    def get_system_tokens(self):
        return self.get_message_tokens(role="system", content=self.SYSTEM_PROMPT)

-    def create_chat_generator_for_saiga(self, messages, parameters):
-        print('create_chat_completion called')
+    def create_chat_generator_for_saiga(self, messages, parameters, use_system_prompt=True):
+        log.info('create_chat_generator_for_saiga - started')
        with self._lock:
-            tokens = self.get_system_tokens()
+            self.ensure_model_is_loaded()
+            tokens = self.get_system_tokens() if use_system_prompt else []
            for message in messages:
                message_tokens = self.get_message_tokens(role=message.get("from"), content=message.get("content", ""))
                tokens.extend(message_tokens)
@@ -128,19 +160,25 @@ class LlmBackend:
                temp=parameters['temperature'],
                repeat_penalty=parameters['repetition_penalty']
            )
+            log.info('create_chat_generator_for_saiga - finished')
            return generator

    def generate_tokens(self, generator):
+        log.info('generate_tokens - started')
        with self._lock:
+            self.ensure_model_is_loaded()
            try:
                for token in generator:
                    if token == self._model.token_eos():
                        yield b'' # End of chunk
+                        log.info('generate_tokens - finished')
                        break

                    token_str = self._model.detokenize([token])#.decode("utf-8", errors="ignore")
                    yield token_str
            except Exception as e:
+                log.error('generate_tokens - error')
+                log.error(e)
                yield b'' # End of chunk


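
A rough standalone sketch (not part of the commit) of how the new lazy-reload path is meant to work: load_model() now records its arguments in _model_params, and ensure_model_is_loaded() rebuilds the model from them after unload_model() has run, for example after the idle timer in app.py fires. The model path and sampling values below are placeholders, and the parameters dict may need additional keys consumed by the elided part of the generate() call in create_chat_generator_for_saiga().

# Sketch of the unload-then-reload flow added in this commit (placeholder values throughout).
import logging
import sys

from llm_backend import LlmBackend

# Route the 'llm_api.backend' logger introduced in this commit to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

llm = LlmBackend()  # singleton: __new__ returns the same instance every time
llm.load_model(
    model_path="/data/model.gguf",  # placeholder path
    context_size=2000,
    enable_gpu=False,
    gpu_layer_number=0,
)

llm.unload_model()  # e.g. what the idle scheduler does after 5 minutes of inactivity

messages = [{"from": "user", "content": "Привет!"}]
parameters = {"temperature": 0.2, "repetition_penalty": 1.1}  # plus any keys the elided generate() arguments require

# create_chat_generator_for_saiga() calls ensure_model_is_loaded(), which re-runs
# load_model(**self._model_params) because the model was unloaded above.
generator = llm.create_chat_generator_for_saiga(
    messages=messages, parameters=parameters, use_system_prompt=False
)
for chunk in llm.generate_tokens(generator):  # yields raw bytes; empty chunk ends the stream
    sys.stdout.buffer.write(chunk)
    sys.stdout.buffer.flush()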