Uhhy committed on
Commit a98609e (verified)
1 Parent(s): cb57b05

Update app.py

Files changed (1)
  1. app.py +2 -15
app.py CHANGED
@@ -15,26 +15,20 @@ from cachetools import TTLCache
 from multiprocessing import cpu_count
 import queue
 
-# Logging configuration to suppress unnecessary debug messages
 logging.basicConfig(level=logging.ERROR)
 
-# Load environment variables
 load_dotenv()
 
-# Initialize the FastAPI application
 app = FastAPI()
 
-# Cache configuration
 cache_size = 2000
 cache_ttl = 7200
 cache = TTLCache(maxsize=cache_size, ttl=cache_ttl)
 
-# Global dictionary to keep the models in RAM
 global_data = {
     'models': {}
 }
 
-# Model configurations
 model_configs = [
     {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
@@ -47,7 +41,6 @@ model_configs = [
     {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"}
 ]
 
-# Class for managing models
 class ModelManager:
     def __init__(self):
         self.models = {}
@@ -68,19 +61,16 @@ class ModelManager:
             future.result()
         return self.models
 
-# Instantiate ModelManager and load the models
 model_manager = ModelManager()
 model_manager.load_all_models()
 global_data['models'] = model_manager.models
 
-# Chat request class
 class ChatRequest(BaseModel):
     message: str
     top_k: int = 50
     top_p: float = 0.95
     temperature: float = 0.7
 
-# Function to generate chat responses
 @lru_cache(maxsize=20000)
 def generate_chat_response(request: ChatRequest, model_name: str):
     cache_key = f"{request.message}_{model_name}"
@@ -102,7 +92,6 @@ def generate_chat_response(request: ChatRequest, model_name: str):
     )
     reply = response['choices'][0]['message']['content']
 
-    # Cache the response
     cache[cache_key] = {"response": reply, "literal": user_input, "model_name": model_name}
 
     return cache[cache_key]
@@ -185,14 +174,12 @@ async def generate_chat(request: ChatRequest):
         "all_responses": responses
     }
 
-# Load the models into RAM more efficiently
 def pre_load_models():
     for model_name, model in global_data['models'].items():
-        model._load_model()  # Hypothetical method to pre-load models into RAM
+        model._load_model()
 
 pre_load_models()
 
-# Optimized batch loading of the models
 def batch_load_models(model_configs):
     for i in range(0, len(model_configs), cpu_count()):
         batch = model_configs[i:i + cpu_count()]
@@ -200,4 +187,4 @@ def batch_load_models(model_configs):
             model_manager.load_model(config)
 
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    uvicorn.run(app, host="0.0.0.0", port=8000)
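For reference, a minimal sketch of the TTLCache-based response caching that app.py keeps after this change, assuming cachetools is installed. The helper name cached_reply and the lambda generator are hypothetical; they only illustrate the cache_key pattern used in generate_chat_response, not the repository's actual API.

from cachetools import TTLCache

# Same parameters as app.py: up to 2000 entries, each kept for 7200 seconds (2 hours).
cache = TTLCache(maxsize=2000, ttl=7200)

def cached_reply(message, model_name, generate):
    # Hypothetical helper: mirrors the f"{message}_{model_name}" cache key scheme
    # and returns the stored entry when the same prompt hits the same model again.
    cache_key = f"{message}_{model_name}"
    if cache_key in cache:
        return cache[cache_key]
    reply = generate(message)  # stands in for the model's chat-completion call
    cache[cache_key] = {"response": reply, "literal": message, "model_name": model_name}
    return cache[cache_key]

# Usage: a dummy generator stands in for a loaded GGUF model.
print(cached_reply("hello", "GPT-2 XL", lambda text: text.upper()))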
 