Update app.py
app.py
CHANGED
@@ -15,26 +15,20 @@ from cachetools import TTLCache
 from multiprocessing import cpu_count
 import queue

-# Logging configuration to suppress unnecessary debug messages
 logging.basicConfig(level=logging.ERROR)

-# Load environment variables
 load_dotenv()

-# Initialize the FastAPI application
 app = FastAPI()

-# Cache configuration
 cache_size = 2000
 cache_ttl = 7200
 cache = TTLCache(maxsize=cache_size, ttl=cache_ttl)

-# Global dictionary that holds the models in RAM
 global_data = {
     'models': {}
 }

-# Model configuration
 model_configs = [
     {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
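For context on the cache settings above: cachetools.TTLCache is a bounded, dict-like LRU cache whose entries also expire, so maxsize=2000 and ttl=7200 mean a cached reply lives for at most two hours and the least recently used entries are dropped once 2000 keys are stored. A minimal standalone illustration (the key format mirrors the f"{message}_{model_name}" key used later in this file):

from cachetools import TTLCache

cache = TTLCache(maxsize=2000, ttl=7200)
cache["Hello_GPT-2 XL"] = {"response": "Hi!", "model_name": "GPT-2 XL"}
print("Hello_GPT-2 XL" in cache)   # True for up to 7200 s, unless evicted by maxsize first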
@@ -47,7 +41,6 @@ model_configs = [
     {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"}
 ]

-# Class for managing the models
 class ModelManager:
     def __init__(self):
         self.models = {}
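The diff only shows the top of ModelManager, so the loader itself is not visible. A hedged sketch of what a load_model compatible with the (repo_id, filename, name) entries might look like, assuming huggingface_hub handles the download and llama-cpp-python holds the GGUF file in memory (both libraries are assumptions, not confirmed by this diff):

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

class ModelManager:
    def __init__(self):
        self.models = {}

    def load_model(self, config):
        # Download the GGUF file (cached locally by huggingface_hub) and load it into memory.
        path = hf_hub_download(repo_id=config["repo_id"], filename=config["filename"])
        self.models[config["name"]] = Llama(model_path=path)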
@@ -68,19 +61,16 @@ class ModelManager:
             future.result()
         return self.models

-# Instantiate ModelManager and load the models
 model_manager = ModelManager()
 model_manager.load_all_models()
 global_data['models'] = model_manager.models

-# Chat request class
 class ChatRequest(BaseModel):
     message: str
     top_k: int = 50
     top_p: float = 0.95
     temperature: float = 0.7

-# Function to generate chat responses
 @lru_cache(maxsize=20000)
 def generate_chat_response(request: ChatRequest, model_name: str):
     cache_key = f"{request.message}_{model_name}"
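The hunk above shows only the tail of load_all_models (future.result() and return self.models), which implies the models are loaded concurrently. Continuing the sketch of ModelManager from above, a version of that pattern with concurrent.futures; the executor type and worker count are assumptions:

from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count

class ModelManager:
    ...  # __init__ and load_model as sketched above

    def load_all_models(self):
        # One worker per CPU core; each future is awaited so download/load errors surface here.
        with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
            futures = [executor.submit(self.load_model, config) for config in model_configs]
            for future in futures:
                future.result()
        return self.models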
@@ -102,7 +92,6 @@ def generate_chat_response(request: ChatRequest, model_name: str):
     )
     reply = response['choices'][0]['message']['content']

-    # Cache the response
     cache[cache_key] = {"response": reply, "literal": user_input, "model_name": model_name}

     return cache[cache_key]
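One caveat with generate_chat_response as shown above: functools.lru_cache hashes its arguments, and a standard (non-frozen) Pydantic model such as ChatRequest is not hashable, so the decorated call would raise TypeError before the manual TTLCache is even consulted. A sketch of one way to keep the memoisation by passing only hashable primitives; the helper name and body are hypothetical:

from functools import lru_cache

@lru_cache(maxsize=20000)
def _cached_reply(message: str, model_name: str, top_k: int, top_p: float, temperature: float) -> str:
    # Hypothetical stand-in for the real model call; lru_cache now only sees hashable arguments.
    return f"[{model_name}] reply to: {message}"

def generate_chat_response(request: ChatRequest, model_name: str):
    return _cached_reply(request.message, model_name,
                         request.top_k, request.top_p, request.temperature)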
@@ -185,14 +174,12 @@ async def generate_chat(request: ChatRequest):
         "all_responses": responses
     }

-# Load the models into RAM more efficiently
 def pre_load_models():
     for model_name, model in global_data['models'].items():
-        model._load_model()
+        model._load_model()

 pre_load_models()

-# Optimized batch loading of the models
 def batch_load_models(model_configs):
     for i in range(0, len(model_configs), cpu_count()):
         batch = model_configs[i:i + cpu_count()]
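The loop in batch_load_models above walks the config list in steps of cpu_count(), i.e. one batch per group of CPU cores. A standalone illustration of that slicing with a stand-in core count:

configs = [f"model-{i}" for i in range(7)]   # stand-in for model_configs
step = 4                                     # stand-in for cpu_count() on a 4-core machine
batches = [configs[i:i + step] for i in range(0, len(configs), step)]
print(batches)   # [['model-0', 'model-1', 'model-2', 'model-3'], ['model-4', 'model-5', 'model-6']]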
@@ -200,4 +187,4 @@ def batch_load_models(model_configs):
             model_manager.load_model(config)

 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=
+    uvicorn.run(app, host="0.0.0.0", port=8000)
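With the final line the app listens on every interface on port 8000. The route that wraps generate_chat is not part of this diff, so the path below is a placeholder; a hedged example of exercising the endpoint once the server is running:

import requests

# "/chat" is a placeholder path; only ChatRequest's fields are visible in this diff.
payload = {"message": "Hello", "top_k": 50, "top_p": 0.95, "temperature": 0.7}
r = requests.post("http://localhost:8000/chat", json=payload)
print(r.status_code, r.json())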