Update app.py
app.py (changed)
@@ -13,7 +13,6 @@ import numpy as np
 from functools import lru_cache
 from cachetools import TTLCache
 from multiprocessing import cpu_count
-import threading
 import queue
 
 # Logging configuration to suppress unnecessary debug messages
@@ -194,53 +193,11 @@ def pre_load_models():
 pre_load_models()
 
 # Optimize model loading in batches
-def
-
-
-
-
-    with ThreadPoolExecutor(max_workers=batch_size) as executor:
-        futures = [executor.submit(model_manager.load_model, config) for config in batch_configs]
-        for future in tqdm(as_completed(futures), total=len(batch_configs), desc="Optimizando carga de modelos", unit="modelo"):
-            try:
-                model = future.result()
-                global_data['models'][batch_configs[futures.index(future)]['name']] = model
-            except Exception as e:
-                logging.error(f"Error al optimizar la carga del modelo: {e}")
-
-optimize_model_loading()
-
-# Implement parallelization techniques for response generation
-def parallelize_response_generation(request: ChatRequest):
-    response_queue = queue.Queue()
-    with ThreadPoolExecutor(max_workers=min(len(global_data['models']), cpu_count())) as executor:
-        futures = [executor.submit(worker_function, model_name, request, response_queue) for model_name in global_data['models']]
-        for future in tqdm(as_completed(futures), total=len(futures), desc="Generando respuestas en paralelo", unit="modelo"):
-            future.result()
-
-    responses = []
-    while not response_queue.empty():
-        responses.append(response_queue.get())
-    return responses
-
-@app.post("/generate_chat_parallel")
-async def generate_chat_parallel(request: ChatRequest):
-    if not request.message.strip():
-        raise HTTPException(status_code=400, detail="The message cannot be empty.")
-
-    responses = parallelize_response_generation(request)
-    best_response = select_best_response(responses)
-
-    return {
-        "best_response": best_response,
-        "all_responses": responses
-    }
-
-# Optimize memory usage
-def optimize_memory_usage():
-    import gc
-    gc.collect()
+def batch_load_models(model_configs):
+    for i in range(0, len(model_configs), cpu_count()):
+        batch = model_configs[i:i + cpu_count()]
+        for config in batch:
+            model_manager.load_model(config)
 
-# Run the FastAPI server
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=8000)