Yjhhh committed on
Commit
55af72e
·
verified ·
1 Parent(s): f4d520f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -55
app.py CHANGED
@@ -2,12 +2,13 @@ from fastapi import FastAPI, HTTPException, Request
2
  from pydantic import BaseModel
3
  import uvicorn
4
  import requests
5
- import io
6
  import asyncio
 
 
 
7
  from typing import List, Dict, Any
8
  from llama_cpp import Llama # Ajusta según la biblioteca que estés utilizando
9
- import os
10
-
11
 
12
  app = FastAPI()
13
 
@@ -38,96 +39,124 @@ class ModelManager:
38
  self.load_lock = asyncio.Lock()
39
  self.index_lock = asyncio.Lock()
40
  self.part_size = 1024 * 1024 # Tamaño de cada parte en bytes (1 MB)
 
41
 
42
  async def download_model_to_memory(self, model_config):
43
  url = f"https://huggingface.co/{model_config['repo_id']}/resolve/main/{model_config['filename']}"
 
44
  try:
45
  response = requests.get(url)
46
  response.raise_for_status()
 
47
  return io.BytesIO(response.content)
48
  except requests.RequestException as e:
49
  raise HTTPException(status_code=500, detail=f"Error al descargar el modelo: {e}")
50
 
 
 
 
 
 
 
 
 
 
51
  async def load_model(self, model_config):
52
  async with self.load_lock:
53
  try:
54
- model_file = await self.download_model_to_memory(model_config)
55
- llama = Llama(model_file) # Ajusta según la biblioteca y clase correctas
56
-
57
- tokenizer = llama.tokenizer
58
- model_data = {
59
- 'model': llama,
60
- 'tokenizer': tokenizer,
61
- 'pad_token': tokenizer.pad_token,
62
- 'pad_token_id': tokenizer.pad_token_id,
63
- 'eos_token': tokenizer.eos_token,
64
- 'eos_token_id': tokenizer.eos_token_id,
65
- 'bos_token': tokenizer.bos_token,
66
- 'bos_token_id': tokenizer.bos_token_id,
67
- 'unk_token': tokenizer.unk_token,
68
- 'unk_token_id': tokenizer.unk_token_id
69
- }
70
-
71
- self.models[model_config['name']] = model_data
72
- await self.handle_large_model(model_config, model_file)
 
 
 
 
 
 
 
73
  except Exception as e:
74
  print(f"Error al cargar el modelo: {e}")
75
 
76
- async def handle_large_model(self, model_config, model_file):
77
- total_size = len(model_file.getvalue())
78
  num_parts = (total_size + self.part_size - 1) // self.part_size
79
 
80
- for i in range(num_parts):
81
- start = i * self.part_size
82
- end = min(start + self.part_size, total_size)
83
- model_part = io.BytesIO(model_file.getvalue()[start:end])
84
- await self.index_model_part(model_part, i)
 
 
 
85
 
86
  async def index_model_part(self, model_part, part_index):
87
  async with self.index_lock:
88
  part_name = f"part_{part_index}"
 
89
  llama_part = Llama(model_part)
90
  self.model_parts[part_name] = llama_part
 
91
 
92
  async def generate_response(self, user_input):
 
93
  tasks = [self.generate_chat_response(user_input, model_data) for model_data in self.models.values()]
94
  responses = await asyncio.gather(*tasks)
95
  return responses
96
 
97
  async def generate_chat_response(self, user_input, model_data):
98
  try:
99
- llama = model_data['model']
100
- tokenizer = model_data['tokenizer']
101
-
102
- response = await asyncio.get_event_loop().run_in_executor(
103
- None,
104
- lambda: llama.generate(user_input, max_length=1000, do_sample=True)
105
- )
106
- generated_text = response['generated_text']
107
-
108
- # Dividir el texto generado en partes si es necesario
109
- parts = []
110
- while len(generated_text) > 1000:
111
- part = generated_text[:1000]
112
- generated_text = generated_text[1000:]
113
- parts.append(part.strip())
114
- if generated_text:
115
- parts.append(generated_text.strip())
116
-
117
- return {"response": '\n'.join(parts), "model_name": model_data['name']}
 
118
  except Exception as e:
119
- print(f"Error al generar la respuesta: {e}")
120
- return {"response": "Error al generar la respuesta", "model_name": model_data['name']}
121
 
122
- @app.post("/chat")
123
- async def chat(request: Request):
124
- body = await request.json()
125
- user_input = body.get('message', '').strip()
126
  if not user_input:
127
- raise HTTPException(status_code=400, detail="El mensaje no puede estar vacío.")
128
 
129
  try:
130
  model_manager = ModelManager()
 
 
131
  responses = await model_manager.generate_response(user_input)
132
  return {"responses": responses}
133
  except Exception as e:
 
2
  from pydantic import BaseModel
3
  import uvicorn
4
  import requests
 
5
  import asyncio
6
+ import os
7
+ import io
8
+ import time
9
  from typing import List, Dict, Any
10
  from llama_cpp import Llama # Ajusta según la biblioteca que estés utilizando
11
+ from tqdm import tqdm
 
12
 
13
  app = FastAPI()
14
 
 
39
  self.load_lock = asyncio.Lock()
40
  self.index_lock = asyncio.Lock()
41
  self.part_size = 1024 * 1024 # Tamaño de cada parte en bytes (1 MB)
42
+ self.max_loading_time = 0 # Tiempo máximo en segundos para cargar un modelo
43
 
44
  async def download_model_to_memory(self, model_config):
45
  url = f"https://huggingface.co/{model_config['repo_id']}/resolve/main/{model_config['filename']}"
46
+ print(f"Descargando modelo desde {url}")
47
  try:
48
  response = requests.get(url)
49
  response.raise_for_status()
50
+ print(f"Descarga completa para {model_config['name']}")
51
  return io.BytesIO(response.content)
52
  except requests.RequestException as e:
53
  raise HTTPException(status_code=500, detail=f"Error al descargar el modelo: {e}")
54
 
55
+ async def save_model_to_temp_file(self, model_config):
56
+ model_file = await self.download_model_to_memory(model_config)
57
+ temp_filename = f"/tmp/{model_config['filename']}"
58
+ print(f"Guardando el modelo en {temp_filename}")
59
+ with open(temp_filename, 'wb') as f:
60
+ f.write(model_file.getvalue())
61
+ print(f"Modelo guardado en {temp_filename}")
62
+ return temp_filename
63
+
64
  async def load_model(self, model_config):
65
  async with self.load_lock:
66
  try:
67
+ start_time = time.time()
68
+ temp_filename = await self.save_model_to_temp_file(model_config)
69
+ elapsed_time = time.time() - start_time
70
+ if elapsed_time > self.max_loading_time:
71
+ print(f"El modelo {model_config['name']} tardó {elapsed_time:.2f} segundos en cargar. Dividiendo el modelo.")
72
+ await self.handle_large_model(temp_filename, model_config)
73
+ else:
74
+ print(f"Cargando modelo desde {temp_filename}")
75
+ llama = Llama(temp_filename) # Ajusta según la biblioteca y clase correctas
76
+
77
+ tokenizer = llama.tokenizer
78
+ model_data = {
79
+ 'model': llama,
80
+ 'tokenizer': tokenizer,
81
+ 'pad_token': tokenizer.pad_token,
82
+ 'pad_token_id': tokenizer.pad_token_id,
83
+ 'eos_token': tokenizer.eos_token,
84
+ 'eos_token_id': tokenizer.eos_token_id,
85
+ 'bos_token': tokenizer.bos_token,
86
+ 'bos_token_id': tokenizer.bos_token_id,
87
+ 'unk_token': tokenizer.unk_token,
88
+ 'unk_token_id': tokenizer.unk_token_id
89
+ }
90
+
91
+ self.models[model_config['name']] = model_data
92
+ print(f"Modelo {model_config['name']} cargado correctamente")
93
  except Exception as e:
94
  print(f"Error al cargar el modelo: {e}")
95
 
96
+ async def handle_large_model(self, model_filename, model_config):
97
+ total_size = os.path.getsize(model_filename)
98
  num_parts = (total_size + self.part_size - 1) // self.part_size
99
 
100
+ print(f"Modelo {model_config['name']} dividido en {num_parts} partes")
101
+ with open(model_filename, 'rb') as file:
102
+ for i in tqdm(range(num_parts), desc=f"Indexando {model_config['name']}"):
103
+ start = i * self.part_size
104
+ end = min(start + self.part_size, total_size)
105
+ file.seek(start)
106
+ model_part = io.BytesIO(file.read(end - start))
107
+ await self.index_model_part(model_part, i)
108
 
109
  async def index_model_part(self, model_part, part_index):
110
  async with self.index_lock:
111
  part_name = f"part_{part_index}"
112
+ print(f"Indexando parte {part_index}")
113
  llama_part = Llama(model_part)
114
  self.model_parts[part_name] = llama_part
115
+ print(f"Parte {part_index} indexada")
116
 
117
  async def generate_response(self, user_input):
118
+ print("Generando respuestas")
119
  tasks = [self.generate_chat_response(user_input, model_data) for model_data in self.models.values()]
120
  responses = await asyncio.gather(*tasks)
121
  return responses
122
 
123
  async def generate_chat_response(self, user_input, model_data):
124
  try:
125
+ print(f"Generando respuesta usando el modelo {model_data['model']}")
126
+ start_time = time.time()
127
+ generated_text = model_data['model'].generate(user_input)
128
+ elapsed_time = time.time() - start_time
129
+
130
+ if len(generated_text) > 1000:
131
+ parts = []
132
+ while len(generated_text) > 1000:
133
+ part = generated_text[:1000]
134
+ parts.append(part)
135
+ generated_text = generated_text[1000:]
136
+ parts.append(generated_text)
137
+ else:
138
+ parts = [generated_text]
139
+
140
+ print(f"Respuesta generada usando el modelo {model_data['model']} en {elapsed_time:.2f} segundos")
141
+ return {
142
+ 'model_name': model_data['model'],
143
+ 'generated_text_parts': parts
144
+ }
145
  except Exception as e:
146
+ print(f"Error al generar respuesta con el modelo {model_data['model']}: {e}")
147
+ return {'model_name': model_data['model'], 'error': str(e)}
148
 
149
+ @app.post("/generate/")
150
+ async def generate(request: Request):
151
+ data = await request.json()
152
+ user_input = data.get('input', '')
153
  if not user_input:
154
+ raise HTTPException(status_code=400, detail="Se requiere una entrada de usuario.")
155
 
156
  try:
157
  model_manager = ModelManager()
158
+ tasks = [model_manager.load_model(config) for config in model_configs]
159
+ await asyncio.gather(*tasks)
160
  responses = await model_manager.generate_response(user_input)
161
  return {"responses": responses}
162
  except Exception as e: