Update app.py
app.py
CHANGED
@@ -1,4 +1,5 @@
 import os
 import torch
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
@@ -10,7 +11,6 @@ from transformers import (
     GenerationConfig,
     StoppingCriteriaList,
     StoppingCriteria,
-    TextStreamer,
     pipeline
 )
 import uvicorn
@@ -19,8 +19,28 @@ from io import BytesIO
 import soundfile as sf
 import traceback

 app = FastAPI()

 class GenerateRequest(BaseModel):
     model_name: str
     input_text: str = ""
@@ -54,16 +74,17 @@ class LocalModelLoader:
         self.loaded_models = {}

     async def load_model_and_tokenizer(self, model_name):
         if model_name in self.loaded_models:
             return self.loaded_models[model_name]
         try:
             config = AutoConfig.from_pretrained(model_name)
             tokenizer = AutoTokenizer.from_pretrained(model_name, config=config)
-
-
             if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
                 tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
-
             self.loaded_models[model_name] = (model, tokenizer)
             return model, tokenizer
         except Exception as e:
@@ -81,10 +102,10 @@ class StopOnTokens(StoppingCriteria):
                 return True
         return False

-
 @app.post("/generate")
 async def generate(request: GenerateRequest):
     try:
         model_name = request.model_name
         input_text = request.input_text
         task_type = request.task_type
@@ -119,66 +140,79 @@ async def generate(request: GenerateRequest):
         stopping_criteria_list = StoppingCriteriaList([StopOnTokens(stop_token_ids)]) if stop_token_ids else None

         if stream:
-
                 stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay),
                 media_type="text/plain"
             )
         else:
-            generated_text = generate_non_stream(model, tokenizer, input_text, generation_config, stopping_criteria_list, device)
-

     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

-async def stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay, max_length=
     encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
     encoded_input_len = encoded_input["input_ids"].shape[-1]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
     encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
-
-
-
-
-
-
-
-
     generated_text = tokenizer.decode(output.sequences[0][encoded_input["input_ids"].shape[-1]:], skip_special_tokens=True)
     return generated_text

-
 @app.post("/generate-image")
 async def generate_image(request: GenerateRequest):
     try:
         validated_body = request
-        device =

-
-

         img_byte_arr = BytesIO()
         image.save(img_byte_arr, format="PNG")
         img_byte_arr.seek(0)
-
         return StreamingResponse(img_byte_arr, media_type="image/png")
-
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

@@ -186,18 +220,19 @@ async def generate_image(request: GenerateRequest):
 async def generate_text_to_speech(request: GenerateRequest):
     try:
         validated_body = request
-        device =

-
-
-

         audio_byte_arr = BytesIO()
         sf.write(audio_byte_arr, audio, sampling_rate, format='WAV')
         audio_byte_arr.seek(0)
-
         return StreamingResponse(audio_byte_arr, media_type="audio/wav")
-
     except Exception as e:
         traceback.print_exc()
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
@@ -206,19 +241,20 @@ async def generate_text_to_speech(request: GenerateRequest):
 async def generate_video(request: GenerateRequest):
     try:
         validated_body = request
-        device =
-
-

         video_byte_arr = BytesIO()
         video.save(video_byte_arr)
         video_byte_arr.seek(0)
-
         return StreamingResponse(video_byte_arr, media_type="video/mp4")
-
     except Exception as e:
         traceback.print_exc()
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)

 import os
+import gc
 import torch
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse

     GenerationConfig,
     StoppingCriteriaList,
     StoppingCriteria,
     pipeline
 )
 import uvicorn

 import soundfile as sf
 import traceback

+# --- Block that limits RAM usage to 1% (Unix environments only) ---
+try:
+    import psutil
+    import resource
+    total_memory = psutil.virtual_memory().total
+    limit = int(total_memory * 0.01)  # 1% of the total, in bytes
+    resource.setrlimit(resource.RLIMIT_AS, (limit, limit))
+    print(f"Memory limit set to {limit} bytes (1% of total system memory).")
+except Exception as e:
+    print("Could not set the memory limit:", e)
+# --- End of RAM-limit block ---
+
 app = FastAPI()

+# Asynchronous helper that frees memory (RAM and the CUDA cache)
+async def cleanup_memory(device: str):
+    gc.collect()
+    if device == "cuda":
+        torch.cuda.empty_cache()
+    # Brief pause to let the memory actually be released
+    await asyncio.sleep(0.01)
+
 class GenerateRequest(BaseModel):
     model_name: str
     input_text: str = ""

         self.loaded_models = {}

     async def load_model_and_tokenizer(self, model_name):
+        # Use the model requested by the user
         if model_name in self.loaded_models:
             return self.loaded_models[model_name]
         try:
             config = AutoConfig.from_pretrained(model_name)
             tokenizer = AutoTokenizer.from_pretrained(model_name, config=config)
+            # torch_dtype=torch.float16 reduces the memory footprint (when supported)
+            model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=torch.float16)
+            # Set the padding token if needed
             if tokenizer.eos_token_id is not None and tokenizer.pad_token_id is None:
                 tokenizer.pad_token_id = config.pad_token_id or tokenizer.eos_token_id
             self.loaded_models[model_name] = (model, tokenizer)
             return model, tokenizer
         except Exception as e:
                 return True
         return False

 @app.post("/generate")
 async def generate(request: GenerateRequest):
     try:
+        # Extract the parameters from the request
         model_name = request.model_name
         input_text = request.input_text
         task_type = request.task_type

         stopping_criteria_list = StoppingCriteriaList([StopOnTokens(stop_token_ids)]) if stop_token_ids else None

         if stream:
+            # StreamingResponse wraps the async generator that sends each token in real time.
+            response = StreamingResponse(
                 stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay),
                 media_type="text/plain"
             )
         else:
+            generated_text = await generate_non_stream(model, tokenizer, input_text, generation_config, stopping_criteria_list, device)
+            response = StreamingResponse(iter([generated_text]), media_type="text/plain")

+        await cleanup_memory(device)
+        return response
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

+async def stream_text(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, chunk_delay, max_length=64):
+    """
+    Generate tokens asynchronously and send them to the client in real time.
+    """
+    # Limit the input length to minimize memory usage
     encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
     encoded_input_len = encoded_input["input_ids"].shape[-1]

+    # torch.no_grad() avoids storing information for gradients
+    with torch.no_grad():
+        # Generate the text iteratively (streaming)
+        for output in model.generate(
+            **encoded_input,
+            generation_config=generation_config,
+            stopping_criteria=stopping_criteria_list,
+            stream=True,
+            return_dict_in_generate=True,
+            output_scores=True,
+        ):
+            # Keep only the newly generated tokens (excluding the input)
+            new_tokens = output.sequences[:, encoded_input_len:]
+            for token_batch in new_tokens:
+                token = tokenizer.decode(token_batch, skip_special_tokens=True)
+                if token:
+                    # Send each token as soon as it is available
+                    yield token
+                    await asyncio.sleep(chunk_delay)
+    await cleanup_memory(device)
+
+async def generate_non_stream(model, tokenizer, input_text, generation_config, stopping_criteria_list, device, max_length=64):
     encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
+    with torch.no_grad():
+        output = model.generate(
+            **encoded_input,
+            generation_config=generation_config,
+            stopping_criteria=stopping_criteria_list,
+            return_dict_in_generate=True,
+            output_scores=True
+        )
     generated_text = tokenizer.decode(output.sequences[0][encoded_input["input_ids"].shape[-1]:], skip_special_tokens=True)
+    await cleanup_memory(device)
     return generated_text

 @app.post("/generate-image")
 async def generate_image(request: GenerateRequest):
     try:
         validated_body = request
+        device = 0 if torch.cuda.is_available() else -1  # pipeline expects an int for CUDA

+        # Run the pipeline in a separate thread
+        image_generator = await asyncio.to_thread(pipeline, "text-to-image", model=validated_body.model_name, device=device)
+        results = await asyncio.to_thread(image_generator, validated_body.input_text)
+        image = results[0]

         img_byte_arr = BytesIO()
         image.save(img_byte_arr, format="PNG")
         img_byte_arr.seek(0)
+        await cleanup_memory("cuda" if device == 0 else "cpu")
         return StreamingResponse(img_byte_arr, media_type="image/png")
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

 async def generate_text_to_speech(request: GenerateRequest):
     try:
         validated_body = request
+        device = 0 if torch.cuda.is_available() else -1

+        # Run the pipeline in a separate thread
+        tts_generator = await asyncio.to_thread(pipeline, "text-to-speech", model=validated_body.model_name, device=device)
+        tts_results = await asyncio.to_thread(tts_generator, validated_body.input_text)
+        audio = tts_results
+        sampling_rate = tts_generator.sampling_rate

         audio_byte_arr = BytesIO()
         sf.write(audio_byte_arr, audio, sampling_rate, format='WAV')
         audio_byte_arr.seek(0)
+        await cleanup_memory("cuda" if device == 0 else "cpu")
         return StreamingResponse(audio_byte_arr, media_type="audio/wav")
     except Exception as e:
         traceback.print_exc()
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

 async def generate_video(request: GenerateRequest):
     try:
         validated_body = request
+        device = 0 if torch.cuda.is_available() else -1
+
+        # Run the pipeline in a separate thread
+        video_generator = await asyncio.to_thread(pipeline, "text-to-video", model=validated_body.model_name, device=device)
+        video = await asyncio.to_thread(video_generator, validated_body.input_text)

         video_byte_arr = BytesIO()
         video.save(video_byte_arr)
         video_byte_arr.seek(0)
+        await cleanup_memory("cuda" if device == 0 else "cpu")
         return StreamingResponse(video_byte_arr, media_type="video/mp4")
     except Exception as e:
         traceback.print_exc()
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

 if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
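
For reference, a minimal client sketch for the /generate endpoint changed in this commit (not part of the diff itself). It assumes the server is running locally on port 7860 as configured above, and that GenerateRequest also exposes the stream flag and task_type value used in the handler; the model id and field values below are placeholders.

# Hypothetical client; only model_name and input_text are confirmed by the diff,
# the remaining fields are assumptions about the full GenerateRequest schema.
import requests

payload = {
    "model_name": "gpt2",          # placeholder Hugging Face model id
    "input_text": "Hello, world",  # prompt
    "task_type": "text",           # assumed value; only the field name appears in the diff
    "stream": True,                # assumed request field driving the streaming branch
}

# With stream=True the handler returns a text/plain StreamingResponse,
# so the body can be consumed chunk by chunk as tokens arrive.
with requests.post("http://localhost:7860/generate", json=payload, stream=True) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)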