Hjgugugjhuhjggg committed
Commit 1cb967f
1 Parent(s): 84d1dae

Update app.py

Files changed (1)
  1. app.py +53 -73
app.py CHANGED
@@ -1,10 +1,9 @@
 import gc
 import psutil
 import os
-import time
 import torch
 from fastapi import FastAPI
-from vllm import VLLM
+from langchain.llms import VLLM
 from chatgptcache import cache
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
@@ -15,13 +14,14 @@ from collections import Counter
 import asyncio
 import torch.nn.utils.prune as prune
 from concurrent.futures import ThreadPoolExecutor
+from langchain.prompts import PromptTemplate
+from langchain.chains import LLMChain
 
 nltk.download('punkt')
 nltk.download('stopwords')
 
 app = FastAPI()
 
-# Define the models (they will be loaded later)
 model_1 = None
 model_2 = None
 model_3 = None
@@ -37,12 +37,10 @@ previous_responses_2 = []
 previous_responses_3 = []
 previous_responses_4 = []
 
-MAX_TOKENS = 2048  # Maximum number of tokens for model input and output
+MAX_TOKENS = 2048
 
-# Use ThreadPoolExecutor for parallel execution
 executor = ThreadPoolExecutor(max_workers=4)
 
-# Device configuration (CPU)
 device = torch.device("cpu")
 
 def get_best_response(new_response, previous_responses):
@@ -90,17 +88,16 @@ def apply_pruning(model)
     for name, module in model.named_modules():
         if isinstance(module, torch.nn.Linear):
             prune.random_unstructured(module, name="weight", amount=0.2)
-            prune.remove(module, name="weight")  # Optional: remove the pruning mask to keep the pruned weights
+            prune.remove(module, name="weight")
     return model
 
 def split_input(input_text, max_tokens):
-    tokens = input_text.split()  # Split the input into words (tokens)
+    tokens = input_text.split()
     chunks = []
     chunk = []
     total_tokens = 0
-
     for word in tokens:
-        word_length = len(word.split())  # Estimate the token length
+        word_length = len(word.split())
         if total_tokens + word_length > max_tokens:
             chunks.append(" ".join(chunk))
             chunk = [word]
@@ -108,20 +105,17 @@ def split_input(input_text, max_tokens):
         else:
             chunk.append(word)
             total_tokens += word_length
-
     if chunk:
-        chunks.append(" ".join(chunk))  # Append the last chunk
-
+        chunks.append(" ".join(chunk))
     return chunks
 
 def split_output(output_text, max_tokens):
-    tokens = output_text.split()  # Split the output into words (tokens)
+    tokens = output_text.split()
     chunks = []
     chunk = []
     total_tokens = 0
-
     for word in tokens:
-        word_length = len(word.split())  # Estimate the token length
+        word_length = len(word.split())
         if total_tokens + word_length > max_tokens:
             chunks.append(" ".join(chunk))
             chunk = [word]
@@ -129,44 +123,48 @@ def split_output(output_text, max_tokens):
         else:
             chunk.append(word)
             total_tokens += word_length
-
     if chunk:
-        chunks.append(" ".join(chunk))  # Append the last chunk
-
+        chunks.append(" ".join(chunk))
     return chunks
 
-async def load_model_async(model_name: str):
-    max_model_len = MAX_TOKENS  # Set the maximum model length (tokens)
-    if model_name == "model_1":
-        return VLLM("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device=device, max_model_len=max_model_len)
-    elif model_name == "model_2":
-        return VLLM("Qwen/Qwen2.5-Coder-1.5B", device=device, max_model_len=max_model_len)
-    elif model_name == "model_3":
-        return VLLM("Qwen/Qwen2.5-3B-Instruct", device=device, max_model_len=max_model_len)
-    elif model_name == "model_4":
-        return VLLM("gpt2", device=device, max_model_len=max_model_len)
-    return None
+def create_langchain_model(model_name: str, device: torch.device, cache, previous_responses):
+    vllm_llm = VLLM(model_name=model_name, device=device)
+    template = """
+    You are a helpful assistant. Given the following text, generate a meaningful response:
+    {input_text}
+    """
+    prompt = PromptTemplate(input_variables=["input_text"], template=template)
+    chain = LLMChain(llm=vllm_llm, prompt=prompt)
+    def generate_for_model(input_text):
+        cached_output = cache.get(input_text)
+        if cached_output:
+            return cached_output
+        input_chunks = split_input(input_text, MAX_TOKENS)
+        output_text = ""
+        prev_output = ""
+        for chunk in input_chunks:
+            prompt = prev_output + chunk
+            output_text += chain.run(input_text=prompt)
+            prev_output = output_text.split()[-50:]
+        output_chunks = split_output(output_text, MAX_TOKENS)
+        best_response = get_best_response(output_chunks[0], previous_responses)
+        cache.put(input_text, best_response)
+        previous_responses.append(best_response)
+        return best_response
+    return generate_for_model
 
 async def load_models():
     global model_1, model_2, model_3, model_4
-    tasks = [
-        load_model_async("model_1"),
-        load_model_async("model_2"),
-        load_model_async("model_3"),
-        load_model_async("model_4"),
-    ]
-    results = await asyncio.gather(*tasks)
-    model_1, model_2, model_3, model_4 = results
-    model_1 = apply_pruning(model_1)
-    model_2 = apply_pruning(model_2)
-    model_3 = apply_pruning(model_3)
-    model_4 = apply_pruning(model_4)
-    print("Models loaded and pruned successfully.")
+    model_1 = create_langchain_model("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device, cache_1, previous_responses_1)
+    model_2 = create_langchain_model("Qwen/Qwen2.5-Coder-1.5B", device, cache_2, previous_responses_2)
+    model_3 = create_langchain_model("Qwen/Qwen2.5-3B-Instruct", device, cache_3, previous_responses_3)
+    model_4 = create_langchain_model("gpt2", device, cache_4, previous_responses_4)
+    print("Models loaded successfully.")
 
 async def optimize_models_periodically():
     while True:
-        await load_models()  # Load and optimize the models automatically
-        await asyncio.sleep(3600)  # Optimize the models every hour (adjust the interval as needed)
+        await load_models()
+        await asyncio.sleep(3600)
 
 @app.on_event("startup")
 async def startup():
@@ -181,34 +179,16 @@ async def monitor_memory():
 
 @app.get("/generate")
 async def generate_response(model_name: str, input_text: str):
-    def generate_for_model(model, input_text, cache, previous_responses):
-        cached_output = cache.get(input_text)
-        if cached_output:
-            return cached_output
-
-        input_chunks = split_input(input_text, MAX_TOKENS)
-        output_text = ""
-        prev_output = ""
-
-        for chunk in input_chunks:
-            prompt = prev_output + chunk
-            output_text += model.generate(prompt)
-            prev_output = output_text.split()[-50:]
-
-        output_chunks = split_output(output_text, MAX_TOKENS)
-        best_response = get_best_response(output_chunks[0], previous_responses)
-        cache.put(input_text, best_response)
-        previous_responses.append(best_response)
-        return best_response
-
-    result = await asyncio.get_event_loop().run_in_executor(
-        executor,
-        generate_for_model,
-        model_1 if model_name == "model1" else model_2 if model_name == "model2" else model_3 if model_name == "model3" else model_4,
-        input_text,
-        cache_1 if model_name == "model1" else cache_2 if model_name == "model2" else cache_3 if model_name == "model3" else cache_4,
-        previous_responses_1 if model_name == "model1" else previous_responses_2 if model_name == "model2" else previous_responses_3 if model_name == "model3" else previous_responses_4
-    )
+    if model_name == "model1":
+        result = await asyncio.get_event_loop().run_in_executor(executor, model_1, input_text)
+    elif model_name == "model2":
+        result = await asyncio.get_event_loop().run_in_executor(executor, model_2, input_text)
+    elif model_name == "model3":
+        result = await asyncio.get_event_loop().run_in_executor(executor, model_3, input_text)
+    elif model_name == "model4":
+        result = await asyncio.get_event_loop().run_in_executor(executor, model_4, input_text)
+    else:
+        return {"error": "Model not found"}
     return {f"{model_name}_output": result}
 
 @app.get("/unified_summary")