import gc
import psutil
import os
import torch
from fastapi import FastAPI
from langchain.llms import VLLM
from cachetools import TTLCache
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import asyncio
import torch.nn.utils.prune as prune
from concurrent.futures import ThreadPoolExecutor
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

nltk.download('punkt')
nltk.download('stopwords')

app = FastAPI()

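# Placeholders for the per-model generator closures created by
# create_langchain_model() when load_models() runs.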
model_1 = None
model_2 = None
model_3 = None
model_4 = None

# Using TTLCache from cachetools
cache_1 = TTLCache(maxsize=100, ttl=600)  # maxsize=100 and ttl=600 (10 minutes)
cache_2 = TTLCache(maxsize=100, ttl=600)
cache_3 = TTLCache(maxsize=100, ttl=600)
cache_4 = TTLCache(maxsize=100, ttl=600)

previous_responses_1 = []
previous_responses_2 = []
previous_responses_3 = []
previous_responses_4 = []

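# Per-chunk budget used by split_input/split_output, counted in whitespace-separated words.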
MAX_TOKENS = 2048

executor = ThreadPoolExecutor(max_workers=4)

# Configuration: run on CPU only
device = torch.device("cpu")

def get_best_response(new_response, previous_responses):
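    """Return a stored previous response if it is highly similar (TF-IDF cosine > 0.7)
    to the new one; otherwise return the new response unchanged."""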
    if not previous_responses:
        return new_response
    vectorizer = TfidfVectorizer().fit_transform(previous_responses + [new_response])
    cosine_sim = cosine_similarity(vectorizer[-1], vectorizer[:-1])
    max_sim_index = cosine_sim.argmax()
    max_sim_score = cosine_sim[0][max_sim_index]
    if max_sim_score > 0.7:
        return previous_responses[max_sim_index]
    return new_response

def summarize_text(text):
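    """Frequency-based extractive summary: score each sentence by the frequencies of
    the 50 most common non-stopword words it contains and return the top three sentences."""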
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    word_frequencies = Counter()
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [word for word in words if word.isalpha() and word not in stop_words]
        word_frequencies.update(words)
    most_common_words = dict(word_frequencies.most_common(50))
    ranked_sentences = []
    for sentence in sentences:
        score = sum(most_common_words.get(word, 0) for word in word_tokenize(sentence.lower()))
        ranked_sentences.append((score, sentence))
    ranked_sentences.sort(reverse=True, key=lambda x: x[0])
    summary = ' '.join([sentence for _, sentence in ranked_sentences[:3]])
    return summary

def clear_memory():
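    """Force a garbage-collection pass and, if system memory usage exceeds 90%,
    drop all loaded models so their memory can be reclaimed."""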
    gc.collect()
    process = psutil.Process(os.getpid())
    memory_usage = psutil.virtual_memory().percent
    if memory_usage > 90:
        global model_1, model_2, model_3, model_4
        model_1 = None
        model_2 = None
        model_3 = None
        model_4 = None
        gc.collect()

def apply_pruning(model):
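    """Randomly prune 20% of the weights of every Linear layer in the model.

    Note: this helper is not called anywhere else in this module.
    """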
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.random_unstructured(module, name="weight", amount=0.2)
            prune.remove(module, name="weight")
    return model

def split_input(input_text, max_tokens):
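    """Split the input into chunks of at most `max_tokens` whitespace-separated words
    (a rough approximation of model tokens)."""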
    tokens = input_text.split()
    chunks = []
    chunk = []
    total_tokens = 0
    for word in tokens:
        word_length = 1  # each whitespace-separated word counts as one "token" in this approximation
        if total_tokens + word_length > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            total_tokens = word_length
        else:
            chunk.append(word)
            total_tokens += word_length
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

def split_output(output_text, max_tokens):
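    """Split generated text into chunks of at most `max_tokens` whitespace-separated
    words; mirrors split_input."""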
    tokens = output_text.split()
    chunks = []
    chunk = []
    total_tokens = 0
    for word in tokens:
        word_length = 1  # each whitespace-separated word counts as one "token" in this approximation
        if total_tokens + word_length > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            total_tokens = word_length
        else:
            chunk.append(word)
            total_tokens += word_length
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

def create_langchain_model(model_name: str, device: torch.device, cache, previous_responses):
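    """Build a PromptTemplate + LLMChain around a vLLM-backed model and return a
    closure that generates text with TTL caching, input chunking, and reuse of
    similar previous responses."""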
    # The LangChain vLLM wrapper takes the model id via `model`; it has no `device`
    # argument, so CPU-only execution (the intent of `device` here) is assumed to be
    # handled by the installed vLLM build.
    vllm_llm = VLLM(model=model_name)
    template = """
    You are a helpful assistant. Given the following text, generate a meaningful response:
    {input_text}
    """
    prompt = PromptTemplate(input_variables=["input_text"], template=template)
    chain = LLMChain(llm=vllm_llm, prompt=prompt)
    def generate_for_model(input_text):
        cached_output = cache.get(input_text)
        if cached_output:
            return cached_output
        input_chunks = split_input(input_text, MAX_TOKENS)
        output_text = ""
        prev_output = ""
        for chunk in input_chunks:
            # Carry the last ~50 words of the running output as context for the next chunk.
            chunk_prompt = (prev_output + " " + chunk) if prev_output else chunk
            output_text += chain.run(input_text=chunk_prompt)
            prev_output = " ".join(output_text.split()[-50:])
        output_chunks = split_output(output_text, MAX_TOKENS)
        first_chunk = output_chunks[0] if output_chunks else ""
        best_response = get_best_response(first_chunk, previous_responses)
        cache[input_text] = best_response
        previous_responses.append(best_response)
        return best_response
    return generate_for_model

async def load_models():
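    """Create (or recreate) the four generator closures, one per model."""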
    global model_1, model_2, model_3, model_4
    model_1 = create_langchain_model("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device, cache_1, previous_responses_1)
    model_2 = create_langchain_model("Qwen/Qwen2.5-Coder-1.5B", device, cache_2, previous_responses_2)
    model_3 = create_langchain_model("Qwen/Qwen2.5-3B-Instruct", device, cache_3, previous_responses_3)
    model_4 = create_langchain_model("gpt2", device, cache_4, previous_responses_4)
    print("Modelos cargados exitosamente.")

async def optimize_models_periodically():
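    """Recreate the model closures immediately and then once every hour."""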
    while True:
        await load_models()
        await asyncio.sleep(3600)

@app.on_event("startup")
async def startup():
    await load_models()
    app.add_event_handler("startup", monitor_memory)
    app.add_event_handler("startup", optimize_models_periodically)

async def monitor_memory():
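    """Run clear_memory() once a minute in the background."""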
    while True:
        clear_memory()
        await asyncio.sleep(60)

@app.get("/generate")
async def generate_response(model_name: str, input_text: str):
    if model_name == "model1":
        result = await asyncio.get_event_loop().run_in_executor(executor, model_1, input_text)
    elif model_name == "model2":
        result = await asyncio.get_event_loop().run_in_executor(executor, model_2, input_text)
    elif model_name == "model3":
        result = await asyncio.get_event_loop().run_in_executor(executor, model_3, input_text)
    elif model_name == "model4":
        result = await asyncio.get_event_loop().run_in_executor(executor, model_4, input_text)
    else:
        return {"error": "Model not found"}
    return {f"{model_name}_output": result}

@app.get("/unified_summary")
async def unified_summary(input_text: str):
    output1 = await generate_response(model_name="model1", input_text=input_text)
    output2 = await generate_response(model_name="model2", input_text=input_text)
    output3 = await generate_response(model_name="model3", input_text=input_text)
    output4 = await generate_response(model_name="model4", input_text=input_text)
    combined_response = " ".join([
        output1.get("model1_output", ""),
        output2.get("model2_output", ""),
        output3.get("model3_output", ""),
        output4.get("model4_output", ""),
    ])
    summarized_response = summarize_text(combined_response)
    return {"summary": summarized_response}