import gc
import asyncio
from collections import Counter
from concurrent.futures import ThreadPoolExecutor

import psutil
import torch
import torch.nn.utils.prune as prune
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from fastapi import FastAPI
from cachetools import TTLCache
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain.llms import VLLM
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

nltk.download('punkt')
nltk.download('stopwords')

app = FastAPI()

model_1 = None
model_2 = None
model_3 = None
model_4 = None

# Per-model response caches (cachetools TTLCache): at most 100 entries, each expiring after 600 s (10 minutes).
cache_1 = TTLCache(maxsize=100, ttl=600)
cache_2 = TTLCache(maxsize=100, ttl=600)
cache_3 = TTLCache(maxsize=100, ttl=600)
cache_4 = TTLCache(maxsize=100, ttl=600)

previous_responses_1 = []
previous_responses_2 = []
previous_responses_3 = []
previous_responses_4 = []

MAX_TOKENS = 2048

executor = ThreadPoolExecutor(max_workers=4)

# CPU-only configuration.
device = torch.device("cpu")


def get_best_response(new_response, previous_responses):
    """Return a previously stored response if the new one is a near-duplicate (cosine similarity > 0.7)."""
    if not previous_responses:
        return new_response
    tfidf_matrix = TfidfVectorizer().fit_transform(previous_responses + [new_response])
    cosine_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])
    max_sim_index = cosine_sim.argmax()
    max_sim_score = cosine_sim[0][max_sim_index]
    if max_sim_score > 0.7:
        return previous_responses[max_sim_index]
    return new_response


def summarize_text(text):
    """Frequency-based extractive summary: keep the three highest-scoring sentences."""
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    word_frequencies = Counter()
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [word for word in words if word.isalpha() and word not in stop_words]
        word_frequencies.update(words)
    most_common_words = dict(word_frequencies.most_common(50))
    ranked_sentences = []
    for sentence in sentences:
        score = sum(most_common_words.get(word, 0) for word in word_tokenize(sentence.lower()))
        ranked_sentences.append((score, sentence))
    ranked_sentences.sort(reverse=True, key=lambda x: x[0])
    return ' '.join(sentence for _, sentence in ranked_sentences[:3])


def clear_memory():
    """Collect garbage and drop all models when system memory usage exceeds 90%."""
    gc.collect()
    memory_usage = psutil.virtual_memory().percent
    if memory_usage > 90:
        global model_1, model_2, model_3, model_4
        model_1 = None
        model_2 = None
        model_3 = None
        model_4 = None
        gc.collect()


def apply_pruning(model):
    """Randomly prune 20% of the weights of every linear layer in the model."""
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.random_unstructured(module, name="weight", amount=0.2)
            prune.remove(module, name="weight")
    return model


def split_input(input_text, max_tokens):
    """Split text into chunks of at most max_tokens whitespace-delimited words."""
    chunks = []
    chunk = []
    total_tokens = 0
    for word in input_text.split():
        if total_tokens + 1 > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            total_tokens = 1
        else:
            chunk.append(word)
            total_tokens += 1
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks


def split_output(output_text, max_tokens):
    """Split generated text the same way as split_input."""
    return split_input(output_text, max_tokens)
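# Illustrative usage note (not part of the service itself): split_input packs
# whitespace-delimited words into chunks of at most `max_tokens` words, counting
# words rather than model tokens. For example:
#
#     split_input("one two three four", max_tokens=2)
#     # -> ['one two', 'three four']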
def create_langchain_model(model_name: str, device: torch.device, cache, previous_responses):
    # Note: LangChain's VLLM wrapper expects the model identifier via `model`; vLLM
    # manages its own device placement, so the `device` argument is kept only for
    # interface symmetry with the rest of this module.
    vllm_llm = VLLM(model=model_name)
    template = """
    You are a helpful assistant. Given the following text, generate a meaningful response:
    {input_text}
    """
    prompt = PromptTemplate(input_variables=["input_text"], template=template)
    chain = LLMChain(llm=vllm_llm, prompt=prompt)

    def generate_for_model(input_text):
        cached_output = cache.get(input_text)
        if cached_output:
            return cached_output
        input_chunks = split_input(input_text, MAX_TOKENS)
        output_text = ""
        prev_output = ""
        for chunk in input_chunks:
            chunk_prompt = prev_output + chunk
            output_text += chain.run(input_text=chunk_prompt)
            # Carry the last 50 words forward as context for the next chunk.
            prev_output = " ".join(output_text.split()[-50:]) + " "
        output_chunks = split_output(output_text, MAX_TOKENS)
        best_response = get_best_response(output_chunks[0] if output_chunks else "", previous_responses)
        cache[input_text] = best_response
        previous_responses.append(best_response)
        return best_response

    return generate_for_model


async def load_models():
    global model_1, model_2, model_3, model_4
    model_1 = create_langchain_model("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device, cache_1, previous_responses_1)
    model_2 = create_langchain_model("Qwen/Qwen2.5-Coder-1.5B", device, cache_2, previous_responses_2)
    model_3 = create_langchain_model("Qwen/Qwen2.5-3B-Instruct", device, cache_3, previous_responses_3)
    model_4 = create_langchain_model("gpt2", device, cache_4, previous_responses_4)
    print("Models loaded successfully.")


async def optimize_models_periodically():
    while True:
        await load_models()
        await asyncio.sleep(3600)


async def monitor_memory():
    while True:
        clear_memory()
        await asyncio.sleep(60)


@app.on_event("startup")
async def startup():
    await load_models()
    # Run the maintenance loops as background tasks; registering them as additional
    # startup handlers here would never execute them, and awaiting them directly
    # would block startup forever.
    asyncio.create_task(monitor_memory())
    asyncio.create_task(optimize_models_periodically())


@app.get("/generate")
async def generate_response(model_name: str, input_text: str):
    models = {"model1": model_1, "model2": model_2, "model3": model_3, "model4": model_4}
    model = models.get(model_name)
    if model is None:
        return {"error": "Model not found"}
    result = await asyncio.get_running_loop().run_in_executor(executor, model, input_text)
    return {f"{model_name}_output": result}


@app.get("/unified_summary")
async def unified_summary(input_text: str):
    output1 = await generate_response(model_name="model1", input_text=input_text)
    output2 = await generate_response(model_name="model2", input_text=input_text)
    output3 = await generate_response(model_name="model3", input_text=input_text)
    output4 = await generate_response(model_name="model4", input_text=input_text)
    combined_response = " ".join([
        output1.get("model1_output", ""),
        output2.get("model2_output", ""),
        output3.get("model3_output", ""),
        output4.get("model4_output", ""),
    ])
    return {"summary": summarize_text(combined_response)}
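
# Minimal local-run sketch; the host, port, and example requests below are
# illustrative assumptions, not part of the original configuration.
if __name__ == "__main__":
    import uvicorn

    # Exposes GET /generate?model_name=model1&input_text=... and
    # GET /unified_summary?input_text=...
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example requests once the server is up (illustrative):
#   curl "http://localhost:8000/generate?model_name=model1&input_text=Hello"
#   curl "http://localhost:8000/unified_summary?input_text=Explain%20caching"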