import gc
import psutil
import os
import torch
from fastapi import FastAPI
from langchain.llms import VLLM
from cachetools import TTLCache
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import asyncio
import torch.nn.utils.prune as prune
from concurrent.futures import ThreadPoolExecutor
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

nltk.download('punkt')
nltk.download('stopwords')
app = FastAPI()

model_1 = None
model_2 = None
model_3 = None
model_4 = None

# TTL caches from cachetools: up to 100 entries each, expiring after 600 s (10 minutes).
cache_1 = TTLCache(maxsize=100, ttl=600)
cache_2 = TTLCache(maxsize=100, ttl=600)
cache_3 = TTLCache(maxsize=100, ttl=600)
cache_4 = TTLCache(maxsize=100, ttl=600)

previous_responses_1 = []
previous_responses_2 = []
previous_responses_3 = []
previous_responses_4 = []

MAX_TOKENS = 2048
executor = ThreadPoolExecutor(max_workers=4)

# Configuration to run on CPU only.
device = torch.device("cpu")
def get_best_response(new_response, previous_responses):
    if not previous_responses:
        return new_response
    vectorizer = TfidfVectorizer().fit_transform(previous_responses + [new_response])
    cosine_sim = cosine_similarity(vectorizer[-1], vectorizer[:-1])
    max_sim_index = cosine_sim.argmax()
    max_sim_score = cosine_sim[0][max_sim_index]
    if max_sim_score > 0.7:
        return previous_responses[max_sim_index]
    return new_response

def summarize_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    word_frequencies = Counter()
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [word for word in words if word.isalpha() and word not in stop_words]
        word_frequencies.update(words)
    most_common_words = word_frequencies.most_common(50)
    most_common_words = {word: freq for word, freq in most_common_words}
    ranked_sentences = []
    for sentence in sentences:
        score = sum(most_common_words.get(word, 0) for word in word_tokenize(sentence.lower()))
        ranked_sentences.append((score, sentence))
    ranked_sentences.sort(reverse=True, key=lambda x: x[0])
    summary = ' '.join([sentence for _, sentence in ranked_sentences[:3]])
    return summary

def clear_memory():
    gc.collect()
    process = psutil.Process(os.getpid())
    memory_usage = psutil.virtual_memory().percent
    if memory_usage > 90:
        global model_1, model_2, model_3, model_4
        model_1 = None
        model_2 = None
        model_3 = None
        model_4 = None
        gc.collect()

def apply_pruning(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.random_unstructured(module, name="weight", amount=0.2)
            prune.remove(module, name="weight")
    return model

def split_input(input_text, max_tokens):
    # Greedy chunking: each whitespace-separated word counts as one token
    # (a rough approximation, not a real tokenizer).
    tokens = input_text.split()
    chunks = []
    chunk = []
    total_tokens = 0
    for word in tokens:
        if total_tokens + 1 > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            total_tokens = 1
        else:
            chunk.append(word)
            total_tokens += 1
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

def split_output(output_text, max_tokens):
    # Same chunking logic as split_input; kept as a separate name for its callers.
    return split_input(output_text, max_tokens)

def create_langchain_model(model_name: str, device: torch.device, cache, previous_responses):
    # The LangChain VLLM wrapper takes the model id via `model`; vLLM handles device
    # placement itself, so the `device` argument is kept only for interface
    # compatibility with the callers in load_models().
    vllm_llm = VLLM(model=model_name)
    template = """
    You are a helpful assistant. Given the following text, generate a meaningful response:
    {input_text}
    """
    prompt = PromptTemplate(input_variables=["input_text"], template=template)
    chain = LLMChain(llm=vllm_llm, prompt=prompt)

    def generate_for_model(input_text):
        cached_output = cache.get(input_text)
        if cached_output:
            return cached_output
        input_chunks = split_input(input_text, MAX_TOKENS)
        output_text = ""
        prev_output = ""
        for chunk in input_chunks:
            # Prepend the tail of the previous output so long inputs keep some context.
            chunk_prompt = prev_output + chunk
            output_text += chain.run(input_text=chunk_prompt)
            prev_output = " ".join(output_text.split()[-50:]) + " "
        output_chunks = split_output(output_text, MAX_TOKENS)
        # Reuse a sufficiently similar earlier answer instead of the fresh first chunk.
        best_response = get_best_response(output_chunks[0], previous_responses)
        cache[input_text] = best_response
        previous_responses.append(best_response)
        return best_response

    return generate_for_model

async def load_models():
    global model_1, model_2, model_3, model_4
    model_1 = create_langchain_model("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device, cache_1, previous_responses_1)
    model_2 = create_langchain_model("Qwen/Qwen2.5-Coder-1.5B", device, cache_2, previous_responses_2)
    model_3 = create_langchain_model("Qwen/Qwen2.5-3B-Instruct", device, cache_3, previous_responses_3)
    model_4 = create_langchain_model("gpt2", device, cache_4, previous_responses_4)
    print("Models loaded successfully.")

async def optimize_models_periodically():
    # Reload the models once an hour.
    while True:
        await load_models()
        await asyncio.sleep(3600)

async def monitor_memory():
    # Check memory pressure every minute and drop the models if usage gets too high.
    while True:
        clear_memory()
        await asyncio.sleep(60)

async def startup():
    await load_models()
    # Run the maintenance loops as background tasks so startup can complete;
    # registering infinite loops directly as startup handlers would block the server.
    asyncio.create_task(monitor_memory())
    asyncio.create_task(optimize_models_periodically())

app.add_event_handler("startup", startup)

async def generate_response(model_name: str, input_text: str):
    models = {"model1": model_1, "model2": model_2, "model3": model_3, "model4": model_4}
    model = models.get(model_name)
    if model is None:
        return {"error": "Model not found"}
    result = await asyncio.get_running_loop().run_in_executor(executor, model, input_text)
    return {f"{model_name}_output": result}

async def unified_summary(input_text: str):
    output1 = await generate_response(model_name="model1", input_text=input_text)
    output2 = await generate_response(model_name="model2", input_text=input_text)
    output3 = await generate_response(model_name="model3", input_text=input_text)
    output4 = await generate_response(model_name="model4", input_text=input_text)
    combined_response = " ".join([
        output1.get("model1_output", ""),
        output2.get("model2_output", ""),
        output3.get("model3_output", ""),
        output4.get("model4_output", ""),
    ])
    summarized_response = summarize_text(combined_response)
    return {"summary": summarized_response}