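"""FastAPI service exposing four LangChain/vLLM-backed language models on CPU.

Each model has its own TTL cache and response history; near-duplicate answers are
reused via TF-IDF cosine similarity, long inputs and outputs are chunked, and a
frequency-based extractive summarizer merges the four outputs for /unified_summary.
"""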
import gc
import psutil
import os
import torch
from fastapi import FastAPI
from langchain.llms import VLLM
from cachetools import TTLCache
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import asyncio
import torch.nn.utils.prune as prune
from concurrent.futures import ThreadPoolExecutor
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
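# Download the NLTK resources used by summarize_text (sentence/word tokenizers and stopwords).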
nltk.download('punkt')
nltk.download('stopwords')
app = FastAPI()
model_1 = None
model_2 = None
model_3 = None
model_4 = None
# Using TTLCache from cachetools
cache_1 = TTLCache(maxsize=100, ttl=600) # maxsize=100 and ttl=600 (10 minutes)
cache_2 = TTLCache(maxsize=100, ttl=600)
cache_3 = TTLCache(maxsize=100, ttl=600)
cache_4 = TTLCache(maxsize=100, ttl=600)
previous_responses_1 = []
previous_responses_2 = []
previous_responses_3 = []
previous_responses_4 = []
MAX_TOKENS = 2048
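# Blocking LLM calls are pushed onto this thread pool so the async endpoints stay responsive.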
executor = ThreadPoolExecutor(max_workers=4)
# Configuration to run on CPU only
device = torch.device("cpu")
def get_best_response(new_response, previous_responses):
    # Reuse an earlier response when it is highly similar (TF-IDF cosine > 0.7) to the new one.
    if not previous_responses:
        return new_response
    vectorizer = TfidfVectorizer().fit_transform(previous_responses + [new_response])
    cosine_sim = cosine_similarity(vectorizer[-1], vectorizer[:-1])
    max_sim_index = cosine_sim.argmax()
    max_sim_score = cosine_sim[0][max_sim_index]
    if max_sim_score > 0.7:
        return previous_responses[max_sim_index]
    return new_response
def summarize_text(text):
    # Simple extractive summary: score each sentence by the frequency of its non-stopword
    # tokens and keep the three highest-scoring sentences.
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    word_frequencies = Counter()
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [word for word in words if word.isalpha() and word not in stop_words]
        word_frequencies.update(words)
    most_common_words = dict(word_frequencies.most_common(50))
    ranked_sentences = []
    for sentence in sentences:
        score = sum(most_common_words.get(word, 0) for word in word_tokenize(sentence.lower()))
        ranked_sentences.append((score, sentence))
    ranked_sentences.sort(reverse=True, key=lambda x: x[0])
    summary = ' '.join([sentence for _, sentence in ranked_sentences[:3]])
    return summary
def clear_memory():
    gc.collect()
    process = psutil.Process(os.getpid())
    memory_usage = psutil.virtual_memory().percent
    if memory_usage > 90:
        global model_1, model_2, model_3, model_4
        model_1 = None
        model_2 = None
        model_3 = None
        model_4 = None
        gc.collect()
def apply_pruning(model):
    # Randomly prune 20% of the weights of every Linear layer and make the pruning permanent.
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.random_unstructured(module, name="weight", amount=0.2)
            prune.remove(module, name="weight")
    return model
def split_input(input_text, max_tokens):
    tokens = input_text.split()
    chunks = []
    chunk = []
    total_tokens = 0
    for word in tokens:
        word_length = len(word.split())
        if total_tokens + word_length > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            total_tokens = word_length
        else:
            chunk.append(word)
            total_tokens += word_length
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks
def split_output(output_text, max_tokens):
    tokens = output_text.split()
    chunks = []
    chunk = []
    total_tokens = 0
    for word in tokens:
        word_length = len(word.split())
        if total_tokens + word_length > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            total_tokens = word_length
        else:
            chunk.append(word)
            total_tokens += word_length
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks
def create_langchain_model(model_name: str, device: torch.device, cache, previous_responses):
    # LangChain's VLLM wrapper expects the model id via `model`; the `device` argument is
    # kept for interface symmetry but is not forwarded here.
    vllm_llm = VLLM(model=model_name)
    template = """
    You are a helpful assistant. Given the following text, generate a meaningful response:
    {input_text}
    """
    prompt = PromptTemplate(input_variables=["input_text"], template=template)
    chain = LLMChain(llm=vllm_llm, prompt=prompt)

    def generate_for_model(input_text):
        cached_output = cache.get(input_text)
        if cached_output:
            return cached_output
        input_chunks = split_input(input_text, MAX_TOKENS)
        output_text = ""
        prev_output = ""
        for chunk in input_chunks:
            # Prepend the tail of the previous output so consecutive chunks keep some context.
            chunk_prompt = (prev_output + " " + chunk).strip()
            output_text += chain.run(input_text=chunk_prompt)
            prev_output = " ".join(output_text.split()[-50:])
        output_chunks = split_output(output_text, MAX_TOKENS)
        best_response = get_best_response(output_chunks[0] if output_chunks else "", previous_responses)
        cache[input_text] = best_response
        previous_responses.append(best_response)
        return best_response
    return generate_for_model
async def load_models():
    global model_1, model_2, model_3, model_4
    model_1 = create_langchain_model("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device, cache_1, previous_responses_1)
    model_2 = create_langchain_model("Qwen/Qwen2.5-Coder-1.5B", device, cache_2, previous_responses_2)
    model_3 = create_langchain_model("Qwen/Qwen2.5-3B-Instruct", device, cache_3, previous_responses_3)
    model_4 = create_langchain_model("gpt2", device, cache_4, previous_responses_4)
    print("Models loaded successfully.")
async def optimize_models_periodically():
    while True:
        await load_models()
        await asyncio.sleep(3600)
@app.on_event("startup")
async def startup():
await load_models()
app.add_event_handler("startup", monitor_memory)
app.add_event_handler("startup", optimize_models_periodically)
async def monitor_memory():
    while True:
        clear_memory()
        await asyncio.sleep(60)
@app.get("/generate")
async def generate_response(model_name: str, input_text: str):
if model_name == "model1":
result = await asyncio.get_event_loop().run_in_executor(executor, model_1, input_text)
elif model_name == "model2":
result = await asyncio.get_event_loop().run_in_executor(executor, model_2, input_text)
elif model_name == "model3":
result = await asyncio.get_event_loop().run_in_executor(executor, model_3, input_text)
elif model_name == "model4":
result = await asyncio.get_event_loop().run_in_executor(executor, model_4, input_text)
else:
return {"error": "Model not found"}
return {f"{model_name}_output": result}
@app.get("/unified_summary")
async def unified_summary(input_text: str):
output1 = await generate_response(model_name="model1", input_text=input_text)
output2 = await generate_response(model_name="model2", input_text=input_text)
output3 = await generate_response(model_name="model3", input_text=input_text)
output4 = await generate_response(model_name="model4", input_text=input_text)
combined_response = output1.get("model1_output", "") + " " + \
output2.get("model2_output", "") + " " + \
output3.get("model3_output", "") + " " + \
output4.get("model4_output", "")
summarized_response = summarize_text(combined_response)
return {"summary": summarized_response}