import gc
import psutil
import os
import torch
from fastapi import FastAPI
from langchain.llms import VLLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import asyncio
import torch.nn.utils.prune as prune
from concurrent.futures import ThreadPoolExecutor
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
nltk.download('punkt')
nltk.download('stopwords')
app = FastAPI()
model_1 = None
model_2 = None
model_3 = None
model_4 = None
# Simple in-memory response caches, one per model. The originally imported
# "chatgptcache" package is not available, so a minimal dict-backed cache
# exposing the same get/put interface is defined here instead.
class SimpleCache(dict):
    def put(self, key, value):
        self[key] = value

cache_1 = SimpleCache()
cache_2 = SimpleCache()
cache_3 = SimpleCache()
cache_4 = SimpleCache()
previous_responses_1 = []
previous_responses_2 = []
previous_responses_3 = []
previous_responses_4 = []
MAX_TOKENS = 2048
executor = ThreadPoolExecutor(max_workers=4)
# Configuration: run everything on CPU only
device = torch.device("cpu")
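# Return a previously seen response when the new one is highly similar
# (TF-IDF cosine similarity above 0.7); otherwise keep the new response.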
def get_best_response(new_response, previous_responses):
if not previous_responses:
return new_response
vectorizer = TfidfVectorizer().fit_transform(previous_responses + [new_response])
cosine_sim = cosine_similarity(vectorizer[-1], vectorizer[:-1])
max_sim_index = cosine_sim.argmax()
max_sim_score = cosine_sim[0][max_sim_index]
if max_sim_score > 0.7:
return previous_responses[max_sim_index]
return new_response
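# Frequency-based extractive summary: score sentences by how many of the 50
# most common non-stopword tokens they contain and keep the top three.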
def summarize_text(text):
sentences = sent_tokenize(text)
stop_words = set(stopwords.words("english"))
word_frequencies = Counter()
for sentence in sentences:
words = word_tokenize(sentence.lower())
words = [word for word in words if word.isalpha() and word not in stop_words]
word_frequencies.update(words)
most_common_words = word_frequencies.most_common(50)
most_common_words = {word: freq for word, freq in most_common_words}
ranked_sentences = []
for sentence in sentences:
score = sum(most_common_words.get(word, 0) for word in word_tokenize(sentence.lower()))
ranked_sentences.append((score, sentence))
ranked_sentences.sort(reverse=True, key=lambda x: x[0])
summary = ' '.join([sentence for _, sentence in ranked_sentences[:3]])
return summary
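# Free memory opportunistically; if system memory usage exceeds 90%, drop the
# loaded model references so they can be garbage-collected.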
def clear_memory():
gc.collect()
memory_usage = psutil.virtual_memory().percent
if memory_usage > 90:
global model_1, model_2, model_3, model_4
model_1 = None
model_2 = None
model_3 = None
model_4 = None
gc.collect()
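# Randomly prune 20% of the weights in every torch.nn.Linear layer of a model.
# Note: this helper is defined but not called anywhere in this file.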
def apply_pruning(model):
for name, module in model.named_modules():
if isinstance(module, torch.nn.Linear):
prune.random_unstructured(module, name="weight", amount=0.2)
prune.remove(module, name="weight")
return model
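# Split a long text into chunks of at most max_tokens whitespace-separated
# words so each piece stays within the model's context budget.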
def split_input(input_text, max_tokens):
tokens = input_text.split()
chunks = []
chunk = []
total_tokens = 0
for word in tokens:
word_length = len(word.split())
if total_tokens + word_length > max_tokens:
chunks.append(" ".join(chunk))
chunk = [word]
total_tokens = word_length
else:
chunk.append(word)
total_tokens += word_length
if chunk:
chunks.append(" ".join(chunk))
return chunks
# The output chunking logic is identical to the input chunking above, so the
# same function is reused under the name the rest of the code expects.
split_output = split_input
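# Build a LangChain LLMChain around a vLLM-backed model and return a callable
# that handles caching, input chunking and response deduplication.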
def create_langchain_model(model_name: str, device: torch.device, cache, previous_responses):
    # LangChain's VLLM wrapper takes the model identifier via the "model"
    # argument; vLLM manages device placement itself, so the torch device is
    # not passed through here (CPU execution depends on the installed vLLM build).
    vllm_llm = VLLM(model=model_name)
template = """
You are a helpful assistant. Given the following text, generate a meaningful response:
{input_text}
"""
prompt = PromptTemplate(input_variables=["input_text"], template=template)
chain = LLMChain(llm=vllm_llm, prompt=prompt)
def generate_for_model(input_text):
cached_output = cache.get(input_text)
if cached_output:
return cached_output
input_chunks = split_input(input_text, MAX_TOKENS)
output_text = ""
prev_output = ""
        for chunk in input_chunks:
            # Prepend the tail of the previous output as lightweight context for continuity.
            chunk_prompt = (prev_output + " " + chunk).strip()
            output_text += chain.run(input_text=chunk_prompt)
            # Keep only the last 50 words as a string; the original assigned a list
            # here, which would break the string concatenation on the next iteration.
            prev_output = " ".join(output_text.split()[-50:])
        output_chunks = split_output(output_text, MAX_TOKENS)
        best_response = get_best_response(output_chunks[0] if output_chunks else "", previous_responses)
cache.put(input_text, best_response)
previous_responses.append(best_response)
return best_response
return generate_for_model
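# (Re)load the four model pipelines into the module-level globals.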
async def load_models():
global model_1, model_2, model_3, model_4
model_1 = create_langchain_model("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device, cache_1, previous_responses_1)
model_2 = create_langchain_model("Qwen/Qwen2.5-Coder-1.5B", device, cache_2, previous_responses_2)
model_3 = create_langchain_model("Qwen/Qwen2.5-3B-Instruct", device, cache_3, previous_responses_3)
model_4 = create_langchain_model("gpt2", device, cache_4, previous_responses_4)
print("Modelos cargados exitosamente.")
async def optimize_models_periodically():
while True:
await load_models()
await asyncio.sleep(3600)
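# On startup, load the models and launch the background maintenance loops.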
@app.on_event("startup")
async def startup():
await load_models()
    # Run the maintenance loops as background tasks; registering them as extra
    # startup handlers here (as the original did) would never execute them,
    # because Starlette only runs handlers registered before startup begins.
    asyncio.create_task(monitor_memory())
    asyncio.create_task(optimize_models_periodically())
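# Check memory pressure every minute and release models if needed.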
async def monitor_memory():
while True:
clear_memory()
await asyncio.sleep(60)
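# Generate a response with one of the four models, running the blocking
# LangChain call in a worker thread to keep the event loop responsive.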
@app.get("/generate")
async def generate_response(model_name: str, input_text: str):
    # Map request names to the loaded model callables; unknown names (or models
    # that were unloaded to free memory) return an error instead of crashing.
    models = {"model1": model_1, "model2": model_2, "model3": model_3, "model4": model_4}
    model = models.get(model_name)
    if model is None:
        return {"error": "Model not found"}
    result = await asyncio.get_event_loop().run_in_executor(executor, model, input_text)
    return {f"{model_name}_output": result}
@app.get("/unified_summary")
async def unified_summary(input_text: str):
output1 = await generate_response(model_name="model1", input_text=input_text)
output2 = await generate_response(model_name="model2", input_text=input_text)
output3 = await generate_response(model_name="model3", input_text=input_text)
output4 = await generate_response(model_name="model4", input_text=input_text)
combined_response = output1.get("model1_output", "") + " " + \
output2.get("model2_output", "") + " " + \
output3.get("model3_output", "") + " " + \
output4.get("model4_output", "")
summarized_response = summarize_text(combined_response)
return {"summary": summarized_response}