import gc
import psutil
import os
import torch
from fastapi import FastAPI
from langchain.llms import VLLM
from cachetools import TTLCache
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import asyncio
import torch.nn.utils.prune as prune
from concurrent.futures import ThreadPoolExecutor
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

nltk.download('punkt')
nltk.download('stopwords')
app = FastAPI()

model_1 = None
model_2 = None
model_3 = None
model_4 = None

# TTL caches from cachetools: up to 100 entries each, expiring after 600 s (10 minutes).
cache_1 = TTLCache(maxsize=100, ttl=600)
cache_2 = TTLCache(maxsize=100, ttl=600)
cache_3 = TTLCache(maxsize=100, ttl=600)
cache_4 = TTLCache(maxsize=100, ttl=600)

previous_responses_1 = []
previous_responses_2 = []
previous_responses_3 = []
previous_responses_4 = []

MAX_TOKENS = 2048
executor = ThreadPoolExecutor(max_workers=4)

# Configuration to run on CPU only.
device = torch.device("cpu")
def get_best_response(new_response, previous_responses):
    if not previous_responses:
        return new_response
    vectorizer = TfidfVectorizer().fit_transform(previous_responses + [new_response])
    cosine_sim = cosine_similarity(vectorizer[-1], vectorizer[:-1])
    max_sim_index = cosine_sim.argmax()
    max_sim_score = cosine_sim[0][max_sim_index]
    if max_sim_score > 0.7:
        return previous_responses[max_sim_index]
    return new_response

def summarize_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    word_frequencies = Counter()
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [word for word in words if word.isalpha() and word not in stop_words]
        word_frequencies.update(words)
    most_common_words = word_frequencies.most_common(50)
    most_common_words = {word: freq for word, freq in most_common_words}
    ranked_sentences = []
    for sentence in sentences:
        score = sum(most_common_words.get(word, 0) for word in word_tokenize(sentence.lower()))
        ranked_sentences.append((score, sentence))
    ranked_sentences.sort(reverse=True, key=lambda x: x[0])
    summary = ' '.join([sentence for _, sentence in ranked_sentences[:3]])
    return summary

def clear_memory():
    gc.collect()
    process = psutil.Process(os.getpid())
    memory_usage = psutil.virtual_memory().percent
    if memory_usage > 90:
        global model_1, model_2, model_3, model_4
        model_1 = None
        model_2 = None
        model_3 = None
        model_4 = None
        gc.collect()

def apply_pruning(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.random_unstructured(module, name="weight", amount=0.2)
            prune.remove(module, name="weight")
    return model

def split_input(input_text, max_tokens):
    # Greedy chunking: each whitespace-separated word counts as one token
    # (a rough approximation, not a real tokenizer).
    tokens = input_text.split()
    chunks = []
    chunk = []
    total_tokens = 0
    for word in tokens:
        if total_tokens + 1 > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            total_tokens = 1
        else:
            chunk.append(word)
            total_tokens += 1
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

def split_output(output_text, max_tokens):
    # Same chunking logic as split_input; kept as a separate name for its callers.
    return split_input(output_text, max_tokens)

def create_langchain_model(model_name: str, device: torch.device, cache, previous_responses):
    # The LangChain VLLM wrapper takes the model id via `model`; vLLM handles device
    # placement itself, so the `device` argument is kept only for interface
    # compatibility with the callers in load_models().
    vllm_llm = VLLM(model=model_name)
    template = """
    You are a helpful assistant. Given the following text, generate a meaningful response:
    {input_text}
    """
    prompt = PromptTemplate(input_variables=["input_text"], template=template)
    chain = LLMChain(llm=vllm_llm, prompt=prompt)

    def generate_for_model(input_text):
        cached_output = cache.get(input_text)
        if cached_output:
            return cached_output
        input_chunks = split_input(input_text, MAX_TOKENS)
        output_text = ""
        prev_output = ""
        for chunk in input_chunks:
            # Prepend the tail of the previous output so long inputs keep some context.
            chunk_prompt = prev_output + chunk
            output_text += chain.run(input_text=chunk_prompt)
            prev_output = " ".join(output_text.split()[-50:]) + " "
        output_chunks = split_output(output_text, MAX_TOKENS)
        # Reuse a sufficiently similar earlier answer instead of the fresh first chunk.
        best_response = get_best_response(output_chunks[0], previous_responses)
        cache[input_text] = best_response
        previous_responses.append(best_response)
        return best_response

    return generate_for_model

async def load_models():
    global model_1, model_2, model_3, model_4
    model_1 = create_langchain_model("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device, cache_1, previous_responses_1)
    model_2 = create_langchain_model("Qwen/Qwen2.5-Coder-1.5B", device, cache_2, previous_responses_2)
    model_3 = create_langchain_model("Qwen/Qwen2.5-3B-Instruct", device, cache_3, previous_responses_3)
    model_4 = create_langchain_model("gpt2", device, cache_4, previous_responses_4)
    print("Models loaded successfully.")

async def optimize_models_periodically():
    # Reload the models once an hour.
    while True:
        await load_models()
        await asyncio.sleep(3600)

async def monitor_memory():
    # Check memory pressure every minute and drop the models if usage gets too high.
    while True:
        clear_memory()
        await asyncio.sleep(60)

async def startup():
    await load_models()
    # Run the maintenance loops as background tasks so startup can complete;
    # registering infinite loops directly as startup handlers would block the server.
    asyncio.create_task(monitor_memory())
    asyncio.create_task(optimize_models_periodically())

app.add_event_handler("startup", startup)

async def generate_response(model_name: str, input_text: str):
    models = {"model1": model_1, "model2": model_2, "model3": model_3, "model4": model_4}
    model = models.get(model_name)
    if model is None:
        return {"error": "Model not found"}
    result = await asyncio.get_running_loop().run_in_executor(executor, model, input_text)
    return {f"{model_name}_output": result}

async def unified_summary(input_text: str):
    output1 = await generate_response(model_name="model1", input_text=input_text)
    output2 = await generate_response(model_name="model2", input_text=input_text)
    output3 = await generate_response(model_name="model3", input_text=input_text)
    output4 = await generate_response(model_name="model4", input_text=input_text)
    combined_response = " ".join([
        output1.get("model1_output", ""),
        output2.get("model2_output", ""),
        output3.get("model3_output", ""),
        output4.get("model4_output", ""),
    ])
    summarized_response = summarize_text(combined_response)
    return {"summary": summarized_response}