import gc
import psutil
import os
import torch
from fastapi import FastAPI
from langchain.llms import VLLM
from cachetools import TTLCache
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import asyncio
import torch.nn.utils.prune as prune
from concurrent.futures import ThreadPoolExecutor
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

nltk.download('punkt')
nltk.download('stopwords')

app = FastAPI()

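# Placeholders for the per-model generator closures created by
# create_langchain_model() when load_models() runs.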
model_1 = None
model_2 = None
model_3 = None
model_4 = None

# Using TTLCache from cachetools
cache_1 = TTLCache(maxsize=100, ttl=600)  # maxsize=100 and ttl=600 (10 minutes)
cache_2 = TTLCache(maxsize=100, ttl=600)
cache_3 = TTLCache(maxsize=100, ttl=600)
cache_4 = TTLCache(maxsize=100, ttl=600)

previous_responses_1 = []
previous_responses_2 = []
previous_responses_3 = []
previous_responses_4 = []

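# Per-chunk budget used by split_input/split_output, counted in whitespace-separated words.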
MAX_TOKENS = 2048

executor = ThreadPoolExecutor(max_workers=4)

# Configuration: run on CPU only
device = torch.device("cpu")

def get_best_response(new_response, previous_responses):
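    """Return a stored previous response if it is highly similar (TF-IDF cosine > 0.7)
    to the new one; otherwise return the new response unchanged."""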
    if not previous_responses:
        return new_response
    vectorizer = TfidfVectorizer().fit_transform(previous_responses + [new_response])
    cosine_sim = cosine_similarity(vectorizer[-1], vectorizer[:-1])
    max_sim_index = cosine_sim.argmax()
    max_sim_score = cosine_sim[0][max_sim_index]
    if max_sim_score > 0.7:
        return previous_responses[max_sim_index]
    return new_response

def summarize_text(text):
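    """Frequency-based extractive summary: score each sentence by the frequencies of
    the 50 most common non-stopword words it contains and return the top three sentences."""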
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    word_frequencies = Counter()
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [word for word in words if word.isalpha() and word not in stop_words]
        word_frequencies.update(words)
    most_common_words = dict(word_frequencies.most_common(50))
    ranked_sentences = []
    for sentence in sentences:
        score = sum(most_common_words.get(word, 0) for word in word_tokenize(sentence.lower()))
        ranked_sentences.append((score, sentence))
    ranked_sentences.sort(reverse=True, key=lambda x: x[0])
    summary = ' '.join([sentence for _, sentence in ranked_sentences[:3]])
    return summary

def clear_memory():
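    """Force a garbage-collection pass and, if system memory usage exceeds 90%,
    drop all loaded models so their memory can be reclaimed."""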
    gc.collect()
    process = psutil.Process(os.getpid())
    memory_usage = psutil.virtual_memory().percent
    if memory_usage > 90:
        global model_1, model_2, model_3, model_4
        model_1 = None
        model_2 = None
        model_3 = None
        model_4 = None
        gc.collect()

def apply_pruning(model):
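    """Randomly prune 20% of the weights of every Linear layer in the model.

    Note: this helper is not called anywhere else in this module.
    """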
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.random_unstructured(module, name="weight", amount=0.2)
            prune.remove(module, name="weight")
    return model

def split_input(input_text, max_tokens):
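    """Split the input into chunks of at most `max_tokens` whitespace-separated words
    (a rough approximation of model tokens)."""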
    tokens = input_text.split()
    chunks = []
    chunk = []
    total_tokens = 0
    for word in tokens:
        word_length = 1  # each whitespace-separated word counts as one "token" in this approximation
        if total_tokens + word_length > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            total_tokens = word_length
        else:
            chunk.append(word)
            total_tokens += word_length
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

def split_output(output_text, max_tokens):
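    """Split generated text into chunks of at most `max_tokens` whitespace-separated
    words; mirrors split_input."""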
    tokens = output_text.split()
    chunks = []
    chunk = []
    total_tokens = 0
    for word in tokens:
        word_length = 1  # each whitespace-separated word counts as one "token" in this approximation
        if total_tokens + word_length > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            total_tokens = word_length
        else:
            chunk.append(word)
            total_tokens += word_length
    if chunk:
        chunks.append(" ".join(chunk))
    return chunks

def create_langchain_model(model_name: str, device: torch.device, cache, previous_responses):
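    """Build a PromptTemplate + LLMChain around a vLLM-backed model and return a
    closure that generates text with TTL caching, input chunking, and reuse of
    similar previous responses."""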
    # The LangChain vLLM wrapper takes the model id via `model`; it has no `device`
    # argument, so CPU-only execution (the intent of `device` here) is assumed to be
    # handled by the installed vLLM build.
    vllm_llm = VLLM(model=model_name)
    template = """
    You are a helpful assistant. Given the following text, generate a meaningful response:
    {input_text}
    """
    prompt = PromptTemplate(input_variables=["input_text"], template=template)
    chain = LLMChain(llm=vllm_llm, prompt=prompt)
    def generate_for_model(input_text):
        cached_output = cache.get(input_text)
        if cached_output:
            return cached_output
        input_chunks = split_input(input_text, MAX_TOKENS)
        output_text = ""
        prev_output = ""
        for chunk in input_chunks:
            # Carry the last ~50 words of the running output as context for the next chunk.
            chunk_prompt = (prev_output + " " + chunk) if prev_output else chunk
            output_text += chain.run(input_text=chunk_prompt)
            prev_output = " ".join(output_text.split()[-50:])
        output_chunks = split_output(output_text, MAX_TOKENS)
        first_chunk = output_chunks[0] if output_chunks else ""
        best_response = get_best_response(first_chunk, previous_responses)
        cache[input_text] = best_response
        previous_responses.append(best_response)
        return best_response
    return generate_for_model

async def load_models():
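    """Create (or recreate) the four generator closures, one per model."""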
    global model_1, model_2, model_3, model_4
    model_1 = create_langchain_model("Hjgugugjhuhjggg/llama-3.2-1B-spinquant-hf", device, cache_1, previous_responses_1)
    model_2 = create_langchain_model("Qwen/Qwen2.5-Coder-1.5B", device, cache_2, previous_responses_2)
    model_3 = create_langchain_model("Qwen/Qwen2.5-3B-Instruct", device, cache_3, previous_responses_3)
    model_4 = create_langchain_model("gpt2", device, cache_4, previous_responses_4)
    print("Modelos cargados exitosamente.")

async def optimize_models_periodically():
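    """Recreate the model closures immediately and then once every hour."""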
    while True:
        await load_models()
        await asyncio.sleep(3600)

@app.on_event("startup")
async def startup():
    await load_models()
    app.add_event_handler("startup", monitor_memory)
    app.add_event_handler("startup", optimize_models_periodically)

async def monitor_memory():
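    """Run clear_memory() once a minute in the background."""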
    while True:
        clear_memory()
        await asyncio.sleep(60)

@app.get("/generate")
async def generate_response(model_name: str, input_text: str):
    if model_name == "model1":
        result = await asyncio.get_event_loop().run_in_executor(executor, model_1, input_text)
    elif model_name == "model2":
        result = await asyncio.get_event_loop().run_in_executor(executor, model_2, input_text)
    elif model_name == "model3":
        result = await asyncio.get_event_loop().run_in_executor(executor, model_3, input_text)
    elif model_name == "model4":
        result = await asyncio.get_event_loop().run_in_executor(executor, model_4, input_text)
    else:
        return {"error": "Model not found"}
    return {f"{model_name}_output": result}

@app.get("/unified_summary")
async def unified_summary(input_text: str):
    output1 = await generate_response(model_name="model1", input_text=input_text)
    output2 = await generate_response(model_name="model2", input_text=input_text)
    output3 = await generate_response(model_name="model3", input_text=input_text)
    output4 = await generate_response(model_name="model4", input_text=input_text)
    combined_response = " ".join([
        output1.get("model1_output", ""),
        output2.get("model2_output", ""),
        output3.get("model3_output", ""),
        output4.get("model4_output", ""),
    ])
    summarized_response = summarize_text(combined_response)
    return {"summary": summarized_response}