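"""DeepSeek Agent Swarm demo for Hugging Face Spaces (ZeroGPU).

A 1.5B DeepSeek-R1 distill drafts an analysis of the user's request, a
selectable 7B or 14B distill turns that analysis into Streamlit code, and a
simple keyword-based shared memory passes context between the two stages.
The UI is a Gradio ChatInterface with adjustable sampling settings and
editable prompt templates.
"""
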
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces  # Hugging Face Spaces helper providing the ZeroGPU @spaces.GPU decorator

# Model IDs on the Hugging Face Hub (1.5B, 7B, and 14B DeepSeek-R1 distills)
model_ids = {
    "1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "14B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", # Added 14B back
}

# Default prompt templates for the two pipeline stages
default_prompt_1_5b = """**Code Analysis Task**
As a Senior Code Analyst, analyze this programming problem:

**User Request:**
{user_prompt}

**Relevant Context:**
{context_1_5b}

**Analysis Required:**
1. Briefly break down the problem, including key constraints and edge cases.
2. Suggest 2-3 potential approach options (algorithms/data structures).
3. Recommend ONE primary strategy and briefly justify your choice. 
4. Provide a very brief initial pseudocode sketch of the core logic.""" 


default_prompt_7b = """**Code Implementation Task**
As a Principal Software Engineer, provide production-ready Streamlit/Python code based on this analysis:

**Initial Analysis:**
{response_1_5b}

**Relevant Context:**
{context_7b}

**Code Requirements:**
1. Generate concise, production-grade Python code for a Streamlit app.
2. Include necessary imports, UI elements, and basic functionality.
3. Add comments for clarity.
"""


# Helper to load a model and its tokenizer
def load_model_and_tokenizer(model_id):
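    """Load a causal LM in bfloat16; device_map='auto' lets accelerate place it
    on a GPU when one is available (e.g. inside an @spaces.GPU call)."""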
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16, # Or torch.float16 if you prefer
        device_map='auto', # Let accelerate decide (will use GPU when @spaces.GPU active)
        trust_remote_code=True
    )
    return model, tokenizer

# Load all models and tokenizers once at startup (1.5B, 7B, and 14B)
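# Note: keeping all three checkpoints resident in bfloat16 needs on the order of
# 45 GB of memory (~2 bytes per parameter for roughly 22.5B parameters total).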
models = {}
tokenizers = {}
for size, model_id in model_ids.items():
    print(f"Loading {size} model: {model_id}")
    models[size], tokenizers[size] = load_model_and_tokenizer(model_id)
    print(f"Loaded {size} model.")

# --- Shared Memory Implementation ---
shared_memory = []

def store_in_memory(memory_item):
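    """Append a text snippet to the shared in-process memory list."""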
    shared_memory.append(memory_item)
    print(f"\n[Memory Stored]: {memory_item[:50]}...")

def retrieve_from_memory(query, top_k=2):
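    """Return up to top_k stored snippets that contain the query as a
    case-insensitive substring (naive keyword matching, not embedding retrieval)."""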
    relevant_memories = []
    query_lower = query.lower()
    for memory_item in shared_memory:
        if query_lower in memory_item.lower():
            relevant_memories.append(memory_item)

    if not relevant_memories:
        print("\n[Memory Retrieval]: No relevant memories found.")
        return []

    print(f"\n[Memory Retrieval]: Found {len(relevant_memories)} relevant memories.")
    return relevant_memories[:top_k]


# --- Swarm Agent Function with Model Swapping ---
@spaces.GPU  # Request a ZeroGPU GPU for the duration of each call
def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, final_model_size="7B", temperature=0.5, top_p=0.9, max_new_tokens=300):
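    """Two-stage pipeline: the 1.5B model drafts an analysis of user_prompt,
    the draft is stored in shared memory, then the selected final model
    (7B or 14B) generates code from it. Returns the final decoded response."""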
    global shared_memory
    shared_memory = [] # Clear memory for each new request

    print(f"\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED - Final Model: {final_model_size} ---") # Updated message

    # Stage 1: 1.5B model - brainstorming / initial draft
    print("\n[1.5B Model - Brainstorming] - GPU Accelerated")
    retrieved_memory_1_5b = retrieve_from_memory(user_prompt)
    context_1_5b = "\n".join([f"- {mem}" for mem in retrieved_memory_1_5b]) if retrieved_memory_1_5b else "No relevant context found in memory."

    # Use user-provided prompt template for 1.5B model
    prompt_1_5b = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_1_5b)

    input_ids_1_5b = tokenizers["1.5B"].encode(prompt_1_5b, return_tensors="pt").to(models["1.5B"].device)
    output_1_5b = models["1.5B"].generate(
        input_ids_1_5b,
        max_new_tokens=max_new_tokens, # Use user-defined max_new_tokens
        temperature=temperature, # Use user-defined temperature
        top_p=top_p,           # Use user-defined top_p
        do_sample=True
    )
    response_1_5b = tokenizers["1.5B"].decode(output_1_5b[0], skip_special_tokens=True)
    print(f"1.5B Response:\n{response_1_5b}")
    store_in_memory(f"1.5B Model Initial Response: {response_1_5b[:200]}...")

    # Final Stage Model Selection (7B or 14B)
    if final_model_size == "7B":
        final_model = models["7B"]
        final_tokenizer = tokenizers["7B"]
        print("\n[7B Model - Final Code Generation] - GPU Accelerated") # Model-specific message
        model_stage_name = "7B Model - Final Code"
        final_max_new_tokens = max_new_tokens + 100 # Slightly more tokens for 7B

    elif final_model_size == "14B":
        final_model = models["14B"]
        final_tokenizer = tokenizers["14B"]
        print("\n[14B Model - Final Code Generation] - GPU Accelerated") # Model-specific message
        model_stage_name = "14B Model - Final Code"
        final_max_new_tokens = max_new_tokens + 200 # Even more tokens for 14B

    else: # Default to 7B if selection is somehow invalid
        final_model = models["7B"]
        final_tokenizer = tokenizers["7B"]
        print("\n[7B Model - Final Code Generation] - GPU Accelerated (Default)")
        model_stage_name = "7B Model - Final Code (Default)"
        final_max_new_tokens = max_new_tokens + 100


    retrieved_memory_final = retrieve_from_memory(response_1_5b)
    context_final = "\n".join([f"- {mem}" for mem in retrieved_memory_final]) if retrieved_memory_final else "No relevant context found in memory."

    # Use the user-provided prompt template for the final stage. The 7B template
    # is reused for the 14B model; define a separate 14B template if needed.
    prompt_final = prompt_7b_template.format(response_1_5b=response_1_5b, context_7b=context_final)


    input_ids_final = final_tokenizer.encode(prompt_final, return_tensors="pt").to(final_model.device)
    output_final = final_model.generate(
        input_ids_final,
        max_new_tokens=final_max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True
    )
    response_final = final_tokenizer.decode(output_final[0], skip_special_tokens=True)
    print(f"{model_stage_name} Response:\n{response_final}")
    store_in_memory(f"{model_stage_name} Response: {response_final[:200]}...")

    return response_final # Returns final model's response


# --- Gradio ChatInterface --- (with Model Selection Dropdown)
def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text, final_model_selector):
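    """Bridge between gr.ChatInterface inputs and swarm_agent_sequential_rag."""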
    # history is automatically managed by ChatInterface
    response = swarm_agent_sequential_rag(
        message,
        prompt_1_5b_template=prompt_1_5b_text, # Pass prompt templates
        prompt_7b_template=prompt_7b_text,
        final_model_size=final_model_selector, # Pass model selection
        temperature=temp,
        top_p=top_p,
        max_new_tokens=int(max_tokens) # Ensure max_tokens is an integer
    )
    return response

iface = gr.ChatInterface(
    fn=gradio_interface,
    # Define additional inputs for settings, prompts, and model selection
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0), # Use Number for integer tokens
        gr.Textbox(value=default_prompt_1_5b, lines=10, label="1.5B Model Prompt Template"), # Textbox for 1.5B prompt
        gr.Textbox(value=default_prompt_7b, lines=10, label="7B Model Prompt Template"),   # Textbox for 7B prompt
        gr.Dropdown(choices=["7B", "14B"], value="7B", label="Final Stage Model (7B or 14B)") # Model selection dropdown
    ],
    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - 2 Models + Model Swap)", # Updated title
    description="Chat with a DeepSeek agent swarm (1.5B + 7B/14B selectable) with shared memory, adjustable settings, **customizable prompts, and model swapping!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)", # Updated description
)

if __name__ == "__main__":
    iface.launch()  # Launch the Gradio app when this script is run directly
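    # Tip: when running locally, iface.launch(share=True) creates a temporary
    # public share link if you need to test from another device.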