import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces  # Import the spaces library

# Model IDs from Hugging Face Hub (1.5B and 7B DeepSeek-R1 distilled models)
model_ids = {
    "1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
}

# Default Prompts - User can override these in the UI
default_prompt_1_5b = """**Code Analysis Task**
As a Senior Code Analyst, process this programming problem:

**User Request**
{user_prompt}

**Context from Memory**
{context_1_5b}

**Required Output Format**
1. Problem Breakdown:
   - Input/Output requirements
   - Key constraints
   - Edge cases to consider

2. Approach Options:
   - [Option 1] Algorithm/data structure choices
   - [Option 2] Alternative solutions
   - Time/space complexity analysis

3. Recommended Strategy:
   - Best approach selection rationale
   - Potential pitfalls to avoid

4. Initial Pseudocode Sketch:
   - High-level structure
   - Critical function definitions"""

default_prompt_7b = """**Code Implementation Task**
As a Principal Software Engineer, finalize this solution:

**Initial Analysis**
{response_1_5b}

**Context from Memory**
{context_7b}

**Required Output Format**
1. Optimized Solution:
   - Final algorithm choice justification
   - Complexity analysis (Big O)

2. Production-Grade Code:
   - Clean, modular implementation
   - Language: [Python/JS/etc] (infer from question)
   - Error handling
   - Documentation

3. Testing Plan:
   - Sample test cases (normal/edge cases)
   - Potential failure points

4. Optimization Opportunities:
   - Alternative approaches for different constraints
   - Parallelization/performance tips
   - Memory management considerations

5. Debugging Guide:
   - Common mistakes
   - Logging suggestions
   - Step-through example"""


# Function to load a model and its tokenizer
def load_model_and_tokenizer(model_id):
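    """Load a causal LM and its tokenizer from the Hub; device placement is
    delegated to accelerate via device_map='auto'."""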
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16, # Or torch.float16 if you prefer
        device_map='auto', # Let accelerate decide (will use GPU when @spaces.GPU active)
        trust_remote_code=True
    )
    return model, tokenizer

# Load the selected models and tokenizers
models = {}
tokenizers = {}
for size, model_id in model_ids.items():
    print(f"Loading {size} model: {model_id}")
    models[size], tokenizers[size] = load_model_and_tokenizer(model_id)
    print(f"Loaded {size} model.")

# --- Shared Memory Implementation ---
# A simple in-process list shared by both agents; it is cleared at the start of every request.
shared_memory = []

def store_in_memory(memory_item):
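    """Append an item to the shared memory list and log a short preview of it."""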
    shared_memory.append(memory_item)
    print(f"\n[Memory Stored]: {memory_item[:50]}...")

def retrieve_from_memory(query, top_k=2):
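    """Naive keyword retrieval: return up to top_k stored items that contain
    the query as a case-insensitive substring."""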
    relevant_memories = []
    query_lower = query.lower()
    for memory_item in shared_memory:
        if query_lower in memory_item.lower():
            relevant_memories.append(memory_item)

    if not relevant_memories:
        print("\n[Memory Retrieval]: No relevant memories found.")
        return []

    print(f"\n[Memory Retrieval]: Found {len(relevant_memories)} relevant memories.")
    return relevant_memories[:top_k]


# --- Swarm Agent Function with Shared Memory (RAG) - DECORATED with @spaces.GPU ---
@spaces.GPU  # Request ZeroGPU hardware for the duration of this call
def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.7, top_p=0.9, max_new_tokens=300):
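    """Two-stage pipeline: the 1.5B model drafts an initial analysis, then the
    7B model elaborates it into the final answer. Both stages read from and
    write to the shared memory list."""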
    global shared_memory
    shared_memory = [] # Clear memory for each new request

    print("\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED ---") # Updated message

    # 1.5B Model - Brainstorming/Initial Draft
    print("\n[1.5B Model - Brainstorming] - GPU Accelerated") # Added GPU indication
    retrieved_memory_1_5b = retrieve_from_memory(user_prompt)
    context_1_5b = "\n".join([f"- {mem}" for mem in retrieved_memory_1_5b]) if retrieved_memory_1_5b else "No relevant context found in memory."

    # Use user-provided prompt template for 1.5B model
    prompt_1_5b = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_1_5b)

    input_ids_1_5b = tokenizers["1.5B"].encode(prompt_1_5b, return_tensors="pt").to(models["1.5B"].device)
    output_1_5b = models["1.5B"].generate(
        input_ids_1_5b,
        max_new_tokens=max_new_tokens, # Use user-defined max_new_tokens
        temperature=temperature, # Use user-defined temperature
        top_p=top_p,           # Use user-defined top_p
        do_sample=True
    )
    # Decode only the newly generated tokens so the echoed prompt is not fed into the 7B stage
    response_1_5b = tokenizers["1.5B"].decode(output_1_5b[0][input_ids_1_5b.shape[-1]:], skip_special_tokens=True)
    print(f"1.5B Response:\n{response_1_5b}")
    store_in_memory(f"1.5B Model Initial Response: {response_1_5b[:200]}...")

    # 7B Model - Elaboration and Detail
    print("\n[7B Model - Elaboration] - GPU Accelerated") # Added GPU indication
    retrieved_memory_7b = retrieve_from_memory(response_1_5b)
    context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."

    # Use user-provided prompt template for 7B model
    prompt_7b = prompt_7b_template.format(response_1_5b=response_1_5b, context_7b=context_7b)


    input_ids_7b = tokenizers["7B"].encode(prompt_7b, return_tensors="pt").to(models["7B"].device)
    output_7b = models["7B"].generate(
        input_ids_7b,
        max_new_tokens=max_new_tokens + 100, # Slightly more tokens for 7B
        temperature=temperature, # Use user-defined temperature
        top_p=top_p,           # Use user-defined top_p
        do_sample=True
    )
    # Decode only the newly generated tokens so the echoed prompt is not shown to the user
    response_7b = tokenizers["7B"].decode(output_7b[0][input_ids_7b.shape[-1]:], skip_special_tokens=True)
    print(f"7B Response:\n{response_7b}")
    store_in_memory(f"7B Model Elaborated Response: {response_7b[:200]}...")

    return response_7b  # The 7B model's response is the final output


# --- Gradio ChatInterface ---
def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text): # Accept prompt textboxes
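    """Wire the chat message and UI settings through to the swarm agent and
    return its final response."""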
    # history is automatically managed by ChatInterface
    response = swarm_agent_sequential_rag(
        message,
        prompt_1_5b_template=prompt_1_5b_text, # Pass prompt templates
        prompt_7b_template=prompt_7b_text,
        temperature=temp,
        top_p=top_p,
        max_new_tokens=int(max_tokens) # Ensure max_tokens is an integer
    )
    return response

iface = gr.ChatInterface(
    fn=gradio_interface,
    # Define additional inputs for settings and prompts
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.7, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0), # Use Number for integer tokens
        gr.Textbox(value=default_prompt_1_5b, lines=10, label="1.5B Model Prompt Template"), # Textbox for 1.5B prompt
        gr.Textbox(value=default_prompt_7b, lines=10, label="7B Model Prompt Template"),   # Textbox for 7B prompt
    ],
    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - 2 Models) - PROMPT CUSTOMIZATION", # Updated title
    description="Chat with a DeepSeek agent swarm (1.5B, 7B) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)", # Updated description
)

if __name__ == "__main__":
    iface.launch() # Only launch locally if running this script directly