import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import spaces  # Import the spaces library

# Model IDs from the Hugging Face Hub (fixed: Unsloth 4-bit DeepSeek-R1 distills, 7B and 32B)
model_ids = {
    "7B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit", # Unsloth 7B model
    "32B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit",       # Unsloth 32B model
}

models = {}      # Lazily populated cache: size -> loaded model
tokenizers = {}  # Lazily populated cache: size -> tokenizer

# BitsAndBytesConfig for 4-bit NF4 quantization (applied to both models)
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, # Or torch.float16 if needed
)


def get_model_and_tokenizer(size): # Function to load model on demand
    if size not in models: # Load only if not already loaded
        model_id = model_ids[size]
        print(f"Loading {size} model: {model_id} on demand")
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config_4bit, # Apply 4-bit config for BOTH models
            torch_dtype=torch.bfloat16, # Or torch.float16 if needed
            device_map='auto',
            trust_remote_code=True
        )
        models[size] = model
        tokenizers[size] = tokenizer
        print(f"Loaded {size} model on demand.")
    return models[size], tokenizers[size]
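
# Example usage (the same call pattern appears in swarm_agent_sequential_rag below):
#   model_7b, tokenizer_7b = get_model_and_tokenizer("7B-Unsloth")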


# Default prompt templates. The "1_5b" variable/placeholder names are legacy from an
# earlier 1.5B + 7B pipeline; here the first template drives the 7B brainstorming stage
# and the second drives the 32B code-generation stage.
default_prompt_1_5b = """**Code Analysis Task**
As a Senior Code Analyst, analyze this programming problem:

**User Request:**
{user_prompt}

**Relevant Context:**
{context_1_5b}

**Analysis Required:**
1. Briefly break down the problem, including key constraints and edge cases.
2. Suggest 2-3 potential approach options (algorithms/data structures).
3. Recommend ONE primary strategy and briefly justify your choice.
4. Provide a very brief initial pseudocode sketch of the core logic."""


default_prompt_7b = """**Code Implementation Task**
As a Principal Software Engineer, provide production-ready Streamlit/Python code based on this analysis:

**Initial Analysis:**
{response_1_5b}

**Relevant Context:**
{context_7b}

**Code Requirements:**
1.  Generate concise, production-grade Python code for a Streamlit app.
2.  Include necessary imports, UI elements, and basic functionality.
3.  Add comments for clarity.
    """


# --- Shared Memory Implementation ---
shared_memory = []

def store_in_memory(memory_item):
    shared_memory.append(memory_item)
    print(f"\n[Memory Stored]: {memory_item[:50]}...")

def retrieve_from_memory(query, top_k=2):
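    """Return up to top_k stored memories containing the query as a case-insensitive
    substring (naive keyword matching rather than embedding-based retrieval)."""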
    relevant_memories = []
    query_lower = query.lower()
    for memory_item in shared_memory:
        if query_lower in memory_item.lower():
            relevant_memories.append(memory_item)

    if not relevant_memories:
        print("\n[Memory Retrieval]: No relevant memories found.")
        return []

    print(f"\n[Memory Retrieval]: Found {len(relevant_memories)} relevant memories.")
    return relevant_memories[:top_k]


# --- Swarm Agent Function - Fixed Models (Unsloth 7B + 32B) ---
@spaces.GPU  # Run this function on a ZeroGPU-allocated GPU
def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.5, top_p=0.9, max_new_tokens=300):
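    """Two-stage pipeline: the 7B Unsloth model drafts an initial analysis
    (brainstorming), then the 32B Unsloth model generates the final code.
    Both stages read from and write to the keyword-matching shared memory above."""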
    global shared_memory
    shared_memory = [] # Clear memory for each new request

    print(f"\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED - Final Model: 32B Unsloth ---") # Updated message

    # 7B Unsloth Model - Brainstorming / Initial Draft (lazily loaded)
    print("\n[7B Unsloth Model - Brainstorming] - GPU Accelerated")
    model_7b, tokenizer_7b = get_model_and_tokenizer("7B-Unsloth")
    retrieved_memory_7b = retrieve_from_memory(user_prompt)
    context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."

    # Fill the user-provided brainstorming template for the 7B model
    prompt_7b_brainstorm = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_7b)  # Legacy "1.5B" template/placeholders now feed the 7B stage

    input_ids_7b = tokenizer_7b.encode(prompt_7b_brainstorm, return_tensors="pt").to(model_7b.device)
    output_7b = model_7b.generate(
        input_ids_7b,
        max_new_tokens=max_new_tokens, # Use user-defined max_new_tokens
        temperature=temperature, # Use user-defined temperature
        top_p=top_p,           # Use user-defined top_p
        do_sample=True
    )
    response_7b = tokenizer_7b.decode(output_7b[0], skip_special_tokens=True)
    print(f"7B Unsloth Response (Brainstorming):\n{response_7b}") # Updated message
    store_in_memory(f"7B Unsloth Model Initial Response: {response_7b[:200]}...")

    # 32B Unsloth Model - Final Code Generation (lazily loaded)
    final_model, final_tokenizer = get_model_and_tokenizer("32B-Unsloth")
    print("\n[32B Unsloth Model - Final Code Generation] - GPU Accelerated")
    model_stage_name = "32B Unsloth Model - Final Code"
    final_max_new_tokens = max_new_tokens + 200 # More tokens for 32B model

    retrieved_memory_final = retrieve_from_memory(response_7b) # Memory from 7B brainstorm
    context_final = "\n".join([f"- {mem}" for mem in retrieved_memory_final]) if retrieved_memory_final else "No relevant context found in memory."

    # Use user-provided prompt template for final model (using 7B template)
    prompt_final = prompt_7b_template.format(response_1_5b=response_7b, context_7b=context_final) # Using prompt_7b_template for final stage


    input_ids_final = final_tokenizer.encode(prompt_final, return_tensors="pt").to(final_model.device)
    output_final = final_model.generate(
        input_ids_final,
        max_new_tokens=final_max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True
    )
    response_final = final_tokenizer.decode(output_final[0], skip_special_tokens=True)
    print(f"{model_stage_name} Response:\n{response_final}")
    store_in_memory(f"{model_stage_name} Response: {response_final[:200]}...")

    return response_final # Returns final model's response


# --- Gradio ChatInterface (fixed models, no model selection dropdown) ---
def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text):
    # history is automatically managed by ChatInterface
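    # The remaining parameters arrive in the same order as the additional_inputs list below.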
    response = swarm_agent_sequential_rag(
        message,
        prompt_1_5b_template=prompt_1_5b_text, # Pass prompt templates
        prompt_7b_template=prompt_7b_text,
        temperature=temp,
        top_p=top_p,
        max_new_tokens=int(max_tokens) # Ensure max_tokens is an integer
    )
    return response

iface = gr.ChatInterface(
    fn=gradio_interface,
    # Additional inputs for generation settings and prompt templates (no model dropdown)
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0), # Use Number for integer tokens
        gr.Textbox(value=default_prompt_1_5b, lines=10, label="Brainstorming Model Prompt Template (Unsloth 7B)"),
        gr.Textbox(value=default_prompt_7b, lines=10, label="Code Generation Prompt Template (Unsloth 32B)"),
    ],
    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - Fixed Models: Unsloth 7B + 32B)", # Updated title
    description="Chat with a DeepSeek agent swarm (Unsloth 7B + 32B) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)", # Updated description
)

if __name__ == "__main__":
    iface.launch()  # Launch the Gradio app when the script is run directly