import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces # Import the spaces library
# Model IDs from Hugging Face Hub (now only 1.5B and 7B)
model_ids = {
"1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
}
# Default Prompts - User can override these in the UI
default_prompt_1_5b = """**Code Analysis Task**
As a Senior Code Analyst, process this programming problem:
**User Request**
{user_prompt}
**Context from Memory**
{context_1_5b}
**Required Output Format**
1. Problem Breakdown:
- Input/Output requirements
- Key constraints
- Edge cases to consider
2. Approach Options:
- [Option 1] Algorithm/data structure choices
- [Option 2] Alternative solutions
- Time/space complexity analysis
3. Recommended Strategy:
- Best approach selection rationale
- Potential pitfalls to avoid
4. Initial Pseudocode Sketch:
- High-level structure
- Critical function definitions"""
default_prompt_7b = """**Code Implementation Task**
As a Principal Software Engineer, finalize this solution:
**Initial Analysis**
{response_1_5b}
**Context from Memory**
{context_7b}
**Required Output Format**
1. Optimized Solution:
- Final algorithm choice justification
- Complexity analysis (Big O)
2. Production-Grade Code:
- Clean, modular implementation
- Language: [Python/JS/etc] (infer from question)
- Error handling
- Documentation
3. Testing Plan:
- Sample test cases (normal/edge cases)
- Potential failure points
4. Optimization Opportunities:
- Alternative approaches for different constraints
- Parallelization/performance tips
- Memory management considerations
5. Debugging Guide:
- Common mistakes
- Logging suggestions
- Step-through example"""
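# Note: the two templates above are plain Python format strings. They are filled in
# swarm_agent_sequential_rag() via str.format(), so edited templates should keep using
# only the supplied placeholders ({user_prompt}/{context_1_5b} for the 1.5B template,
# {response_1_5b}/{context_7b} for the 7B one); an unrecognized {name} or a stray
# unescaped brace would raise a KeyError/ValueError at format time.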
# Function to load model and tokenizer (slightly adjusted device_map)
def load_model_and_tokenizer(model_id):
    """Load a tokenizer and causal LM, letting accelerate decide weight placement."""
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,  # or torch.float16 if preferred
        device_map='auto',  # let accelerate decide (uses the GPU while @spaces.GPU is active)
        trust_remote_code=True
    )
    return model, tokenizer
# Load the selected models and tokenizers
models = {}
tokenizers = {}
for size, model_id in model_ids.items():
    print(f"Loading {size} model: {model_id}")
    models[size], tokenizers[size] = load_model_and_tokenizer(model_id)
    print(f"Loaded {size} model.")
# --- Shared Memory Implementation --- (Same as before)
shared_memory = []
def store_in_memory(memory_item):
    """Append an item to the shared memory list and log a short preview."""
    shared_memory.append(memory_item)
    print(f"\n[Memory Stored]: {memory_item[:50]}...")
def retrieve_from_memory(query, top_k=2):
    """Return up to top_k stored items containing the query as a case-insensitive substring."""
    relevant_memories = []
    query_lower = query.lower()
    for memory_item in shared_memory:
        if query_lower in memory_item.lower():
            relevant_memories.append(memory_item)
    if not relevant_memories:
        print("\n[Memory Retrieval]: No relevant memories found.")
        return []
    print(f"\n[Memory Retrieval]: Found {len(relevant_memories)} relevant memories.")
    return relevant_memories[:top_k]
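# Example of the naive substring retrieval above (hypothetical values, shown only to
# illustrate the behaviour): after
#   store_in_memory("1.5B Model Initial Response: Use binary search for the lookup...")
# a later call such as
#   retrieve_from_memory("binary search")
# returns that entry, because the query appears (case-insensitively) inside it,
# while retrieve_from_memory("hash map") would return [].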
# --- Swarm Agent Function with Shared Memory (RAG) - DECORATED with @spaces.GPU ---
@spaces.GPU # <---- GPU DECORATOR ADDED HERE!
def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.7, top_p=0.9, max_new_tokens=300):  # Prompt templates are passed in from the UI
    """Run the 1.5B model for analysis, then the 7B model for implementation, sharing memory between the two steps."""
    global shared_memory
    shared_memory = []  # Clear memory for each new request
    print("\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED ---")

    # 1.5B Model - Brainstorming/Initial Draft
    print("\n[1.5B Model - Brainstorming] - GPU Accelerated")
    retrieved_memory_1_5b = retrieve_from_memory(user_prompt)
    context_1_5b = "\n".join([f"- {mem}" for mem in retrieved_memory_1_5b]) if retrieved_memory_1_5b else "No relevant context found in memory."

    # Use the user-provided prompt template for the 1.5B model
    prompt_1_5b = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_1_5b)
    input_ids_1_5b = tokenizers["1.5B"].encode(prompt_1_5b, return_tensors="pt").to(models["1.5B"].device)
    output_1_5b = models["1.5B"].generate(
        input_ids_1_5b,
        max_new_tokens=max_new_tokens,  # User-defined max_new_tokens
        temperature=temperature,  # User-defined temperature
        top_p=top_p,  # User-defined top_p
        do_sample=True
    )
    response_1_5b = tokenizers["1.5B"].decode(output_1_5b[0], skip_special_tokens=True)
    print(f"1.5B Response:\n{response_1_5b}")
    store_in_memory(f"1.5B Model Initial Response: {response_1_5b[:200]}...")

    # 7B Model - Elaboration and Detail
    print("\n[7B Model - Elaboration] - GPU Accelerated")
    # Note: the full 1.5B response is used as the query here, so the simple substring
    # search will rarely match the truncated snippets stored in memory.
    retrieved_memory_7b = retrieve_from_memory(response_1_5b)
    context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."

    # Use the user-provided prompt template for the 7B model
    prompt_7b = prompt_7b_template.format(response_1_5b=response_1_5b, context_7b=context_7b)
    input_ids_7b = tokenizers["7B"].encode(prompt_7b, return_tensors="pt").to(models["7B"].device)
    output_7b = models["7B"].generate(
        input_ids_7b,
        max_new_tokens=max_new_tokens + 100,  # Slightly more tokens for the 7B model
        temperature=temperature,
        top_p=top_p,
        do_sample=True
    )
    response_7b = tokenizers["7B"].decode(output_7b[0], skip_special_tokens=True)
    print(f"7B Response:\n{response_7b}")
    store_in_memory(f"7B Model Elaborated Response: {response_7b[:200]}...")

    return response_7b  # The 7B model's response is returned as the final answer
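# Note: for these decoder-only models, `generate` returns the prompt tokens followed by
# the newly generated ones, so the decoded responses above still begin with the full
# prompt text. If only the continuation is wanted, one possible tweak (a sketch, not
# part of the original flow) is to slice before decoding, e.g.:
#   new_tokens_7b = output_7b[0][input_ids_7b.shape[-1]:]
#   response_7b = tokenizers["7B"].decode(new_tokens_7b, skip_special_tokens=True)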
# --- Gradio ChatInterface ---
def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text):
    """Bridge between gr.ChatInterface and the swarm agent; the extra arguments come from additional_inputs."""
    # history is managed automatically by ChatInterface and is not used here
    response = swarm_agent_sequential_rag(
        message,
        prompt_1_5b_template=prompt_1_5b_text,
        prompt_7b_template=prompt_7b_text,
        temperature=temp,
        top_p=top_p,
        max_new_tokens=int(max_tokens)  # Ensure max_tokens is an integer
    )
    return response
iface = gr.ChatInterface(
    fn=gradio_interface,
    # Additional inputs for generation settings and prompt templates
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.7, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0),  # Number component keeps the token count an integer
        gr.Textbox(value=default_prompt_1_5b, lines=10, label="1.5B Model Prompt Template"),
        gr.Textbox(value=default_prompt_7b, lines=10, label="7B Model Prompt Template"),
    ],
    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - 2 Models) - PROMPT CUSTOMIZATION",
    description="Chat with a DeepSeek agent swarm (1.5B, 7B) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)",
)
if __name__ == "__main__":
    iface.launch()  # Only launch locally if running this script directly