import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces # Import the spaces library
# Model IDs from Hugging Face Hub (now 1.5B, 7B, and 14B)
model_ids = {
"1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
"14B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", # Added 14B back
}
# Default prompt templates for the 1.5B brainstorming stage and the 7B/14B final code stage
default_prompt_1_5b = """**Code Analysis Task**
As a Senior Code Analyst, analyze this programming problem:
**User Request:**
{user_prompt}
**Relevant Context:**
{context_1_5b}
**Analysis Required:**
1. Briefly break down the problem, including key constraints and edge cases.
2. Suggest 2-3 potential approach options (algorithms/data structures).
3. Recommend ONE primary strategy and briefly justify your choice.
4. Provide a very brief initial pseudocode sketch of the core logic."""
default_prompt_7b = """**Code Implementation Task**
As a Principal Software Engineer, provide production-ready Streamlit/Python code based on this analysis:
**Initial Analysis:**
{response_1_5b}
**Relevant Context:**
{context_7b}
**Code Requirements:**
1. Generate concise, production-grade Python code for a Streamlit app.
2. Include necessary imports, UI elements, and basic functionality.
3. Add comments for clarity.
"""
# Function to load a model and tokenizer for a given model ID
def load_model_and_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,  # Or torch.float16 if you prefer
        device_map='auto',  # Let accelerate decide (will use GPU when @spaces.GPU is active)
        trust_remote_code=True
    )
    return model, tokenizer
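# Note: keeping the 1.5B, 7B, and 14B checkpoints resident in bfloat16 at the same time is
# memory-hungry. A minimal sketch (an assumption, not part of this app) of loading a model
# in 4-bit with bitsandbytes instead, should memory become a constraint:
#
#   from transformers import BitsAndBytesConfig
#   quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
#   model = AutoModelForCausalLM.from_pretrained(
#       model_id, quantization_config=quant_config, device_map='auto', trust_remote_code=True
#   )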
# Load the selected models and tokenizers (now loads 1.5B, 7B, 14B)
models = {}
tokenizers = {}
for size, model_id in model_ids.items():
print(f"Loading {size} model: {model_id}")
models[size], tokenizers[size] = load_model_and_tokenizer(model_id)
print(f"Loaded {size} model.")
# --- Shared Memory Implementation ---
shared_memory = []
def store_in_memory(memory_item):
    shared_memory.append(memory_item)
    print(f"\n[Memory Stored]: {memory_item[:50]}...")
def retrieve_from_memory(query, top_k=2):
    relevant_memories = []
    query_lower = query.lower()
    for memory_item in shared_memory:
        if query_lower in memory_item.lower():
            relevant_memories.append(memory_item)
    if not relevant_memories:
        print("\n[Memory Retrieval]: No relevant memories found.")
        return []
    print(f"\n[Memory Retrieval]: Found {len(relevant_memories)} relevant memories.")
    return relevant_memories[:top_k]
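# Note: retrieval above is plain case-insensitive substring matching, not semantic search.
# A hypothetical sketch (not part of this app) of an embedding-based alternative using
# sentence-transformers, if real RAG behaviour is wanted:
#
#   from sentence_transformers import SentenceTransformer, util
#   embedder = SentenceTransformer("all-MiniLM-L6-v2")
#   def retrieve_semantic(query, top_k=2):
#       if not shared_memory:
#           return []
#       scores = util.cos_sim(embedder.encode(query), embedder.encode(shared_memory))[0]
#       ranked = sorted(zip(shared_memory, scores.tolist()), key=lambda pair: pair[1], reverse=True)
#       return [mem for mem, _ in ranked[:top_k]]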
# --- Swarm Agent Function with Model Swapping ---
@spaces.GPU  # Request a ZeroGPU device for the duration of this call
def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, final_model_size="7B", temperature=0.5, top_p=0.9, max_new_tokens=300):
    global shared_memory
    shared_memory = []  # Clear memory for each new request
    print(f"\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED - Final Model: {final_model_size} ---")
    # 1.5B Model - Brainstorming/Initial Draft
    print("\n[1.5B Model - Brainstorming] - GPU Accelerated")
    retrieved_memory_1_5b = retrieve_from_memory(user_prompt)
    context_1_5b = "\n".join([f"- {mem}" for mem in retrieved_memory_1_5b]) if retrieved_memory_1_5b else "No relevant context found in memory."
    # Use user-provided prompt template for 1.5B model
    prompt_1_5b = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_1_5b)
    input_ids_1_5b = tokenizers["1.5B"].encode(prompt_1_5b, return_tensors="pt").to(models["1.5B"].device)
    output_1_5b = models["1.5B"].generate(
        input_ids_1_5b,
        max_new_tokens=max_new_tokens,  # Use user-defined max_new_tokens
        temperature=temperature,  # Use user-defined temperature
        top_p=top_p,  # Use user-defined top_p
        do_sample=True
    )
    response_1_5b = tokenizers["1.5B"].decode(output_1_5b[0], skip_special_tokens=True)
    print(f"1.5B Response:\n{response_1_5b}")
    store_in_memory(f"1.5B Model Initial Response: {response_1_5b[:200]}...")
    # Final Stage Model Selection (7B or 14B)
    if final_model_size == "7B":
        final_model = models["7B"]
        final_tokenizer = tokenizers["7B"]
        print("\n[7B Model - Final Code Generation] - GPU Accelerated")
        model_stage_name = "7B Model - Final Code"
        final_max_new_tokens = max_new_tokens + 100  # Slightly more tokens for 7B
    elif final_model_size == "14B":
        final_model = models["14B"]
        final_tokenizer = tokenizers["14B"]
        print("\n[14B Model - Final Code Generation] - GPU Accelerated")
        model_stage_name = "14B Model - Final Code"
        final_max_new_tokens = max_new_tokens + 200  # Even more tokens for 14B
    else:  # Default to 7B if the selection is somehow invalid
        final_model = models["7B"]
        final_tokenizer = tokenizers["7B"]
        print("\n[7B Model - Final Code Generation] - GPU Accelerated (Default)")
        model_stage_name = "7B Model - Final Code (Default)"
        final_max_new_tokens = max_new_tokens + 100
    retrieved_memory_final = retrieve_from_memory(response_1_5b)
    context_final = "\n".join([f"- {mem}" for mem in retrieved_memory_final]) if retrieved_memory_final else "No relevant context found in memory."
    # Use the user-provided prompt template for the final model (the 7B prompt is reused for
    # both 7B and 14B for simplicity; a separate 14B prompt could be added if needed)
    prompt_final = prompt_7b_template.format(response_1_5b=response_1_5b, context_7b=context_final)
    input_ids_final = final_tokenizer.encode(prompt_final, return_tensors="pt").to(final_model.device)
    output_final = final_model.generate(
        input_ids_final,
        max_new_tokens=final_max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True
    )
    response_final = final_tokenizer.decode(output_final[0], skip_special_tokens=True)
    print(f"{model_stage_name} Response:\n{response_final}")
    store_in_memory(f"{model_stage_name} Response: {response_final[:200]}...")
    return response_final  # Return the final model's response
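# Example (illustrative only, not part of the original app): calling the swarm pipeline
# directly without the Gradio UI, e.g. from a notebook or a test script:
#
#   answer = swarm_agent_sequential_rag(
#       "Build a Streamlit app that plots a CSV file",
#       prompt_1_5b_template=default_prompt_1_5b,
#       prompt_7b_template=default_prompt_7b,
#       final_model_size="7B",
#       temperature=0.5,
#       top_p=0.9,
#       max_new_tokens=300,
#   )
#   print(answer)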
# --- Gradio ChatInterface --- (with Model Selection Dropdown)
def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text, final_model_selector):
    # history is automatically managed by ChatInterface
    response = swarm_agent_sequential_rag(
        message,
        prompt_1_5b_template=prompt_1_5b_text,  # Pass prompt templates
        prompt_7b_template=prompt_7b_text,
        final_model_size=final_model_selector,  # Pass model selection
        temperature=temp,
        top_p=top_p,
        max_new_tokens=int(max_tokens)  # Ensure max_tokens is an integer
    )
    return response
iface = gr.ChatInterface(
    fn=gradio_interface,
    # Define additional inputs for settings, prompts, and model selection
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0),  # Use Number for integer tokens
        gr.Textbox(value=default_prompt_1_5b, lines=10, label="1.5B Model Prompt Template"),
        gr.Textbox(value=default_prompt_7b, lines=10, label="7B Model Prompt Template"),
        gr.Dropdown(choices=["7B", "14B"], value="7B", label="Final Stage Model (7B or 14B)")  # Model selection dropdown
    ],
    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - 2 Models + Model Swap)",
    description="Chat with a DeepSeek agent swarm (1.5B + 7B/14B selectable) with shared memory, adjustable settings, **customizable prompts, and model swapping!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)",
)
if __name__ == "__main__":
    iface.launch()  # Only launch locally if running this script directly