import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import spaces # Import the spaces library
# Model IDs from Hugging Face Hub (fixed pair: Unsloth 4-bit builds of the 7B and 32B distills)
model_ids = {
    "7B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit",  # Unsloth 7B model
    "32B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit",  # Unsloth 32B model
}
models = {}      # Lazily populated cache: size -> loaded model
tokenizers = {}  # Lazily populated cache: size -> tokenizer
# BitsAndBytesConfig for 4-bit quantization (applied to BOTH models)
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # Or torch.float16 if needed
)
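# Optional sketch (assumption, not used by default): on GPUs without bfloat16 support
# (e.g. pre-Ampere cards), a float16 variant of the same config could be swapped into
# from_pretrained() below.
# bnb_config_4bit_fp16 = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )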
def get_model_and_tokenizer(size):  # Function to load model on demand
    if size not in models:  # Load only if not already loaded
        model_id = model_ids[size]
        print(f"Loading {size} model: {model_id} on demand")
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config_4bit,  # Apply 4-bit config for BOTH models
            torch_dtype=torch.bfloat16,  # Or torch.float16 if needed
            device_map='auto',
            trust_remote_code=True
        )
        models[size] = model
        tokenizers[size] = tokenizer
        print(f"Loaded {size} model on demand.")
    return models[size], tokenizers[size]
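# Illustrative usage (not executed at import time): the first call downloads and quantizes
# the checkpoint, later calls for the same size return the cached pair.
# model_7b, tokenizer_7b = get_model_and_tokenizer("7B-Unsloth")
# model_7b_again, _ = get_model_and_tokenizer("7B-Unsloth")  # cache hit, no reload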
# Default prompt templates (the {user_prompt}/{context_1_5b}/{response_1_5b}/{context_7b}
# placeholders are filled via str.format in swarm_agent_sequential_rag below)
default_prompt_1_5b = """**Code Analysis Task**
As a Senior Code Analyst, analyze this programming problem:
**User Request:**
{user_prompt}
**Relevant Context:**
{context_1_5b}
**Analysis Required:**
1. Briefly break down the problem, including key constraints and edge cases.
2. Suggest 2-3 potential approach options (algorithms/data structures).
3. Recommend ONE primary strategy and briefly justify your choice.
4. Provide a very brief initial pseudocode sketch of the core logic."""
default_prompt_7b = """**Code Implementation Task**
As a Principal Software Engineer, provide production-ready Streamlit/Python code based on this analysis:
**Initial Analysis:**
{response_1_5b}
**Relevant Context:**
{context_7b}
**Code Requirements:**
1. Generate concise, production-grade Python code for a Streamlit app.
2. Include necessary imports, UI elements, and basic functionality.
3. Add comments for clarity.
"""
# --- Shared Memory Implementation ---
shared_memory = []

def store_in_memory(memory_item):
    shared_memory.append(memory_item)
    print(f"\n[Memory Stored]: {memory_item[:50]}...")
def retrieve_from_memory(query, top_k=2):
    # Naive keyword retrieval: return stored items whose text contains the query (case-insensitive)
    relevant_memories = []
    query_lower = query.lower()
    for memory_item in shared_memory:
        if query_lower in memory_item.lower():
            relevant_memories.append(memory_item)
    if not relevant_memories:
        print("\n[Memory Retrieval]: No relevant memories found.")
        return []
    print(f"\n[Memory Retrieval]: Found {len(relevant_memories)} relevant memories.")
    return relevant_memories[:top_k]
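# Illustrative behaviour of the substring match above (made-up strings, not executed here):
# store_in_memory("7B Unsloth Model Initial Response: Use a stack to track open brackets...")
# retrieve_from_memory("stack")     # -> 1 hit: the stored item contains "stack"
# retrieve_from_memory("hash map")  # -> [] (no stored item contains the full query string)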
# --- Swarm Agent Function - Fixed Models (Unsloth 7B and 32B Unsloth) ---
@spaces.GPU  # Request a ZeroGPU device for the duration of this call
def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.5, top_p=0.9, max_new_tokens=300):
    global shared_memory
    shared_memory = []  # Clear memory for each new request
    print("\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED - Final Model: 32B Unsloth ---")

    # Stage 1: 7B Unsloth model - brainstorming / initial draft (lazy load)
    print("\n[7B Unsloth Model - Brainstorming] - GPU Accelerated")
    model_7b, tokenizer_7b = get_model_and_tokenizer("7B-Unsloth")
    retrieved_memory_7b = retrieve_from_memory(user_prompt)
    context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."

    # Fill the user-editable brainstorming template (placeholder names kept from the earlier 1.5B setup)
    prompt_7b_brainstorm = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_7b)
    input_ids_7b = tokenizer_7b.encode(prompt_7b_brainstorm, return_tensors="pt").to(model_7b.device)
    output_7b = model_7b.generate(
        input_ids_7b,
        max_new_tokens=max_new_tokens,  # User-defined generation budget
        temperature=temperature,        # User-defined temperature
        top_p=top_p,                    # User-defined nucleus sampling
        do_sample=True
    )
    response_7b = tokenizer_7b.decode(output_7b[0], skip_special_tokens=True)
    print(f"7B Unsloth Response (Brainstorming):\n{response_7b}")
    store_in_memory(f"7B Unsloth Model Initial Response: {response_7b[:200]}...")

    # Stage 2: 32B Unsloth model - final code generation (lazy load)
    final_model, final_tokenizer = get_model_and_tokenizer("32B-Unsloth")
    print("\n[32B Unsloth Model - Final Code Generation] - GPU Accelerated")
    model_stage_name = "32B Unsloth Model - Final Code"
    final_max_new_tokens = max_new_tokens + 200  # Give the 32B model a larger generation budget

    retrieved_memory_final = retrieve_from_memory(response_7b)  # Retrieve against the 7B brainstorm
    context_final = "\n".join([f"- {mem}" for mem in retrieved_memory_final]) if retrieved_memory_final else "No relevant context found in memory."

    # Fill the user-editable code-generation template with the brainstorm output
    prompt_final = prompt_7b_template.format(response_1_5b=response_7b, context_7b=context_final)
    input_ids_final = final_tokenizer.encode(prompt_final, return_tensors="pt").to(final_model.device)
    output_final = final_model.generate(
        input_ids_final,
        max_new_tokens=final_max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True
    )
    response_final = final_tokenizer.decode(output_final[0], skip_special_tokens=True)
    print(f"{model_stage_name} Response:\n{response_final}")
    store_in_memory(f"{model_stage_name} Response: {response_final[:200]}...")

    return response_final  # Return the final model's response
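# Illustrative direct call (left commented out; it would trigger model loading and GPU allocation):
# final_code = swarm_agent_sequential_rag(
#     "Write a Streamlit app that plots a CSV file",
#     default_prompt_1_5b,
#     default_prompt_7b,
#     temperature=0.5,
#     top_p=0.9,
#     max_new_tokens=300,
# )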
# --- Gradio ChatInterface (no model-selection dropdown; the model pair is fixed) ---
def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text):
    # history is managed automatically by ChatInterface
    response = swarm_agent_sequential_rag(
        message,
        prompt_1_5b_template=prompt_1_5b_text,  # Pass the user-editable prompt templates through
        prompt_7b_template=prompt_7b_text,
        temperature=temp,
        top_p=top_p,
        max_new_tokens=int(max_tokens)  # Ensure max_tokens is an integer
    )
    return response
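# Illustrative call (not executed at import time): the positional order of the extra arguments
# matches the order of `additional_inputs` in the ChatInterface below.
# gradio_interface("Build a BMI calculator app", [], 0.5, 0.9, 300, default_prompt_1_5b, default_prompt_7b)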
iface = gr.ChatInterface(
    fn=gradio_interface,
    # Additional inputs for generation settings and prompt templates (no model dropdown)
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0),  # Use Number for integer tokens
        gr.Textbox(value=default_prompt_1_5b, lines=10, label="Brainstorming Model Prompt Template (Unsloth 7B)"),
        gr.Textbox(value=default_prompt_7b, lines=10, label="Code Generation Prompt Template (Unsloth 32B)"),
    ],
    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - Fixed Models: Unsloth 7B + 32B)",
    description="Chat with a DeepSeek agent swarm (Unsloth 7B + 32B) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)",
)
if __name__ == "__main__":
    iface.launch()  # Only launch locally if running this script directly