import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces # Import the spaces library
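# NOTE: `spaces` is the Hugging Face `spaces` package used on ZeroGPU Spaces;
# functions decorated with @spaces.GPU (see below) only get a GPU while they run.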
# Model IDs from the Hugging Face Hub (1.5B and 7B DeepSeek-R1 distills)
model_ids = {
"1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
"7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
}
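# Roles in the swarm: the 1.5B model produces a quick analysis/draft, which the 7B
# model then elaborates into a full solution (see the prompt templates below).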
# Default prompt templates (editable in the UI)
default_prompt_1_5b = """**Code Analysis Task**
As a Senior Code Analyst, analyze this programming problem:
**User Request:**
{user_prompt}
**Relevant Context:**
{context_1_5b}
**Analysis Required:**
1. Briefly break down the problem, including key constraints and edge cases.
2. Suggest 2-3 potential approach options (algorithms/data structures).
3. Recommend a primary strategy and explain your reasoning concisely.
4. Provide a very brief initial pseudocode sketch of the core logic."""
default_prompt_7b = """**Code Implementation Task**
As a Principal Software Engineer, develop a solution based on this analysis:
**Initial Analysis:**
{response_1_5b}
**Relevant Context:**
{context_7b}
**Solution Development Requirements:**
1. Present an optimized solution approach, justifying your algorithm choices.
2. Provide production-grade code in [Python/JS/etc.] (infer language). Include error handling and comments.
3. Outline a testing plan with key test cases.
4. Briefly suggest optimization opportunities and debugging tips."""
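# The templates above are filled with str.format(), so edited templates must keep the
# exact placeholder names ({user_prompt}, {context_1_5b}, {response_1_5b}, {context_7b})
# and escape any literal braces as {{ }}.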
# Function to load a model and its tokenizer
def load_model_and_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,  # Or torch.float16 if you prefer
        device_map='auto',  # Let accelerate decide (will use GPU when @spaces.GPU active)
        trust_remote_code=True
    )
    return model, tokenizer
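# Rough memory estimate (an assumption, not measured): ~8.5B parameters total at
# 2 bytes each in bfloat16 is on the order of 17 GB of weights, so the Space needs
# enough CPU RAM / GPU memory to hold both models at once.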
# Load the selected models and tokenizers eagerly at startup
models = {}
tokenizers = {}
for size, model_id in model_ids.items():
print(f"Loading {size} model: {model_id}")
models[size], tokenizers[size] = load_model_and_tokenizer(model_id)
print(f"Loaded {size} model.")
# --- Shared Memory Implementation ---
shared_memory = []
def store_in_memory(memory_item):
    shared_memory.append(memory_item)
    print(f"\n[Memory Stored]: {memory_item[:50]}...")
def retrieve_from_memory(query, top_k=2):
    relevant_memories = []
    query_lower = query.lower()
    for memory_item in shared_memory:
        if query_lower in memory_item.lower():
            relevant_memories.append(memory_item)
    if not relevant_memories:
        print("\n[Memory Retrieval]: No relevant memories found.")
        return []
    print(f"\n[Memory Retrieval]: Found {len(relevant_memories)} relevant memories.")
    return relevant_memories[:top_k]
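# Note: retrieval here is a naive case-insensitive substring match (the whole query must
# appear inside a stored item), standing in for embedding-based retrieval in a real RAG
# setup; with long queries such as a full model response it will rarely find a match.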
# --- Swarm Agent Function with Shared Memory (RAG) ---
@spaces.GPU  # Run this function on a GPU when ZeroGPU allocates one
def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.5, top_p=0.9, max_new_tokens=300):
    global shared_memory
    shared_memory = []  # Clear shared memory for each new request

    print("\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED ---")

    # 1.5B Model - Brainstorming / initial draft
    print("\n[1.5B Model - Brainstorming] - GPU Accelerated")
    retrieved_memory_1_5b = retrieve_from_memory(user_prompt)
    context_1_5b = "\n".join([f"- {mem}" for mem in retrieved_memory_1_5b]) if retrieved_memory_1_5b else "No relevant context found in memory."

    # Fill the user-provided prompt template for the 1.5B model
    prompt_1_5b = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_1_5b)
    input_ids_1_5b = tokenizers["1.5B"].encode(prompt_1_5b, return_tensors="pt").to(models["1.5B"].device)
    output_1_5b = models["1.5B"].generate(
        input_ids_1_5b,
        max_new_tokens=max_new_tokens,  # User-defined max_new_tokens
        temperature=temperature,        # User-defined temperature
        top_p=top_p,                    # User-defined top_p
        do_sample=True
    )
    # Note: decoding output[0] returns the prompt plus the continuation, so the stored
    # "response" also contains the prompt text.
    response_1_5b = tokenizers["1.5B"].decode(output_1_5b[0], skip_special_tokens=True)
    print(f"1.5B Response:\n{response_1_5b}")
    store_in_memory(f"1.5B Model Initial Response: {response_1_5b[:200]}...")

    # 7B Model - Elaboration and detail
    print("\n[7B Model - Elaboration] - GPU Accelerated")
    retrieved_memory_7b = retrieve_from_memory(response_1_5b)
    context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."

    # Fill the user-provided prompt template for the 7B model
    prompt_7b = prompt_7b_template.format(response_1_5b=response_1_5b, context_7b=context_7b)
    input_ids_7b = tokenizers["7B"].encode(prompt_7b, return_tensors="pt").to(models["7B"].device)
    output_7b = models["7B"].generate(
        input_ids_7b,
        max_new_tokens=max_new_tokens + 100,  # Slightly more tokens for the 7B elaboration
        temperature=temperature,
        top_p=top_p,
        do_sample=True
    )
    response_7b = tokenizers["7B"].decode(output_7b[0], skip_special_tokens=True)
    print(f"7B Response:\n{response_7b}")
    store_in_memory(f"7B Model Elaborated Response: {response_7b[:200]}...")

    return response_7b  # The 7B model's response is the final answer
# --- Gradio ChatInterface ---
def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text): # Accept prompt textboxes
    # history is automatically managed by ChatInterface
    response = swarm_agent_sequential_rag(
        message,
        prompt_1_5b_template=prompt_1_5b_text,  # Pass prompt templates
        prompt_7b_template=prompt_7b_text,
        temperature=temp,
        top_p=top_p,
        max_new_tokens=int(max_tokens)  # Ensure max_tokens is an integer
    )
    return response
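# Note: gr.ChatInterface passes each additional_inputs value positionally after
# (message, history), so the order of the components below must match the extra
# parameters of gradio_interface (temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text).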
iface = gr.ChatInterface(
    fn=gradio_interface,
    # Additional inputs for generation settings and prompt templates
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0),  # Integer token count
        gr.Textbox(value=default_prompt_1_5b, lines=10, label="1.5B Model Prompt Template"),
        gr.Textbox(value=default_prompt_7b, lines=10, label="7B Model Prompt Template"),
    ],
    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - 2 Models) - PROMPT CUSTOMIZATION",
    description="Chat with a DeepSeek agent swarm (1.5B, 7B) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)",
)
if __name__ == "__main__":
    iface.launch()  # Only launch locally when running this script directly