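"""DeepSeek agent swarm demo for Hugging Face Spaces (ZeroGPU).

Two distilled DeepSeek-R1 models run sequentially: the 1.5B model drafts an
analysis of the user's request, the draft is stored in a simple in-memory
list, and the 7B model elaborates it into a full solution. Both stages pull
"relevant context" from that shared memory via naive substring retrieval, and
the pipeline is exposed through a Gradio ChatInterface with adjustable
sampling settings and editable prompt templates.
"""
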
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces  # Import the spaces library

# Model IDs from the Hugging Face Hub (1.5B and 7B distills)
model_ids = {
    "1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
}

# Default prompt templates for each stage
default_prompt_1_5b = """**Code Analysis Task**
As a Senior Code Analyst, analyze this programming problem:

**User Request:**
{user_prompt}

**Relevant Context:**
{context_1_5b}

**Analysis Required:**
1. Briefly break down the problem, including key constraints and edge cases.
2. Suggest 2-3 potential approach options (algorithms/data structures).
3. Recommend a primary strategy and explain your reasoning concisely.
4. Provide a very brief initial pseudocode sketch of the core logic."""


default_prompt_7b = """**Code Implementation Task**
As a Principal Software Engineer, develop a solution based on this analysis:

**Initial Analysis:**
{response_1_5b}

**Relevant Context:**
{context_7b}

**Solution Development Requirements:**
1. Present an optimized solution approach, justifying your algorithm choices.
2. Provide production-grade code in [Python/JS/etc.] (infer language). Include error handling and comments.
3. Outline a testing plan with key test cases.
4. Briefly suggest optimization opportunities and debugging tips."""


# Load a model and its tokenizer from the Hub
def load_model_and_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16, # Or torch.float16 if you prefer
        device_map='auto', # Let accelerate decide (will use GPU when @spaces.GPU active)
        trust_remote_code=True
    )
    return model, tokenizer

# Load both models and tokenizers at startup
models = {}
tokenizers = {}
for size, model_id in model_ids.items():
    print(f"Loading {size} model: {model_id}")
    models[size], tokenizers[size] = load_model_and_tokenizer(model_id)
    print(f"Loaded {size} model.")

# --- Shared Memory Implementation ---
shared_memory = []

def store_in_memory(memory_item):
    shared_memory.append(memory_item)
    print(f"\n[Memory Stored]: {memory_item[:50]}...")

def retrieve_from_memory(query, top_k=2):
    relevant_memories = []
    query_lower = query.lower()
    for memory_item in shared_memory:
        if query_lower in memory_item.lower():
            relevant_memories.append(memory_item)

    if not relevant_memories:
        print("\n[Memory Retrieval]: No relevant memories found.")
        return []

    print(f"\n[Memory Retrieval]: Found {len(relevant_memories)} relevant memories.")
    return relevant_memories[:top_k]
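
# Illustrative example (hypothetical memory contents):
#   shared_memory = ["1.5B Model Initial Response: try a two-pointer scan ..."]
#   retrieve_from_memory("two-pointer")  ->  ["1.5B Model Initial Response: try a two-pointer scan ..."]
# Matching is a plain case-insensitive substring test, so a long query (e.g. the
# full user prompt) will rarely hit; top_k only truncates the resulting list.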


# --- Swarm Agent Function with Shared Memory (RAG), GPU-accelerated via ZeroGPU ---
@spaces.GPU  # Attach a GPU for the duration of this call on ZeroGPU Spaces
def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.5, top_p=0.9, max_new_tokens=300):
    global shared_memory
    shared_memory = [] # Clear memory for each new request

    print("\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED ---")

    # 1.5B Model - Brainstorming/Initial Draft
    print("\n[1.5B Model - Brainstorming] - GPU Accelerated")
    retrieved_memory_1_5b = retrieve_from_memory(user_prompt)
    context_1_5b = "\n".join([f"- {mem}" for mem in retrieved_memory_1_5b]) if retrieved_memory_1_5b else "No relevant context found in memory."

    # Use user-provided prompt template for 1.5B model
    prompt_1_5b = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_1_5b)

    input_ids_1_5b = tokenizers["1.5B"].encode(prompt_1_5b, return_tensors="pt").to(models["1.5B"].device)
    output_1_5b = models["1.5B"].generate(
        input_ids_1_5b,
        max_new_tokens=max_new_tokens, # Use user-defined max_new_tokens
        temperature=temperature, # Use user-defined temperature
        top_p=top_p,           # Use user-defined top_p
        do_sample=True
    )
    response_1_5b = tokenizers["1.5B"].decode(output_1_5b[0], skip_special_tokens=True)
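    # Note: decode() returns the prompt plus the new tokens, since generate()
    # outputs the full sequence; the 7B prompt and the memory entry below
    # therefore also contain the 1.5B prompt text.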
    print(f"1.5B Response:\n{response_1_5b}")
    store_in_memory(f"1.5B Model Initial Response: {response_1_5b[:200]}...")

    # 7B Model - Elaboration and Detail
    print("\n[7B Model - Elaboration] - GPU Accelerated")
    retrieved_memory_7b = retrieve_from_memory(response_1_5b)
    context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."

    # Use user-provided prompt template for 7B model
    prompt_7b = prompt_7b_template.format(response_1_5b=response_1_5b, context_7b=context_7b)


    input_ids_7b = tokenizers["7B"].encode(prompt_7b, return_tensors="pt").to(models["7B"].device)
    output_7b = models["7B"].generate(
        input_ids_7b,
        max_new_tokens=max_new_tokens + 100, # Slightly more tokens for 7B
        temperature=temperature, # Use user-defined temperature
        top_p=top_p,           # Use user-defined top_p
        do_sample=True
    )
    response_7b = tokenizers["7B"].decode(output_7b[0], skip_special_tokens=True)
    print(f"7B Response:\n{response_7b}")
    store_in_memory(f"7B Model Elaborated Response: {response_7b[:200]}...")

    return response_7b  # The 7B model's output is the final response


# --- Gradio ChatInterface ---
def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text): # Accept prompt textboxes
    # history is automatically managed by ChatInterface
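    # The parameters after (message, history) are filled, in order, from the
    # additional_inputs list defined on the ChatInterface below.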
    response = swarm_agent_sequential_rag(
        message,
        prompt_1_5b_template=prompt_1_5b_text, # Pass prompt templates
        prompt_7b_template=prompt_7b_text,
        temperature=temp,
        top_p=top_p,
        max_new_tokens=int(max_tokens) # Ensure max_tokens is an integer
    )
    return response

iface = gr.ChatInterface(
    fn=gradio_interface,
    # Define additional inputs for settings and prompts
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0), # Use Number for integer tokens
        gr.Textbox(value=default_prompt_1_5b, lines=10, label="1.5B Model Prompt Template"), # Textbox for 1.5B prompt
        gr.Textbox(value=default_prompt_7b, lines=10, label="7B Model Prompt Template"),   # Textbox for 7B prompt
    ],
    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - 2 Models) - PROMPT CUSTOMIZATION",
    description="Chat with a DeepSeek agent swarm (1.5B, 7B) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)",
)

if __name__ == "__main__":
    iface.launch() # Only launch locally if running this script directly