import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces  # Import the spaces library

# Model IDs from Hugging Face Hub (now only 1.5B and 7B)
model_ids = {
    "1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
}

# Function to load model and tokenizer (slightly adjusted device_map)
def load_model_and_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16, # Or torch.float16 if you prefer
        device_map='auto', # Let accelerate decide (will use GPU when @spaces.GPU active)
        trust_remote_code=True
    )
    return model, tokenizer

# Load the selected models and tokenizers
models = {}
tokenizers = {}
for size, model_id in model_ids.items():
    print(f"Loading {size} model: {model_id}")
    models[size], tokenizers[size] = load_model_and_tokenizer(model_id)
    print(f"Loaded {size} model.")

# --- Shared Memory Implementation --- (Same as before)
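# A plain in-process Python list acts as the swarm's scratchpad: each agent's output is
# appended here so later agents can retrieve it. It is reset at the start of every request.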
shared_memory = []

def store_in_memory(memory_item):
    shared_memory.append(memory_item)
    print(f"\n[Memory Stored]: {memory_item[:50]}...")

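# Naive keyword retrieval: return up to top_k stored items that contain the query as a
# case-insensitive substring. This stands in for embedding-based retrieval in a full RAG setup.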
def retrieve_from_memory(query, top_k=2):
    relevant_memories = []
    query_lower = query.lower()
    for memory_item in shared_memory:
        if query_lower in memory_item.lower():
            relevant_memories.append(memory_item)

    if not relevant_memories:
        print("\n[Memory Retrieval]: No relevant memories found.")
        return []

    print(f"\n[Memory Retrieval]: Found {len(relevant_memories)} relevant memories.")
    return relevant_memories[:top_k]


# --- Swarm Agent Function with Shared Memory (RAG) - DECORATED with @spaces.GPU ---
@spaces.GPU  # ZeroGPU decorator: a GPU is attached for the duration of each call to this function
def swarm_agent_sequential_rag(user_prompt, temperature=0.7, top_p=0.9, max_new_tokens=300): # Added settings as arguments
    global shared_memory
    shared_memory = [] # Clear memory for each new request

    print("\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED ---") # Updated message

    # 1.5B Model - Brainstorming/Initial Draft
    print("\n[1.5B Model - Brainstorming] - GPU Accelerated") # Added GPU indication
    retrieved_memory_1_5b = retrieve_from_memory(user_prompt)
    context_1_5b = "\n".join([f"- {mem}" for mem in retrieved_memory_1_5b]) if retrieved_memory_1_5b else "No relevant context found in memory."
    prompt_1_5b = f"Context from Shared Memory:\n{context_1_5b}\n\nYou are a quick idea generator. Generate an initial response to the following user request, considering the context above:\n\nUser Request: {user_prompt}\n\nInitial Response:"
    input_ids_1_5b = tokenizers["1.5B"].encode(prompt_1_5b, return_tensors="pt").to(models["1.5B"].device)
    output_1_5b = models["1.5B"].generate(
        input_ids_1_5b,
        max_new_tokens=max_new_tokens, # Use user-defined max_new_tokens
        temperature=temperature, # Use user-defined temperature
        top_p=top_p,           # Use user-defined top_p
        do_sample=True
    )
    # Decode only the newly generated tokens (generate() returns the prompt followed by the completion)
    response_1_5b = tokenizers["1.5B"].decode(output_1_5b[0][input_ids_1_5b.shape[-1]:], skip_special_tokens=True)
    print(f"1.5B Response:\n{response_1_5b}")
    store_in_memory(f"1.5B Model Initial Response: {response_1_5b[:200]}...")

    # 7B Model - Elaboration and Detail
    print("\n[7B Model - Elaboration] - GPU Accelerated") # Added GPU indication
    retrieved_memory_7b = retrieve_from_memory(response_1_5b)
    context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."
    prompt_7b = f"Context from Shared Memory:\n{context_7b}\n\nYou are a detailed elaborator. Take the following initial response and elaborate on it, adding more detail and reasoning, considering the context above. \n\nInitial Response:\n{response_1_5b}\n\nElaborated Response:"
    input_ids_7b = tokenizers["7B"].encode(prompt_7b, return_tensors="pt").to(models["7B"].device)
    output_7b = models["7B"].generate(
        input_ids_7b,
        max_new_tokens=max_new_tokens + 100, # Slightly more tokens for 7B
        temperature=temperature, # Use user-defined temperature
        top_p=top_p,           # Use user-defined top_p
        do_sample=True
    )
    # As above, strip the prompt tokens so only the 7B model's elaboration is returned
    response_7b = tokenizers["7B"].decode(output_7b[0][input_ids_7b.shape[-1]:], skip_special_tokens=True)
    print(f"7B Response:\n{response_7b}")
    store_in_memory(f"7B Model Elaborated Response: {response_7b[:200]}...")

    return response_7b # Now returns the 7B model's response as final


# --- Gradio ChatInterface ---
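# ChatInterface calls fn(message, history, *additional_inputs), so the slider and number
# components defined below arrive as the temperature, top_p and max_tokens arguments.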
def gradio_interface(message, history, temperature, top_p, max_tokens): # Accept settings from interface
    # history is automatically managed by ChatInterface
    response = swarm_agent_sequential_rag(
        message,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=int(max_tokens) # Ensure max_tokens is an integer
    )
    return response

iface = gr.ChatInterface( # Using ChatInterface now
    fn=gradio_interface,
    # Define additional inputs for settings
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.7, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0), # Use Number for integer tokens
    ],
    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - 2 Models)", # Updated title
    description="Chat with a DeepSeek agent swarm (1.5B, 7B) with shared memory and adjustable settings. **GPU accelerated using ZeroGPU!** (Requires Pro Space)", # Updated description
)

if __name__ == "__main__":
    iface.launch() # Only launch locally if running this script directly
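
# Note: the `spaces` import and @spaces.GPU decorator are specific to Hugging Face ZeroGPU
# Spaces; to run this script elsewhere, drop them and rely on device_map="auto" for placement.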