Update app.py
app.py CHANGED
@@ -3,16 +3,16 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 import spaces  # Import the spaces library
 
-# Model IDs from Hugging Face Hub (Fixed to 7B and 32B Unsloth)
+# Model IDs from Hugging Face Hub (Fixed to Unsloth 7B and 32B Unsloth 4bit)
 model_ids = {
-    "7B": "…",
-    "32B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit",
+    "7B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit",  # Unsloth 7B model
+    "32B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit",  # Unsloth 32B model
 }
 
 models = {}  # Keep models as a dictionary, but initially empty
 tokenizers = {}  # Keep tokenizers as a dictionary, initially empty
 
-# BitsAndBytesConfig for 4-bit quantization (for …)
+# BitsAndBytesConfig for 4-bit quantization (for BOTH models now)
 bnb_config_4bit = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
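Note: the `bnb_config_4bit` block continues past the hunk boundary, so only its first lines are visible above. A minimal sketch of a complete 4-bit NF4 config of this kind follows; the compute-dtype and double-quantization lines are assumptions, since they fall outside the visible diff.

```python
# Hedged sketch of a full 4-bit NF4 quantization config; only the opener and
# the first two keyword arguments appear in the diff, the rest are assumptions.
import torch
from transformers import BitsAndBytesConfig

bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,                      # store linear-layer weights in 4 bits
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumption: dequantized matmuls run in bf16
    bnb_4bit_use_double_quant=True,         # assumption: also quantize the quantization constants
)
```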
@@ -25,21 +25,13 @@ def get_model_and_tokenizer(size): # Function to load model on demand
     model_id = model_ids[size]
     print(f"Loading {size} model: {model_id} on demand")
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    if size == "32B-Unsloth":  # 32B model - 4-bit quantized loading
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            quantization_config=bnb_config_4bit,
-            torch_dtype=torch.bfloat16,
-            device_map='auto',
-            trust_remote_code=True
-        )
-    else: # 7B model - standard loading
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch.bfloat16, # Or torch.float16 if needed
-            device_map='auto',
-            trust_remote_code=True
-        )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        quantization_config=bnb_config_4bit, # Apply 4-bit config for BOTH models
+        torch_dtype=torch.bfloat16, # Or torch.float16 if needed
+        device_map='auto',
+        trust_remote_code=True
+    )
     models[size] = model
     tokenizers[size] = tokenizer
     print(f"Loaded {size} model on demand.")
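This hunk collapses a 32B-only quantized branch and a 7B full-precision branch into a single loading path. A minimal sketch of the resulting lazy-load-and-cache pattern, building on `models`, `tokenizers`, `model_ids`, and `bnb_config_4bit` from earlier in the file; the `if size not in models` guard is an assumption, since the top of the function sits outside the visible hunk.

```python
# Hedged sketch of the unified lazy loader; the cache guard is an assumption.
def get_model_and_tokenizer(size):
    if size not in models:  # load each model at most once, on first use
        model_id = model_ids[size]
        tokenizers[size] = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        models[size] = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config_4bit,  # one 4-bit config for both sizes
            torch_dtype=torch.bfloat16,
            device_map="auto",  # let accelerate place layers on available devices
            trust_remote_code=True,
        )
    return models[size], tokenizers[size]
```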
@@ -101,7 +93,7 @@ def retrieve_from_memory(query, top_k=2):
     return relevant_memories[:top_k]
 
 
-# --- Swarm Agent Function - Fixed Models (7B and 32B Unsloth) ---
+# --- Swarm Agent Function - Fixed Models (Unsloth 7B and 32B Unsloth) ---
 @spaces.GPU # <---- GPU DECORATOR ADDED HERE!
 def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.5, top_p=0.9, max_new_tokens=300): # Removed final_model_size
     global shared_memory
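The `@spaces.GPU` decorator (unchanged context above) is what makes this run on a ZeroGPU Space: the GPU is attached only while a decorated call executes. A minimal usage sketch; the function name is illustrative and the `duration` argument is optional.

```python
# Hedged sketch of the ZeroGPU pattern: the Space only holds a GPU while a
# @spaces.GPU-decorated function is running.
import spaces

@spaces.GPU(duration=120)  # optional hint: up to ~120 s of GPU time per call
def run_on_gpu(prompt: str) -> str:
    # CUDA work (model.generate, .to("cuda"), ...) must happen inside this call
    return prompt.upper()  # placeholder body
```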
@@ -109,9 +101,9 @@ def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_temp
 
     print(f"\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED - Final Model: 32B Unsloth ---") # Updated message
 
-    # 7B Model - Brainstorming/Initial Draft (Lazy Load and get model)
-    print("\n[7B Model - Brainstorming] - GPU Accelerated") # Now 7B is brainstorming
-    model_7b, tokenizer_7b = get_model_and_tokenizer("7B") # Lazy load 7B
+    # 7B Unsloth Model - Brainstorming/Initial Draft (Lazy Load and get model)
+    print("\n[7B Unsloth Model - Brainstorming] - GPU Accelerated") # Now 7B Unsloth is brainstorming
+    model_7b, tokenizer_7b = get_model_and_tokenizer("7B-Unsloth") # Lazy load 7B Unsloth
     retrieved_memory_7b = retrieve_from_memory(user_prompt)
     context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."
 
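`store_in_memory` and `retrieve_from_memory` appear in this diff only through their call sites and the `return relevant_memories[:top_k]` tail above. A hedged sketch of a retriever consistent with that signature, using naive keyword overlap; the actual scoring in app.py may differ.

```python
# Hedged sketch matching the visible signature retrieve_from_memory(query, top_k=2);
# the keyword-overlap scoring rule itself is an assumption.
shared_memory = []

def store_in_memory(text):
    shared_memory.append(text)

def retrieve_from_memory(query, top_k=2):
    query_words = set(query.lower().split())
    # keep only memories sharing at least one word with the query, best match first
    relevant_memories = sorted(
        (mem for mem in shared_memory if query_words & set(mem.lower().split())),
        key=lambda mem: len(query_words & set(mem.lower().split())),
        reverse=True,
    )
    return relevant_memories[:top_k]
```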
@@ -127,8 +119,8 @@ def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_temp
         do_sample=True
     )
     response_7b = tokenizer_7b.decode(output_7b[0], skip_special_tokens=True)
-    print(f"7B Response (Brainstorming):\n{response_7b}") # Updated message
-    store_in_memory(f"7B Model Initial Response: {response_7b[:200]}...")
+    print(f"7B Unsloth Response (Brainstorming):\n{response_7b}") # Updated message
+    store_in_memory(f"7B Unsloth Model Initial Response: {response_7b[:200]}...")
 
     # 32B Unsloth Model - Final Code Generation (Lazy Load and get model)
     final_model, final_tokenizer = get_model_and_tokenizer("32B-Unsloth") # Lazy load 32B Unsloth
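The `do_sample=True` / decode tail above belongs to a `generate()` call whose head is outside the hunk. A sketch of what that call plausibly looks like, with the sampling knobs wired to the UI defaults; the `prompt_text` line stands in for the template-formatted prompt and is an assumption.

```python
# Hedged sketch of the sampling call whose tail appears in the hunk above.
inputs_7b = tokenizer_7b(prompt_text, return_tensors="pt").to(model_7b.device)
output_7b = model_7b.generate(
    **inputs_7b,
    max_new_tokens=300,  # "Max Tokens" default from the UI
    temperature=0.5,     # "Temperature" slider default
    top_p=0.9,           # "Top P" slider default
    do_sample=True,      # sample instead of greedy decoding
)
response_7b = tokenizer_7b.decode(output_7b[0], skip_special_tokens=True)
```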
@@ -178,11 +170,11 @@ iface = gr.ChatInterface( # Using ChatInterface now
         gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"), # Lowered default temp to 0.5
         gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
         gr.Number(value=300, label="Max Tokens", precision=0), # Use Number for integer tokens
-        gr.Textbox(value=default_prompt_1_5b, lines=10, label="Brainstorming Model Prompt Template (7B …"),
-        gr.Textbox(value=default_prompt_7b, lines=10, label="Code Generation Prompt Template (…"),
+        gr.Textbox(value=default_prompt_1_5b, lines=10, label="Brainstorming Model Prompt Template (Unsloth 7B)"), # Updated label - Unsloth 7B now brainstormer
+        gr.Textbox(value=default_prompt_7b, lines=10, label="Code Generation Prompt Template (Unsloth 32B)"), # Updated label - Unsloth 32B is code generator
     ],
-    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - Fixed Models: 7B + 32B …",
-    description="Chat with a DeepSeek agent swarm (7B + 32B …",
+    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - Fixed Models: Unsloth 7B + 32B)", # Updated title
+    description="Chat with a DeepSeek agent swarm (Unsloth 7B + 32B) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)", # Updated description
 )
 
 if __name__ == "__main__":
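For reference, extra controls passed to `gr.ChatInterface` via `additional_inputs` arrive as trailing arguments after `(message, history)`. A minimal runnable sketch of that wiring; `chat_fn` is a stub standing in for `swarm_agent_sequential_rag`, which this diff does not show in full.

```python
# Hedged sketch of the ChatInterface wiring; the handler body is a placeholder.
import gradio as gr

def chat_fn(message, history, temperature, top_p, max_tokens, prompt_7b, prompt_32b):
    return f"echo: {message}"  # stand-in for swarm_agent_sequential_rag(...)

iface = gr.ChatInterface(
    fn=chat_fn,
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0),
        gr.Textbox(lines=10, label="Brainstorming Model Prompt Template (Unsloth 7B)"),
        gr.Textbox(lines=10, label="Code Generation Prompt Template (Unsloth 32B)"),
    ],
)

if __name__ == "__main__":
    iface.launch()
```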