Update app.py
app.py CHANGED
@@ -3,16 +3,16 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 import spaces  # Import the spaces library
 
-# Model IDs from Hugging Face Hub (Fixed to 7B and 32B Unsloth)
+# Model IDs from Hugging Face Hub (Fixed to Unsloth 7B and 32B Unsloth 4bit)
 model_ids = {
-    "7B": "…",
-    "32B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit",
+    "7B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit",  # Unsloth 7B model
+    "32B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit",  # Unsloth 32B model
 }
 
 models = {}  # Keep models as a dictionary, but initially empty
 tokenizers = {}  # Keep tokenizers as a dictionary, initially empty
 
-# BitsAndBytesConfig for 4-bit quantization (for …)
+# BitsAndBytesConfig for 4-bit quantization (for BOTH models now)
 bnb_config_4bit = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
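Note: the `bnb_config_4bit` block continues past the hunk boundary, so only its first lines are visible above. A minimal sketch of a complete 4-bit NF4 config of this kind follows; the compute-dtype and double-quantization lines are assumptions, since they fall outside the visible diff.

```python
# Hedged sketch of a full 4-bit NF4 quantization config; only the opener and
# the first two keyword arguments appear in the diff, the rest are assumptions.
import torch
from transformers import BitsAndBytesConfig

bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,                      # store linear-layer weights in 4 bits
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumption: dequantized matmuls run in bf16
    bnb_4bit_use_double_quant=True,         # assumption: also quantize the quantization constants
)
```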
@@ -25,21 +25,13 @@ def get_model_and_tokenizer(size): # Function to load model on demand
     model_id = model_ids[size]
     print(f"Loading {size} model: {model_id} on demand")
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    if size == "32B-Unsloth":  # 32B model - 4-bit quantized loading
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            quantization_config=bnb_config_4bit,
-            torch_dtype=torch.bfloat16,
-            device_map='auto',
-            trust_remote_code=True
-        )
-    else: # 7B model - standard loading
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch.bfloat16, # Or torch.float16 if needed
-            device_map='auto',
-            trust_remote_code=True
-        )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        quantization_config=bnb_config_4bit, # Apply 4-bit config for BOTH models
+        torch_dtype=torch.bfloat16, # Or torch.float16 if needed
+        device_map='auto',
+        trust_remote_code=True
+    )
     models[size] = model
     tokenizers[size] = tokenizer
     print(f"Loaded {size} model on demand.")
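This hunk collapses a 32B-only quantized branch and a 7B full-precision branch into a single loading path. A minimal sketch of the resulting lazy-load-and-cache pattern, building on `models`, `tokenizers`, `model_ids`, and `bnb_config_4bit` from earlier in the file; the `if size not in models` guard is an assumption, since the top of the function sits outside the visible hunk.

```python
# Hedged sketch of the unified lazy loader; the cache guard is an assumption.
def get_model_and_tokenizer(size):
    if size not in models:  # load each model at most once, on first use
        model_id = model_ids[size]
        tokenizers[size] = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        models[size] = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config_4bit,  # one 4-bit config for both sizes
            torch_dtype=torch.bfloat16,
            device_map="auto",  # let accelerate place layers on available devices
            trust_remote_code=True,
        )
    return models[size], tokenizers[size]
```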
@@ -101,7 +93,7 @@ def retrieve_from_memory(query, top_k=2):
     return relevant_memories[:top_k]
 
 
-# --- Swarm Agent Function - Fixed Models (7B and 32B Unsloth) ---
+# --- Swarm Agent Function - Fixed Models (Unsloth 7B and 32B Unsloth) ---
 @spaces.GPU # <---- GPU DECORATOR ADDED HERE!
 def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.5, top_p=0.9, max_new_tokens=300): # Removed final_model_size
     global shared_memory
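The `@spaces.GPU` decorator (unchanged context above) is what makes this run on a ZeroGPU Space: the GPU is attached only while a decorated call executes. A minimal usage sketch; the function name is illustrative and the `duration` argument is optional.

```python
# Hedged sketch of the ZeroGPU pattern: the Space only holds a GPU while a
# @spaces.GPU-decorated function is running.
import spaces

@spaces.GPU(duration=120)  # optional hint: up to ~120 s of GPU time per call
def run_on_gpu(prompt: str) -> str:
    # CUDA work (model.generate, .to("cuda"), ...) must happen inside this call
    return prompt.upper()  # placeholder body
```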
@@ -109,9 +101,9 @@ def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_temp
 
     print(f"\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED - Final Model: 32B Unsloth ---") # Updated message
 
-    # 7B Model - Brainstorming/Initial Draft (Lazy Load and get model)
-    print("\n[7B Model - Brainstorming] - GPU Accelerated") # Now 7B is brainstorming
-    model_7b, tokenizer_7b = get_model_and_tokenizer("7B") # Lazy load 7B
+    # 7B Unsloth Model - Brainstorming/Initial Draft (Lazy Load and get model)
+    print("\n[7B Unsloth Model - Brainstorming] - GPU Accelerated") # Now 7B Unsloth is brainstorming
+    model_7b, tokenizer_7b = get_model_and_tokenizer("7B-Unsloth") # Lazy load 7B Unsloth
     retrieved_memory_7b = retrieve_from_memory(user_prompt)
     context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."
 
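`store_in_memory` and `retrieve_from_memory` appear in this diff only through their call sites and the `return relevant_memories[:top_k]` tail above. A hedged sketch of a retriever consistent with that signature, using naive keyword overlap; the actual scoring in app.py may differ.

```python
# Hedged sketch matching the visible signature retrieve_from_memory(query, top_k=2);
# the keyword-overlap scoring rule itself is an assumption.
shared_memory = []

def store_in_memory(text):
    shared_memory.append(text)

def retrieve_from_memory(query, top_k=2):
    query_words = set(query.lower().split())
    # keep only memories sharing at least one word with the query, best match first
    relevant_memories = sorted(
        (mem for mem in shared_memory if query_words & set(mem.lower().split())),
        key=lambda mem: len(query_words & set(mem.lower().split())),
        reverse=True,
    )
    return relevant_memories[:top_k]
```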
@@ -127,8 +119,8 @@ def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_temp
         do_sample=True
     )
     response_7b = tokenizer_7b.decode(output_7b[0], skip_special_tokens=True)
-    print(f"7B Response (Brainstorming):\n{response_7b}") # Updated message
-    store_in_memory(f"7B Model Initial Response: {response_7b[:200]}...")
+    print(f"7B Unsloth Response (Brainstorming):\n{response_7b}") # Updated message
+    store_in_memory(f"7B Unsloth Model Initial Response: {response_7b[:200]}...")
 
     # 32B Unsloth Model - Final Code Generation (Lazy Load and get model)
     final_model, final_tokenizer = get_model_and_tokenizer("32B-Unsloth") # Lazy load 32B Unsloth
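The `do_sample=True` / decode tail above belongs to a `generate()` call whose head is outside the hunk. A sketch of what that call plausibly looks like, with the sampling knobs wired to the UI defaults; the `prompt_text` line stands in for the template-formatted prompt and is an assumption.

```python
# Hedged sketch of the sampling call whose tail appears in the hunk above.
inputs_7b = tokenizer_7b(prompt_text, return_tensors="pt").to(model_7b.device)
output_7b = model_7b.generate(
    **inputs_7b,
    max_new_tokens=300,  # "Max Tokens" default from the UI
    temperature=0.5,     # "Temperature" slider default
    top_p=0.9,           # "Top P" slider default
    do_sample=True,      # sample instead of greedy decoding
)
response_7b = tokenizer_7b.decode(output_7b[0], skip_special_tokens=True)
```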
@@ -178,11 +170,11 @@ iface = gr.ChatInterface( # Using ChatInterface now
         gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"), # Lowered default temp to 0.5
         gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
         gr.Number(value=300, label="Max Tokens", precision=0), # Use Number for integer tokens
-        gr.Textbox(value=default_prompt_1_5b, lines=10, label="Brainstorming Model Prompt Template (7B …"),
-        gr.Textbox(value=default_prompt_7b, lines=10, label="Code Generation Prompt Template (…"),
+        gr.Textbox(value=default_prompt_1_5b, lines=10, label="Brainstorming Model Prompt Template (Unsloth 7B)"), # Updated label - Unsloth 7B now brainstormer
+        gr.Textbox(value=default_prompt_7b, lines=10, label="Code Generation Prompt Template (Unsloth 32B)"), # Updated label - Unsloth 32B is code generator
     ],
-    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - Fixed Models: 7B + 32B …",
-    description="Chat with a DeepSeek agent swarm (7B + 32B …",
+    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - Fixed Models: Unsloth 7B + 32B)", # Updated title
+    description="Chat with a DeepSeek agent swarm (Unsloth 7B + 32B) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)", # Updated description
 )
 
 if __name__ == "__main__":
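For reference, extra controls passed to `gr.ChatInterface` via `additional_inputs` arrive as trailing arguments after `(message, history)`. A minimal runnable sketch of that wiring; `chat_fn` is a stub standing in for `swarm_agent_sequential_rag`, which this diff does not show in full.

```python
# Hedged sketch of the ChatInterface wiring; the handler body is a placeholder.
import gradio as gr

def chat_fn(message, history, temperature, top_p, max_tokens, prompt_7b, prompt_32b):
    return f"echo: {message}"  # stand-in for swarm_agent_sequential_rag(...)

iface = gr.ChatInterface(
    fn=chat_fn,
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0),
        gr.Textbox(lines=10, label="Brainstorming Model Prompt Template (Unsloth 7B)"),
        gr.Textbox(lines=10, label="Code Generation Prompt Template (Unsloth 32B)"),
    ],
)

if __name__ == "__main__":
    iface.launch()
```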