wuhp committed
Commit 361c4d3 · verified · Parent: 85bfd55

Update app.py

Files changed (1):
1. app.py  +113 -39
app.py CHANGED
@@ -1,7 +1,9 @@
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
- import torch
+ import os
  import spaces  # Import the spaces library
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
+ import torch
+ from threading import Thread

  # Model IDs from Hugging Face Hub (Fixed to Unsloth 7B and 32B Unsloth 4bit)
  model_ids = {
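
The new imports set up threaded token streaming: `generate()` blocks until completion, so the updated app runs it on a `Thread` and drains a `TextIteratorStreamer` from the calling thread. Below is a minimal, self-contained sketch of that pattern; the tiny `sshleifer/tiny-gpt2` checkpoint is a stand-in chosen for illustration, not one of the models this Space loads.

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Stand-in checkpoint for illustration only; the Space itself loads
# Unsloth DeepSeek-R1-Distill models via get_model_and_tokenizer().
tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

inputs = tok("Hello", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until done, so it runs on a worker thread while the
# main thread iterates the streamer, which yields decoded text chunks.
thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=20))
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()
```
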
@@ -93,8 +95,8 @@ def retrieve_from_memory(query, top_k=2):
      return relevant_memories[:top_k]


- # --- Swarm Agent Function - Fixed Models (Unsloth 7B and 32B Unsloth) ---
- @spaces.GPU  # <---- GPU DECORATOR ADDED HERE!
+ # --- Streaming Swarm Agent Function - Fixed Models (Unsloth 7B and 32B Unsloth) ---
+ @spaces.GPU(duration=120)  # Added duration
  def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.5, top_p=0.9, max_new_tokens=300):  # Removed final_model_size
      global shared_memory
      shared_memory = []  # Clear memory for each new request
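
On ZeroGPU Spaces, `@spaces.GPU` attaches a GPU only for the duration of the decorated call, and `duration=120` requests a longer allocation window than the default so that both sequential generations (7B brainstorm, then 32B code) fit in a single call. A minimal sketch of the decorator's contract, assuming the code runs on a ZeroGPU Space:

```python
import spaces
import torch

@spaces.GPU(duration=120)  # ZeroGPU grants a GPU only while this call runs
def gpu_probe() -> str:
    # Inside the decorated call CUDA should be available; at module import
    # time (outside any @spaces.GPU function) it generally is not.
    return f"CUDA available: {torch.cuda.is_available()}"
```
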
@@ -111,16 +113,29 @@ def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_temp
      prompt_7b_brainstorm = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_7b)  # Reusing 1.5B template - adjust if needed

      input_ids_7b = tokenizer_7b.encode(prompt_7b_brainstorm, return_tensors="pt").to(model_7b.device)
-     output_7b = model_7b.generate(
-         input_ids_7b,
+     streamer_7b = TextIteratorStreamer(tokenizer_7b, timeout=10.0, skip_prompt=True, skip_special_tokens=True)  # Streamer for 7B
+
+     generate_kwargs_7b = dict(  # Generation kwargs for 7B
+         input_ids=input_ids_7b,
+         streamer=streamer_7b,
          max_new_tokens=max_new_tokens,  # Use user-defined max_new_tokens
-         temperature=temperature,  # Use user-defined temperature
-         top_p=top_p,  # Use user-defined top_p
-         do_sample=True
+         do_sample=True,
+         temperature=temperature,
+         top_p=top_p,
+         # eos_token_id=tokenizer_7b.eos_token_id,  # Not strictly needed as streamer handles EOS
      )
-     response_7b = tokenizer_7b.decode(output_7b[0], skip_special_tokens=True)
-     print(f"7B Unsloth Response (Brainstorming):\n{response_7b}")  # Updated message
-     store_in_memory(f"7B Unsloth Model Initial Response: {response_7b[:200]}...")
+
+     thread_7b = Thread(target=model_7b.generate, kwargs=generate_kwargs_7b)  # Thread for 7B generation
+     thread_7b.start()
+
+     response_7b_stream = ""  # Accumulate streamed 7B response
+     print("7B Unsloth Response (Brainstorming):\n", end="")
+     for text in streamer_7b:  # Stream and print 7B response
+         print(text, end="", flush=True)  # Print in place
+         response_7b_stream += text
+         yield response_7b_stream  # Yield the accumulated 7B response so far
+
+     store_in_memory(f"7B Unsloth Model Initial Response: {response_7b_stream[:200]}...")  # Store accumulated 7B response

      # 32B Unsloth Model - Final Code Generation (Lazy Load and get model)
      final_model, final_tokenizer = get_model_and_tokenizer("32B-Unsloth")  # Lazy load 32B Unsloth
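
Two details of this hunk are worth noting. `timeout=10.0` makes iteration over the streamer raise `queue.Empty` if no new text arrives within 10 seconds, so a stalled generation cannot hang the request indefinitely. And each `yield` carries the full accumulated text rather than a delta, so a consumer only ever has to re-render the latest value. A small self-contained sketch of that accumulate-and-yield contract, with a hypothetical `stream_words` helper standing in for the model loop:

```python
from typing import Iterator

def stream_words(text: str) -> Iterator[str]:
    # Mirrors the loop above: every yield is the full prefix so far,
    # not just the newly produced chunk.
    acc = ""
    for word in text.split():
        acc += word + " "
        yield acc

latest = ""
for partial in stream_words("each yield re-sends the whole message"):
    latest = partial  # a UI would simply re-render this value
print(latest.strip())
```
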
@@ -128,54 +143,113 @@ def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_temp
      model_stage_name = "32B Unsloth Model - Final Code"
      final_max_new_tokens = max_new_tokens + 200  # More tokens for 32B model

-     retrieved_memory_final = retrieve_from_memory(response_7b)  # Memory from 7B brainstorm
+     retrieved_memory_final = retrieve_from_memory(response_7b_stream)  # Memory from streamed 7B response
      context_final = "\n".join([f"- {mem}" for mem in retrieved_memory_final]) if retrieved_memory_final else "No relevant context found in memory."

      # Use user-provided prompt template for final model (using 7B template)
-     prompt_final = prompt_7b_template.format(response_1_5b=response_7b, context_7b=context_final)  # Using prompt_7b_template for final stage
-
+     prompt_final = prompt_7b_template.format(response_1_5b=response_7b_stream, context_7b=context_final)  # Using prompt_7b_template for final stage

      input_ids_final = final_tokenizer.encode(prompt_final, return_tensors="pt").to(final_model.device)
-     output_final = final_model.generate(
-         input_ids_final,
+     streamer_final = TextIteratorStreamer(final_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)  # Streamer for 32B
+
+     generate_kwargs_final = dict(  # Generation kwargs for 32B
+         input_ids=input_ids_final,
+         streamer=streamer_final,
          max_new_tokens=final_max_new_tokens,
          temperature=temperature,
          top_p=top_p,
-         do_sample=True
+         # eos_token_id=final_tokenizer.eos_token_id,  # Not strictly needed as streamer handles EOS
      )
-     response_final = final_tokenizer.decode(output_final[0], skip_special_tokens=True)
-     print(f"{model_stage_name} Response:\n{response_final}")
-     store_in_memory(f"{model_stage_name} Response: {response_final[:200]}...")

-     return response_final  # Returns final model's response
+     thread_final = Thread(target=final_model.generate, kwargs=generate_kwargs_final)  # Thread for 32B generation
+     thread_final.start()
+
+     response_final_stream = ""  # Accumulate streamed 32B response
+     print(f"\n{model_stage_name} Response:\n", end="")
+     for text in streamer_final:  # Stream and print 32B response
+         print(text, end="", flush=True)  # Print in place
+         response_final_stream += text
+         yield response_final_stream  # Yield the accumulated 32B response so far
+
+     store_in_memory(f"{model_stage_name} Response: {response_final_stream[:200]}...")  # Store accumulated 32B response
+
+     return response_final_stream  # Ends the generator; the full response has already been yielded


  # --- Gradio ChatInterface --- (No Model Selection Dropdown anymore)
  def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text):  # Removed final_model_selector
      # history is automatically managed by ChatInterface
-     response = swarm_agent_sequential_rag(
+     full_response = ""  # Accumulate full response from generator
+     for partial_response in swarm_agent_sequential_rag(  # Iterate through generator
          message,
          prompt_1_5b_template=prompt_1_5b_text,  # Pass prompt templates
          prompt_7b_template=prompt_7b_text,
          temperature=temp,
          top_p=top_p,
          max_new_tokens=int(max_tokens)  # Ensure max_tokens is an integer
+     ):
+         full_response = partial_response  # Update full response with partial response
+         yield full_response  # Yield intermediate full response
+
+
+ DESCRIPTION = '''
+ <div>
+ <h1 style="text-align: center;">DeepSeek Agent Swarm Chat (Unsloth 7B + 32B) - Streaming Demo</h1>
+ <p style="text-align: center;">Agent swarm using Unsloth DeepSeek-R1-Distill models (7B + 32B) with shared memory, adjustable settings, and customizable prompts. GPU accelerated using ZeroGPU! (Requires Pro Space)</p>
+ </div>
+ '''
+
+ LICENSE = """
+ <p/>
+ ---
+ """
+
+ PLACEHOLDER = """
+ Ask me anything...
+ """
+
+
+ css = """
+ h1 {
+     text-align: center;
+     display: block;
+ }
+ #duplicate-button {
+     margin: auto;
+     color: white;
+     background: #1565c0;
+     border-radius: 100vh;
+ }
+ """
+
+ # Gradio ChatInterface with streaming
+ chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Agent Swarm Output')
+
+ with gr.Blocks(fill_height=True, css=css) as demo:
+
+     gr.Markdown(DESCRIPTION)
+     gr.ChatInterface(
+         fn=gradio_interface,
+         chatbot=chatbot,
+         fill_height=True,
+         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),  # Accordion for params
+         additional_inputs=[
+             gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),
+             gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
+             gr.Number(value=300, label="Max Tokens", precision=0),
+             gr.Textbox(value=default_prompt_1_5b, lines=7, label="Brainstorming Model Prompt Template (Unsloth 7B)"),
+             gr.Textbox(value=default_prompt_7b, lines=7, label="Code Generation Prompt Template (Unsloth 32B)"),
+         ],
+         examples=[
+             ['How to setup a human base on Mars? Give short answer.'],
+             ['Explain theory of relativity to me like I’m 8 years old.'],
+             ['Write a streamlit app to track my finances'],
+             ['Write a pun-filled happy birthday message to my friend Alex.'],
+             ['Justify why a penguin might make a good king of the jungle.']
+         ],
+         cache_examples=False,
      )
-     return response
-
- iface = gr.ChatInterface(  # Using ChatInterface now
-     fn=gradio_interface,
-     # Define additional inputs for settings and prompts (NO model dropdown)
-     additional_inputs=[
-         gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),  # Lowered default temp to 0.5
-         gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
-         gr.Number(value=300, label="Max Tokens", precision=0),  # Use Number for integer tokens
-         gr.Textbox(value=default_prompt_1_5b, lines=10, label="Brainstorming Model Prompt Template (Unsloth 7B)"),  # Updated label - Unsloth 7B now brainstormer
-         gr.Textbox(value=default_prompt_7b, lines=10, label="Code Generation Prompt Template (Unsloth 32B)"),  # Updated label - Unsloth 32B is code generator
-     ],
-     title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - Fixed Models: Unsloth 7B + 32B)",  # Updated title
-     description="Chat with a DeepSeek agent swarm (Unsloth 7B + 32B) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)",  # Updated description
- )
+
+     gr.Markdown(LICENSE)

  if __name__ == "__main__":
-     iface.launch()  # Only launch locally if running this script directly
+     demo.launch()
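Two things stand out in this final hunk. First, because `gradio_interface` is now a generator, `gr.ChatInterface` re-renders the pending bot message on every `yield`, which produces the live-typing effect. Second, the new 32B `generate_kwargs_final` pass `temperature` and `top_p` but drop `do_sample=True`, so transformers will fall back to greedy decoding and ignore those sampling knobs unless sampling is re-enabled. A minimal, self-contained sketch of the generator-to-ChatInterface wiring, with a hypothetical `echo_stream` function standing in for the real swarm:

```python
import time

import gradio as gr

def echo_stream(message, history):
    # Like gradio_interface above: yield a growing prefix and let
    # ChatInterface replace the pending bot reply on every yield.
    acc = ""
    for ch in message:
        acc += ch
        time.sleep(0.02)  # simulate token latency
        yield acc

demo = gr.ChatInterface(fn=echo_stream, title="Streaming echo (sketch)")

if __name__ == "__main__":
    demo.launch()
```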