wuhp committed on
Commit 6a12f54 · verified · 1 Parent(s): d858dc3

Update app.py

Files changed (1)
  1. app.py +76 -77
app.py CHANGED
@@ -1,16 +1,52 @@
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer
  import torch
  import spaces  # Import the spaces library

- # Model IDs from Hugging Face Hub (now 1.5B, 7B, and 14B)
  model_ids = {
-     "1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
      "7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
-     "14B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",  # Added 14B back
  }

- # Revised Default Prompts (as defined above)
  default_prompt_1_5b = """**Code Analysis Task**
  As a Senior Code Analyst, analyze this programming problem:

@@ -23,8 +59,8 @@ As a Senior Code Analyst, analyze this programming problem:
  **Analysis Required:**
  1. Briefly break down the problem, including key constraints and edge cases.
  2. Suggest 2-3 potential approach options (algorithms/data structures).
- 3. Recommend ONE primary strategy and briefly justify your choice.
- 4. Provide a very brief initial pseudocode sketch of the core logic."""


  default_prompt_7b = """**Code Implementation Task**
@@ -43,25 +79,6 @@ As a Principal Software Engineer, provide production-ready Streamlit/Python code
  """


- # Function to load model and tokenizer (same)
- def load_model_and_tokenizer(model_id):
-     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-     model = AutoModelForCausalLM.from_pretrained(
-         model_id,
-         torch_dtype=torch.bfloat16,  # Or torch.float16 if you prefer
-         device_map='auto',  # Let accelerate decide (will use GPU when @spaces.GPU active)
-         trust_remote_code=True
-     )
-     return model, tokenizer
-
- # Load the selected models and tokenizers (now loads 1.5B, 7B, 14B)
- models = {}
- tokenizers = {}
- for size, model_id in model_ids.items():
-     print(f"Loading {size} model: {model_id}")
-     models[size], tokenizers[size] = load_model_and_tokenizer(model_id)
-     print(f"Loaded {size} model.")
-
  # --- Shared Memory Implementation --- (Same)
  shared_memory = []

@@ -84,62 +101,46 @@ def retrieve_from_memory(query, top_k=2):
      return relevant_memories[:top_k]


- # --- Swarm Agent Function with Model Swapping ---
  @spaces.GPU  # <---- GPU DECORATOR ADDED HERE!
- def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, final_model_size="7B", temperature=0.5, top_p=0.9, max_new_tokens=300):  # Added final_model_size
      global shared_memory
      shared_memory = []  # Clear memory for each new request

-     print(f"\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED - Final Model: {final_model_size} ---")  # Updated message

-     # 1.5B Model - Brainstorming/Initial Draft (same logic)
-     print("\n[1.5B Model - Brainstorming] - GPU Accelerated")
-     retrieved_memory_1_5b = retrieve_from_memory(user_prompt)
-     context_1_5b = "\n".join([f"- {mem}" for mem in retrieved_memory_1_5b]) if retrieved_memory_1_5b else "No relevant context found in memory."

-     # Use user-provided prompt template for 1.5B model
-     prompt_1_5b = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_1_5b)

-     input_ids_1_5b = tokenizers["1.5B"].encode(prompt_1_5b, return_tensors="pt").to(models["1.5B"].device)
-     output_1_5b = models["1.5B"].generate(
-         input_ids_1_5b,
          max_new_tokens=max_new_tokens,  # Use user-defined max_new_tokens
          temperature=temperature,  # Use user-defined temperature
          top_p=top_p,  # Use user-defined top_p
          do_sample=True
      )
-     response_1_5b = tokenizers["1.5B"].decode(output_1_5b[0], skip_special_tokens=True)
-     print(f"1.5B Response:\n{response_1_5b}")
-     store_in_memory(f"1.5B Model Initial Response: {response_1_5b[:200]}...")
-
-     # Final Stage Model Selection (7B or 14B)
-     if final_model_size == "7B":
-         final_model = models["7B"]
-         final_tokenizer = tokenizers["7B"]
-         print("\n[7B Model - Final Code Generation] - GPU Accelerated")  # Model-specific message
-         model_stage_name = "7B Model - Final Code"
-         final_max_new_tokens = max_new_tokens + 100  # Slightly more tokens for 7B
-
-     elif final_model_size == "14B":
-         final_model = models["14B"]
-         final_tokenizer = tokenizers["14B"]
-         print("\n[14B Model - Final Code Generation] - GPU Accelerated")  # Model-specific message
-         model_stage_name = "14B Model - Final Code"
-         final_max_new_tokens = max_new_tokens + 200  # Even more tokens for 14B
-
-     else:  # Default to 7B if selection is somehow invalid
-         final_model = models["7B"]
-         final_tokenizer = tokenizers["7B"]
-         print("\n[7B Model - Final Code Generation] - GPU Accelerated (Default)")
-         model_stage_name = "7B Model - Final Code (Default)"
-         final_max_new_tokens = max_new_tokens + 100
-
-
-     retrieved_memory_final = retrieve_from_memory(response_1_5b)
      context_final = "\n".join([f"- {mem}" for mem in retrieved_memory_final]) if retrieved_memory_final else "No relevant context found in memory."

-     # Use user-provided prompt template for final model (currently using 7B prompt for both 7B and 14B for simplicity, you can create a separate 14B prompt if needed)
-     prompt_final = prompt_7b_template.format(response_1_5b=response_1_5b, context_7b=context_final)  # Using prompt_7b_template for final stage for now


      input_ids_final = final_tokenizer.encode(prompt_final, return_tensors="pt").to(final_model.device)
@@ -157,14 +158,13 @@ def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_temp
      return response_final  # Returns final model's response


- # --- Gradio ChatInterface --- (with Model Selection Dropdown)
- def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text, final_model_selector):  # Added final_model_selector
      # history is automatically managed by ChatInterface
      response = swarm_agent_sequential_rag(
          message,
          prompt_1_5b_template=prompt_1_5b_text,  # Pass prompt templates
          prompt_7b_template=prompt_7b_text,
-         final_model_size=final_model_selector,  # Pass model selection
          temperature=temp,
          top_p=top_p,
          max_new_tokens=int(max_tokens)  # Ensure max_tokens is an integer
@@ -173,17 +173,16 @@ def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text

  iface = gr.ChatInterface(  # Using ChatInterface now
      fn=gradio_interface,
-     # Define additional inputs for settings, prompts, and model selection
      additional_inputs=[
          gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),  # Lowered default temp to 0.5
          gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
          gr.Number(value=300, label="Max Tokens", precision=0),  # Use Number for integer tokens
-         gr.Textbox(value=default_prompt_1_5b, lines=10, label="1.5B Model Prompt Template"),  # Textbox for 1.5B prompt
-         gr.Textbox(value=default_prompt_7b, lines=10, label="7B Model Prompt Template"),  # Textbox for 7B prompt
-         gr.Dropdown(choices=["7B", "14B"], value="7B", label="Final Stage Model (7B or 14B)")  # Model selection dropdown
      ],
-     title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - 2 Models + Model Swap)",  # Updated title
-     description="Chat with a DeepSeek agent swarm (1.5B + 7B/14B selectable) with shared memory, adjustable settings, **customizable prompts, and model swapping!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)",  # Updated description
  )

  if __name__ == "__main__":
 
  import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
  import torch
  import spaces  # Import the spaces library

+ # Model IDs from Hugging Face Hub (Fixed to 7B and 32B Unsloth)
  model_ids = {
      "7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
+     "32B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit",  # Unsloth 32B model
  }

+ models = {}  # Keep models as a dictionary, but initially empty
+ tokenizers = {}  # Keep tokenizers as a dictionary, initially empty
+
+ # BitsAndBytesConfig for 4-bit quantization (for the 32B model)
+ bnb_config_4bit = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16,  # Or torch.float16 if needed
+ )
+
+
+ def get_model_and_tokenizer(size):  # Function to load model on demand
+     if size not in models:  # Load only if not already loaded
+         model_id = model_ids[size]
+         print(f"Loading {size} model: {model_id} on demand")
+         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+         if size == "32B-Unsloth":  # Apply 4-bit config for 32B model
+             model = AutoModelForCausalLM.from_pretrained(
+                 model_id,
+                 quantization_config=bnb_config_4bit,
+                 torch_dtype=torch.bfloat16,  # Or torch.float16 if needed
+                 device_map='auto',
+                 trust_remote_code=True
+             )
+         else:  # 7B model - standard loading
+             model = AutoModelForCausalLM.from_pretrained(
+                 model_id,
+                 torch_dtype=torch.bfloat16,  # Or torch.float16 if needed
+                 device_map='auto',
+                 trust_remote_code=True
+             )
+         models[size] = model
+         tokenizers[size] = tokenizer
+         print(f"Loaded {size} model on demand.")
+     return models[size], tokenizers[size]
+
+
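Aside (not part of the diff): the loader above memoizes into the module-level `models`/`tokenizers` dicts, so each checkpoint is materialized at most once per process. A minimal sketch of that behavior, assuming the definitions from app.py above are in scope:

    # Illustrative sketch only - relies on get_model_and_tokenizer and the
    # module-level `models` / `tokenizers` caches defined above.
    model_7b, tok_7b = get_model_and_tokenizer("7B")   # first call: loads and caches
    same_model, _ = get_model_and_tokenizer("7B")      # second call: served from the cache
    assert same_model is model_7b                       # same object, no reload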
+ # Revised Default Prompts (as defined previously - these are still good)
  default_prompt_1_5b = """**Code Analysis Task**
  As a Senior Code Analyst, analyze this programming problem:

  ...
  **Analysis Required:**
  1. Briefly break down the problem, including key constraints and edge cases.
  2. Suggest 2-3 potential approach options (algorithms/data structures).
+ 3. Recommend ONE primary strategy and briefly justify your choice.
+ 4. Provide a very brief initial pseudocode sketch of the core logic."""


  default_prompt_7b = """**Code Implementation Task**
  ...
  """


  # --- Shared Memory Implementation --- (Same)
  shared_memory = []

  ...
      return relevant_memories[:top_k]
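`store_in_memory` and `retrieve_from_memory` are untouched by this commit, so only their surroundings appear in the diff. A hypothetical sketch of the kind of logic behind them, inferred only from the call sites and the `return relevant_memories[:top_k]` line above (the actual code in app.py may differ):

    # Hypothetical sketch - not the code from app.py.
    def store_in_memory(memory_text):
        shared_memory.append(memory_text)

    def retrieve_from_memory(query, top_k=2):
        # Rank stored snippets by naive word overlap with the query.
        relevant_memories = sorted(
            shared_memory,
            key=lambda mem: len(set(query.lower().split()) & set(mem.lower().split())),
            reverse=True,
        )
        return relevant_memories[:top_k]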


+ # --- Swarm Agent Function - Fixed Models (7B and 32B Unsloth) ---
  @spaces.GPU  # <---- GPU DECORATOR ADDED HERE!
+ def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.5, top_p=0.9, max_new_tokens=300):  # Removed final_model_size
      global shared_memory
      shared_memory = []  # Clear memory for each new request

+     print(f"\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED - Final Model: 32B Unsloth ---")  # Updated message

+     # 7B Model - Brainstorming/Initial Draft (Lazy Load and get model)
+     print("\n[7B Model - Brainstorming] - GPU Accelerated")  # Now 7B is brainstorming
+     model_7b, tokenizer_7b = get_model_and_tokenizer("7B")  # Lazy load 7B
+     retrieved_memory_7b = retrieve_from_memory(user_prompt)
+     context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."

+     # Use user-provided prompt template for 7B model (as brainstorming model now)
+     prompt_7b_brainstorm = prompt_1_5b_template.format(user_prompt=user_prompt, context_1_5b=context_7b)  # Reusing 1.5B template - adjust if needed

+     input_ids_7b = tokenizer_7b.encode(prompt_7b_brainstorm, return_tensors="pt").to(model_7b.device)
+     output_7b = model_7b.generate(
+         input_ids_7b,
          max_new_tokens=max_new_tokens,  # Use user-defined max_new_tokens
          temperature=temperature,  # Use user-defined temperature
          top_p=top_p,  # Use user-defined top_p
          do_sample=True
      )
+     response_7b = tokenizer_7b.decode(output_7b[0], skip_special_tokens=True)
+     print(f"7B Response (Brainstorming):\n{response_7b}")  # Updated message
+     store_in_memory(f"7B Model Initial Response: {response_7b[:200]}...")
+
+     # 32B Unsloth Model - Final Code Generation (Lazy Load and get model)
+     final_model, final_tokenizer = get_model_and_tokenizer("32B-Unsloth")  # Lazy load 32B Unsloth
+     print("\n[32B Unsloth Model - Final Code Generation] - GPU Accelerated")  # Model-specific message
+     model_stage_name = "32B Unsloth Model - Final Code"
+     final_max_new_tokens = max_new_tokens + 200  # More tokens for 32B model
+
+     retrieved_memory_final = retrieve_from_memory(response_7b)  # Memory from 7B brainstorm
      context_final = "\n".join([f"- {mem}" for mem in retrieved_memory_final]) if retrieved_memory_final else "No relevant context found in memory."

+     # Use user-provided prompt template for final model (using 7B template)
+     prompt_final = prompt_7b_template.format(response_1_5b=response_7b, context_7b=context_final)  # Using prompt_7b_template for final stage


      input_ids_final = final_tokenizer.encode(prompt_final, return_tensors="pt").to(final_model.device)
  ...
      return response_final  # Returns final model's response

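One practical note on the generation calls above (an observation, not a change in this commit): `generate` on a causal LM returns the prompt tokens followed by the newly generated ones, so decoding `output_7b[0]` in full (and likewise the final model's output) echoes the whole prompt back into the response and into shared memory. A small sketch of how that echo could be trimmed, reusing the names from the 7B stage:

    # Sketch only: decode just the tokens generated after the prompt.
    prompt_len = input_ids_7b.shape[-1]
    response_7b = tokenizer_7b.decode(output_7b[0][prompt_len:], skip_special_tokens=True)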
 
+ # --- Gradio ChatInterface --- (No Model Selection Dropdown anymore)
+ def gradio_interface(message, history, temp, top_p, max_tokens, prompt_1_5b_text, prompt_7b_text):  # Removed final_model_selector
      # history is automatically managed by ChatInterface
      response = swarm_agent_sequential_rag(
          message,
          prompt_1_5b_template=prompt_1_5b_text,  # Pass prompt templates
          prompt_7b_template=prompt_7b_text,
          temperature=temp,
          top_p=top_p,
          max_new_tokens=int(max_tokens)  # Ensure max_tokens is an integer
  ...

  iface = gr.ChatInterface(  # Using ChatInterface now
      fn=gradio_interface,
+     # Define additional inputs for settings and prompts (NO model dropdown)
      additional_inputs=[
          gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),  # Lowered default temp to 0.5
          gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
          gr.Number(value=300, label="Max Tokens", precision=0),  # Use Number for integer tokens
+         gr.Textbox(value=default_prompt_1_5b, lines=10, label="Brainstorming Model Prompt Template (7B Model)"),  # Updated label - 7B now brainstormer
+         gr.Textbox(value=default_prompt_7b, lines=10, label="Code Generation Prompt Template (32B Unsloth Model)"),  # Updated label - 32B is code generator
      ],
+     title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - Fixed Models: 7B + 32B Unsloth)",  # Updated title
+     description="Chat with a DeepSeek agent swarm (7B + 32B Unsloth) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)",  # Updated description
  )

  if __name__ == "__main__":
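The diff stops at the `__main__` guard; its body is unchanged and not shown here. For context, a Space of this shape typically finishes with a plain launch call along these lines (illustrative, not taken from this commit):

    if __name__ == "__main__":
        iface.launch()  # start the Gradio app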