wuhp committed
Commit 85bfd55 · verified · 1 Parent(s): 254808b

Update app.py

Files changed (1)
  1. app.py +21 -29
app.py CHANGED
@@ -3,16 +3,16 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import torch
 import spaces # Import the spaces library
 
-# Model IDs from Hugging Face Hub (Fixed to 7B and 32B Unsloth)
+# Model IDs from Hugging Face Hub (Fixed to Unsloth 7B and 32B Unsloth 4bit)
 model_ids = {
-    "7B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
-    "32B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit", # Unsloth 32B model
+    "7B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-7B-unsloth-bnb-4bit", # Unsloth 7B model
+    "32B-Unsloth": "unsloth/DeepSeek-R1-Distill-Qwen-32B-bnb-4bit", # Unsloth 32B model
 }
 
 models = {} # Keep models as a dictionary, but initially empty
 tokenizers = {} # Keep tokenizers as a dictionary, initially empty
 
-# BitsAndBytesConfig for 4-bit quantization (for the 32B model)
+# BitsAndBytesConfig for 4-bit quantization (for BOTH models now)
 bnb_config_4bit = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
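The hunk view is cut off after `bnb_4bit_quant_type="nf4"`. For reference, a complete NF4 setup in `transformers` usually also pins a compute dtype and enables nested quantization; the sketch below assumes those two values, which this diff does not confirm.

```python
import torch
from transformers import BitsAndBytesConfig

# Minimal sketch of a full 4-bit config; the last two fields are assumptions,
# not taken from this commit.
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumption: run matmuls in bf16
    bnb_4bit_use_double_quant=True,         # assumption: quantize the quantization constants
)
```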
@@ -25,21 +25,13 @@ def get_model_and_tokenizer(size): # Function to load model on demand
     model_id = model_ids[size]
     print(f"Loading {size} model: {model_id} on demand")
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
-    if size == "32B-Unsloth": # Apply 4-bit config for 32B model
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            quantization_config=bnb_config_4bit,
-            torch_dtype=torch.bfloat16, # Or torch.float16 if needed
-            device_map='auto',
-            trust_remote_code=True
-        )
-    else: # 7B model - standard loading
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch.bfloat16, # Or torch.float16 if needed
-            device_map='auto',
-            trust_remote_code=True
-        )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        quantization_config=bnb_config_4bit, # Apply 4-bit config for BOTH models
+        torch_dtype=torch.bfloat16, # Or torch.float16 if needed
+        device_map='auto',
+        trust_remote_code=True
+    )
     models[size] = model
     tokenizers[size] = tokenizer
     print(f"Loaded {size} model on demand.")
@@ -101,7 +93,7 @@ def retrieve_from_memory(query, top_k=2):
     return relevant_memories[:top_k]
 
 
-# --- Swarm Agent Function - Fixed Models (7B and 32B Unsloth) ---
+# --- Swarm Agent Function - Fixed Models (Unsloth 7B and 32B Unsloth) ---
 @spaces.GPU # <---- GPU DECORATOR ADDED HERE!
 def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_template, temperature=0.5, top_p=0.9, max_new_tokens=300): # Removed final_model_size
     global shared_memory
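On a ZeroGPU Space, the `@spaces.GPU` decorator attaches a GPU only while the decorated call is running, which is why it wraps the whole swarm function rather than the module-level setup code. A minimal sketch of the pattern, with an illustrative duration:

```python
import spaces
import torch

@spaces.GPU(duration=120)  # duration is illustrative; the bare @spaces.GPU form used above also works
def gpu_probe() -> str:
    # A GPU is attached only for the lifetime of this call on ZeroGPU.
    return f"CUDA available inside the call: {torch.cuda.is_available()}"
```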
@@ -109,9 +101,9 @@ def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_temp
 
     print(f"\n--- Swarm Agent Processing with Shared Memory (RAG) - GPU ACCELERATED - Final Model: 32B Unsloth ---") # Updated message
 
-    # 7B Model - Brainstorming/Initial Draft (Lazy Load and get model)
-    print("\n[7B Model - Brainstorming] - GPU Accelerated") # Now 7B is brainstorming
-    model_7b, tokenizer_7b = get_model_and_tokenizer("7B") # Lazy load 7B
+    # 7B Unsloth Model - Brainstorming/Initial Draft (Lazy Load and get model)
+    print("\n[7B Unsloth Model - Brainstorming] - GPU Accelerated") # Now 7B Unsloth is brainstorming
+    model_7b, tokenizer_7b = get_model_and_tokenizer("7B-Unsloth") # Lazy load 7B Unsloth
     retrieved_memory_7b = retrieve_from_memory(user_prompt)
     context_7b = "\n".join([f"- {mem}" for mem in retrieved_memory_7b]) if retrieved_memory_7b else "No relevant context found in memory."
 
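The brainstorming step pulls context from `shared_memory` via `retrieve_from_memory` and joins the hits into a bulleted context block. The diff shows only the function's signature and its final `[:top_k]` slice (in the previous hunk header), so the scoring below is a hypothetical keyword-overlap sketch, not the app's actual implementation.

```python
shared_memory = []  # filled by store_in_memory() elsewhere in app.py

def retrieve_from_memory(query, top_k=2):
    # Hypothetical relevance scoring: count shared lowercase words between the
    # query and each stored memory, then keep the best-overlapping entries.
    query_words = set(query.lower().split())
    scored = [(len(query_words & set(mem.lower().split())), mem) for mem in shared_memory]
    relevant_memories = [mem for score, mem in sorted(scored, reverse=True) if score > 0]
    return relevant_memories[:top_k]
```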
 
@@ -127,8 +119,8 @@ def swarm_agent_sequential_rag(user_prompt, prompt_1_5b_template, prompt_7b_temp
         do_sample=True
     )
     response_7b = tokenizer_7b.decode(output_7b[0], skip_special_tokens=True)
-    print(f"7B Response (Brainstorming):\n{response_7b}") # Updated message
-    store_in_memory(f"7B Model Initial Response: {response_7b[:200]}...")
+    print(f"7B Unsloth Response (Brainstorming):\n{response_7b}") # Updated message
+    store_in_memory(f"7B Unsloth Model Initial Response: {response_7b[:200]}...")
 
     # 32B Unsloth Model - Final Code Generation (Lazy Load and get model)
     final_model, final_tokenizer = get_model_and_tokenizer("32B-Unsloth") # Lazy load 32B Unsloth
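The trailing `do_sample=True` and the decode call above belong to the 7B generation step that the hunk truncates. A sketch of that step is below, assuming the prompt variable is built from the brainstorming template; the exact template filling in app.py is not shown in this diff.

```python
# Assumed prompt assembly; the real template formatting lives outside this hunk.
prompt_7b = prompt_1_5b_template.format(user_prompt=user_prompt, context=context_7b)

inputs_7b = tokenizer_7b(prompt_7b, return_tensors="pt").to(model_7b.device)
output_7b = model_7b.generate(
    **inputs_7b,
    max_new_tokens=max_new_tokens,
    temperature=temperature,
    top_p=top_p,
    do_sample=True,
)
response_7b = tokenizer_7b.decode(output_7b[0], skip_special_tokens=True)
```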
@@ -178,11 +170,11 @@ iface = gr.ChatInterface( # Using ChatInterface now
         gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"), # Lowered default temp to 0.5
         gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
         gr.Number(value=300, label="Max Tokens", precision=0), # Use Number for integer tokens
-        gr.Textbox(value=default_prompt_1_5b, lines=10, label="Brainstorming Model Prompt Template (7B Model)"), # Updated label - 7B now brainstormer
-        gr.Textbox(value=default_prompt_7b, lines=10, label="Code Generation Prompt Template (32B Unsloth Model)"), # Updated label - 32B is code generator
+        gr.Textbox(value=default_prompt_1_5b, lines=10, label="Brainstorming Model Prompt Template (Unsloth 7B)"), # Updated label - Unsloth 7B now brainstormer
+        gr.Textbox(value=default_prompt_7b, lines=10, label="Code Generation Prompt Template (Unsloth 32B)"), # Updated label - Unsloth 32B is code generator
     ],
-    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - Fixed Models: 7B + 32B Unsloth)", # Updated title
-    description="Chat with a DeepSeek agent swarm (7B + 32B Unsloth) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)", # Updated description
+    title="DeepSeek Agent Swarm Chat (ZeroGPU Demo - Fixed Models: Unsloth 7B + 32B)", # Updated title
+    description="Chat with a DeepSeek agent swarm (Unsloth 7B + 32B) with shared memory, adjustable settings, **and customizable prompts!** **GPU accelerated using ZeroGPU!** (Requires Pro Space)", # Updated description
 )
 
 if __name__ == "__main__":
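These widgets are the `additional_inputs` of a `gr.ChatInterface`; Gradio passes them to the chat function after the message and history, in the order they are listed. A sketch of how they would reach `swarm_agent_sequential_rag` follows; the `chat_fn` wrapper name and argument order are assumptions, since the diff shows only the input widgets, title, and description.

```python
import gradio as gr

def chat_fn(message, history, temperature, top_p, max_tokens, prompt_1_5b, prompt_7b):
    # gr.ChatInterface calls fn(message, history, *additional_inputs).
    return swarm_agent_sequential_rag(
        message,
        prompt_1_5b,
        prompt_7b,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=int(max_tokens),
    )

iface = gr.ChatInterface(
    fn=chat_fn,
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="Temperature"),
        gr.Slider(minimum=0.01, maximum=1.0, step=0.05, value=0.9, label="Top P"),
        gr.Number(value=300, label="Max Tokens", precision=0),
        # ...prompt template textboxes as in the hunk above...
    ],
)
```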
 