benardo0 committed · verified
Commit 17025e8 · 1 Parent(s): 030bf70

Update app.py

Files changed (1)
  1. app.py +41 -31
app.py CHANGED
@@ -164,19 +164,21 @@ class MedicalAssistant:
     def __init__(self):
         """
         Initialize the medical assistant with CPU-friendly settings.
-        We'll use careful memory management and avoid GPU-specific features.
+        We use a base model instead of a quantized version to ensure CPU compatibility.
         """
         try:
             logger.info("Starting model initialization...")
 
-            # Model configuration
-            self.model_name = "emircanerol/Llama3-Med42-8B-4bit"
+            # Using a standard model instead of a 4-bit quantized version
+            # This model is larger but more compatible with CPU-only environments
+            self.model_name = "meta-llama/Llama-2-7b-chat-hf"
             self.max_length = 2048
 
             # First load the tokenizer as it's lighter on memory
             logger.info("Loading tokenizer...")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
+                token=os.getenv('HUGGING_FACE_TOKEN'),  # Add your token in Space settings
                 trust_remote_code=True
             )
 
@@ -189,22 +191,19 @@ class MedicalAssistant:
             logger.info("Loading model - this may take a few minutes...")
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
-                torch_dtype=torch.float32,  # Use float32 for CPU
-                low_cpu_mem_usage=True,
-                trust_remote_code=True
+                token=os.getenv('HUGGING_FACE_TOKEN'),
+                device_map="auto",  # This will default to CPU if no GPU is available
+                torch_dtype=torch.float32,  # Standard precision for CPU
+                low_cpu_mem_usage=True,  # Optimize memory usage
+                offload_folder="offload"  # Enable disk offloading for memory management
             )
 
-            # Create the pipeline with our loaded components
-            logger.info("Creating pipeline...")
-            self.pipe = pipeline(
-                "text-generation",
-                model=self.model,
-                tokenizer=self.tokenizer,
-                device=-1,  # Force CPU usage
-                torch_dtype=torch.float32
-            )
+            # Move model explicitly to CPU and clear any GPU memory
+            self.model = self.model.to('cpu')
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
 
-            logger.info("Initialization completed successfully!")
+            logger.info("Model loaded successfully!")
 
         except Exception as e:
             logger.error(f"Initialization failed: {str(e)}")
@@ -213,8 +212,8 @@ class MedicalAssistant:
 
     def generate_response(self, message: str, chat_history: List[Dict] = None) -> str:
         """
-        Generate a response using the text generation pipeline.
-        Includes careful error handling and response processing.
+        Generate a response directly using the model instead of a pipeline.
+        This gives us more control over the generation process.
         """
         try:
             logger.info("Preparing message for generation")
@@ -227,19 +226,30 @@ class MedicalAssistant:
             # Format the conversation
             prompt = f"{system_prompt}\n\nUser: {message}\nAssistant:"
 
+            # Tokenize the input
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=self.max_length
+            ).to('cpu')  # Ensure inputs are on CPU
+
             logger.info("Generating response")
             # Generate with conservative settings for CPU
-            response = self.pipe(
-                prompt,
-                max_new_tokens=256,  # Reduced for CPU efficiency
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.95,
-                num_return_sequences=1,
-                pad_token_id=self.tokenizer.pad_token_id
-            )[0]["generated_text"]
+            with torch.no_grad():  # Disable gradient computation to save memory
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=256,  # Reduced for CPU efficiency
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.95,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    repetition_penalty=1.1
+                )
 
-            # Clean up the response
+            # Decode and clean up the response
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             response = response.split("Assistant:")[-1].strip()
 
             logger.info("Response generated successfully")
@@ -250,7 +260,7 @@ class MedicalAssistant:
             logger.error(traceback.format_exc())
             return f"I apologize, but I encountered an error: {str(e)}"
 
-# Global assistant instance
+# The rest of your code remains the same
 assistant = None
 
 def initialize_assistant():
@@ -287,8 +297,8 @@ demo = gr.ChatInterface(
     fn=chat_response,
     title="Medical Assistant (CPU Version)",
     description="""This medical assistant provides guidance and information
-                   about health-related queries. Note that this is running
-                   in CPU mode for broader compatibility.""",
+                   about health-related queries. Please note that response
+                   generation may take longer as this is running in CPU mode.""",
     examples=[
         "What are the symptoms of malaria?",
         "How can I prevent type 2 diabetes?",