benardo0 committed · verified
Commit 17025e8 · 1 Parent(s): 030bf70

Update app.py

Files changed (1)
  1. app.py +41 -31
app.py CHANGED
@@ -164,19 +164,21 @@ class MedicalAssistant:
     def __init__(self):
         """
         Initialize the medical assistant with CPU-friendly settings.
-        We'll use careful memory management and avoid GPU-specific features.
+        We use a base model instead of a quantized version to ensure CPU compatibility.
         """
         try:
             logger.info("Starting model initialization...")
 
-            # Model configuration
-            self.model_name = "emircanerol/Llama3-Med42-8B-4bit"
+            # Using a standard model instead of a 4-bit quantized version
+            # This model is larger but more compatible with CPU-only environments
+            self.model_name = "meta-llama/Llama-2-7b-chat-hf"
             self.max_length = 2048
 
             # First load the tokenizer as it's lighter on memory
             logger.info("Loading tokenizer...")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
+                token=os.getenv('HUGGING_FACE_TOKEN'),  # Add your token in Space settings
                 trust_remote_code=True
             )
 
@@ -189,22 +191,19 @@ class MedicalAssistant:
             logger.info("Loading model - this may take a few minutes...")
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
-                torch_dtype=torch.float32,  # Use float32 for CPU
-                low_cpu_mem_usage=True,
-                trust_remote_code=True
+                token=os.getenv('HUGGING_FACE_TOKEN'),
+                device_map="auto",  # This will default to CPU if no GPU is available
+                torch_dtype=torch.float32,  # Standard precision for CPU
+                low_cpu_mem_usage=True,  # Optimize memory usage
+                offload_folder="offload"  # Enable disk offloading for memory management
             )
 
-            # Create the pipeline with our loaded components
-            logger.info("Creating pipeline...")
-            self.pipe = pipeline(
-                "text-generation",
-                model=self.model,
-                tokenizer=self.tokenizer,
-                device=-1,  # Force CPU usage
-                torch_dtype=torch.float32
-            )
+            # Move model explicitly to CPU and clear any GPU memory
+            self.model = self.model.to('cpu')
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
 
-            logger.info("Initialization completed successfully!")
+            logger.info("Model loaded successfully!")
 
         except Exception as e:
             logger.error(f"Initialization failed: {str(e)}")
@@ -213,8 +212,8 @@ class MedicalAssistant:
 
     def generate_response(self, message: str, chat_history: List[Dict] = None) -> str:
         """
-        Generate a response using the text generation pipeline.
-        Includes careful error handling and response processing.
+        Generate a response directly using the model instead of a pipeline.
+        This gives us more control over the generation process.
         """
         try:
             logger.info("Preparing message for generation")
@@ -227,19 +226,30 @@ class MedicalAssistant:
             # Format the conversation
             prompt = f"{system_prompt}\n\nUser: {message}\nAssistant:"
 
+            # Tokenize the input
+            inputs = self.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=self.max_length
+            ).to('cpu')  # Ensure inputs are on CPU
+
             logger.info("Generating response")
             # Generate with conservative settings for CPU
-            response = self.pipe(
-                prompt,
-                max_new_tokens=256,  # Reduced for CPU efficiency
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.95,
-                num_return_sequences=1,
-                pad_token_id=self.tokenizer.pad_token_id
-            )[0]["generated_text"]
+            with torch.no_grad():  # Disable gradient computation to save memory
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=256,  # Reduced for CPU efficiency
+                    do_sample=True,
+                    temperature=0.7,
+                    top_p=0.95,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    repetition_penalty=1.1
+                )
 
-            # Clean up the response
+            # Decode and clean up the response
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
             response = response.split("Assistant:")[-1].strip()
 
             logger.info("Response generated successfully")
@@ -250,7 +260,7 @@ class MedicalAssistant:
             logger.error(traceback.format_exc())
             return f"I apologize, but I encountered an error: {str(e)}"
 
-# Global assistant instance
+# The rest of your code remains the same
 assistant = None
 
 def initialize_assistant():
@@ -287,8 +297,8 @@ demo = gr.ChatInterface(
     fn=chat_response,
     title="Medical Assistant (CPU Version)",
     description="""This medical assistant provides guidance and information
-                   about health-related queries. Note that this is running
-                   in CPU mode for broader compatibility.""",
+                   about health-related queries. Please note that response
+                   generation may take longer as this is running in CPU mode.""",
     examples=[
         "What are the symptoms of malaria?",
         "How can I prevent type 2 diabetes?",