benardo0 committed
Commit 4b15044 · verified · 1 parent: 17025e8

Update app.py

Files changed (1):
  app.py (+57, -61)
app.py CHANGED
@@ -163,47 +163,42 @@ logger = logging.getLogger(__name__)
 class MedicalAssistant:
     def __init__(self):
         """
-        Initialize the medical assistant with CPU-friendly settings.
-        We use a base model instead of a quantized version to ensure CPU compatibility.
+        Initialize the medical assistant with the Llama3-Med42 model.
+        This model is specifically trained on medical data and quantized to 4-bit precision
+        for better memory efficiency while maintaining good performance.
         """
         try:
             logger.info("Starting model initialization...")

-            # Using a standard model instead of a 4-bit quantized version
-            # This model is larger but more compatible with CPU-only environments
-            self.model_name = "meta-llama/Llama-2-7b-chat-hf"
+            # Updated model to use Llama3-Med42
+            self.model_name = "emircanerol/Llama3-Med42-8B-4bit"
             self.max_length = 2048

-            # First load the tokenizer as it's lighter on memory
+            # Initialize the pipeline for simplified text generation
+            # The pipeline handles tokenizer and model loading automatically
+            logger.info("Initializing pipeline...")
+            self.pipe = pipeline(
+                "text-generation",
+                model=self.model_name,
+                token=os.getenv('HUGGING_FACE_TOKEN'),
+                device_map="auto",
+                torch_dtype=torch.float16,  # Use half precision for 4-bit model
+                load_in_4bit=True  # Enable 4-bit quantization
+            )
+
+            # Load tokenizer separately for more control over text processing
             logger.info("Loading tokenizer...")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_name,
-                token=os.getenv('HUGGING_FACE_TOKEN'),  # Add your token in Space settings
+                token=os.getenv('HUGGING_FACE_TOKEN'),
                 trust_remote_code=True
             )

-            # Handle padding token
+            # Ensure proper padding token configuration
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
-            logger.info("Tokenizer loaded successfully")
-
-            # Load model with CPU-friendly settings
-            logger.info("Loading model - this may take a few minutes...")
-            self.model = AutoModelForCausalLM.from_pretrained(
-                self.model_name,
-                token=os.getenv('HUGGING_FACE_TOKEN'),
-                device_map="auto",  # This will default to CPU if no GPU is available
-                torch_dtype=torch.float32,  # Standard precision for CPU
-                low_cpu_mem_usage=True,  # Optimize memory usage
-                offload_folder="offload"  # Enable disk offloading for memory management
-            )
-
-            # Move model explicitly to CPU and clear any GPU memory
-            self.model = self.model.to('cpu')
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-            logger.info("Model loaded successfully!")
+
+            logger.info("Medical Assistant initialized successfully!")

         except Exception as e:
             logger.error(f"Initialization failed: {str(e)}")
@@ -212,45 +207,45 @@ class MedicalAssistant:

     def generate_response(self, message: str, chat_history: List[Dict] = None) -> str:
         """
-        Generate a response directly using the model instead of a pipeline.
-        This gives us more control over the generation process.
+        Generate a response using the Llama3-Med42 pipeline.
+        This method formats the conversation history and generates appropriate medical responses.
         """
         try:
             logger.info("Preparing message for generation")

             # Create a medical context-aware prompt
-            system_prompt = """You are a medical AI assistant. Provide accurate,
-            professional medical guidance. Always recommend consulting healthcare
-            providers for specific medical advice."""
+            system_prompt = """You are a medical AI assistant based on Llama3-Med42,
+            specifically trained on medical knowledge. Provide accurate, professional
+            medical guidance while acknowledging limitations. Always recommend
+            consulting healthcare providers for specific medical advice."""

-            # Format the conversation
-            prompt = f"{system_prompt}\n\nUser: {message}\nAssistant:"
+            # Format the conversation for the model
+            messages = [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": message}
+            ]

-            # Tokenize the input
-            inputs = self.tokenizer(
-                prompt,
-                return_tensors="pt",
-                padding=True,
-                truncation=True,
-                max_length=self.max_length
-            ).to('cpu')  # Ensure inputs are on CPU
+            # Add chat history if available
+            if chat_history:
+                for chat in chat_history:
+                    messages.append({
+                        "role": "user" if chat["role"] == "user" else "assistant",
+                        "content": chat["content"]
+                    })

             logger.info("Generating response")
-            # Generate with conservative settings for CPU
-            with torch.no_grad():  # Disable gradient computation to save memory
-                outputs = self.model.generate(
-                    **inputs,
-                    max_new_tokens=256,  # Reduced for CPU efficiency
-                    do_sample=True,
-                    temperature=0.7,
-                    top_p=0.95,
-                    pad_token_id=self.tokenizer.pad_token_id,
-                    repetition_penalty=1.1
-                )
+            # Generate response using the pipeline
+            response = self.pipe(
+                messages,
+                max_new_tokens=256,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.95,
+                repetition_penalty=1.1
+            )[0]["generated_text"]

-            # Decode and clean up the response
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            response = response.split("Assistant:")[-1].strip()
+            # Clean up the response by extracting the last assistant message
+            response = response.split("assistant:")[-1].strip()

             logger.info("Response generated successfully")
             return response
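Two caveats in the new generate_response: earlier turns are appended after the current user message, so the model sees the conversation out of order, and when a pipeline is called with a role/content message list, generated_text typically comes back as a message list rather than a flat string, so split("assistant:") may return the input unchanged. A hedged sketch of a history-first ordering and structured extraction (names mirror the diff; the indexing assumes a recent transformers version with chat-template support in pipelines):

from typing import Dict, List

def build_reply(pipe, system_prompt: str, message: str,
                chat_history: List[Dict] = None) -> str:
    # History first, current turn last, so the model continues from it
    messages = [{"role": "system", "content": system_prompt}]
    for chat in chat_history or []:
        messages.append({
            "role": "user" if chat["role"] == "user" else "assistant",
            "content": chat["content"],
        })
    messages.append({"role": "user", "content": message})

    outputs = pipe(
        messages,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.1,
    )
    # Chat pipelines return the whole conversation; the final appended
    # message is the assistant's reply
    return outputs[0]["generated_text"][-1]["content"].strip()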
@@ -260,7 +255,7 @@ class MedicalAssistant:
             logger.error(traceback.format_exc())
             return f"I apologize, but I encountered an error: {str(e)}"

-# The rest of your code remains the same
+# Initialize the assistant
 assistant = None

 def initialize_assistant():
@@ -295,10 +290,11 @@ def chat_response(message: str, history: List[Dict]):
 # Create the Gradio interface
 demo = gr.ChatInterface(
     fn=chat_response,
-    title="Medical Assistant (CPU Version)",
-    description="""This medical assistant provides guidance and information
-                   about health-related queries. Please note that response
-                   generation may take longer as this is running in CPU mode.""",
+    title="Medical Assistant (Llama3-Med42)",
+    description="""This medical assistant is powered by Llama3-Med42,
+                   a model specifically trained on medical knowledge. It provides
+                   guidance and information about health-related queries while
+                   maintaining professional medical standards.""",
     examples=[
         "What are the symptoms of malaria?",
         "How can I prevent type 2 diabetes?",
 