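# ---------------------------------------------------------------------------
# Earlier GPU/8-bit draft of this Space, kept commented out for reference.
# The active CPU-only implementation starts after it.
# ---------------------------------------------------------------------------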
# import os
# import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch
# from typing import List, Dict
# import logging
# # Set up logging to help us debug model loading and inference
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)
# class MedicalAssistant:
# def __init__(self):
# """Initialize the medical assistant with model and tokenizer"""
# try:
# logger.info("Starting model initialization...")
# # Model configuration - adjust these based on your available compute
# self.model_name = "mradermacher/Llama3-Med42-8B-GGUF"
# self.max_length = 1048
# self.device = "cuda" if torch.cuda.is_available() else "cpu"
# logger.info(f"Using device: {self.device}")
# # Load tokenizer first - this is typically faster and can catch issues early
# logger.info("Loading tokenizer...")
# self.tokenizer = AutoTokenizer.from_pretrained(
# self.model_name,
# padding_side="left",
# trust_remote_code=True
# )
# # Set padding token if not set
# if self.tokenizer.pad_token is None:
# self.tokenizer.pad_token = self.tokenizer.eos_token
# # Load model with memory optimizations
# logger.info("Loading model...")
# self.model = AutoModelForCausalLM.from_pretrained(
# self.model_name,
# torch_dtype=torch.float16,
# device_map="auto",
# load_in_8bit=True,
# trust_remote_code=True
# )
# logger.info("Model initialization completed successfully!")
# except Exception as e:
# logger.error(f"Error during initialization: {str(e)}")
# raise
# def generate_response(self, message: str, chat_history: List[Dict] = None) -> str:
# """Generate a response to the user's message"""
# try:
# # Prepare the prompt
# system_prompt = """You are a medical AI assistant. Respond to medical queries
# professionally and accurately. If you're unsure, always recommend consulting
# with a healthcare provider."""
# # Combine system prompt, chat history, and current message
# full_prompt = f"{system_prompt}\n\nUser: {message}\nAssistant:"
# # Tokenize input
# inputs = self.tokenizer(
# full_prompt,
# return_tensors="pt",
# padding=True,
# truncation=True,
# max_length=self.max_length
# ).to(self.device)
# # Generate response
# with torch.no_grad():
# outputs = self.model.generate(
# **inputs,
# max_new_tokens=512,
# do_sample=True,
# temperature=0.7,
# top_p=0.95,
# pad_token_id=self.tokenizer.pad_token_id,
# repetition_penalty=1.1
# )
# # Decode and clean up response
# response = self.tokenizer.decode(
# outputs[0],
# skip_special_tokens=True
# )
# # Extract just the assistant's response
# response = response.split("Assistant:")[-1].strip()
# return response
# except Exception as e:
# logger.error(f"Error during response generation: {str(e)}")
# return f"I apologize, but I encountered an error. Please try again."
# # Initialize the assistant
# assistant = None
# def initialize_assistant():
# """Initialize the assistant and handle any errors"""
# global assistant
# try:
# assistant = MedicalAssistant()
# return True
# except Exception as e:
# logger.error(f"Failed to initialize assistant: {str(e)}")
# return False
# def chat_response(message: str, history: List[Dict]):
# """Handle chat messages and return responses"""
# global assistant
# # Check if assistant is initialized
# if assistant is None:
# if not initialize_assistant():
# return "I apologize, but I'm currently unavailable. Please try again later."
# try:
# return assistant.generate_response(message, history)
# except Exception as e:
# logger.error(f"Error in chat response: {str(e)}")
# return "I encountered an error. Please try again."
# # Create Gradio interface
# demo = gr.ChatInterface(
# fn=chat_response,
# title="Medical Assistant (Test Version)",
# description="""This is a test version of the medical assistant.
# Please use it to verify basic functionality.""",
# examples=[
# "What are the symptoms of malaria?",
# "How can I prevent type 2 diabetes?",
# "What should I do for a mild headache?"
# ],
# # retry_btn=None,
# # undo_btn=None,
# # clear_btn="Clear"
# )
# # Launch the interface
# if __name__ == "__main__":
# demo.launch()
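# ---------------------------------------------------------------------------
# Active implementation: CPU-only configuration of the medical assistant.
# ---------------------------------------------------------------------------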
import os
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from typing import List, Dict
import logging
import traceback
# Set up logging to help us track what's happening
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class MedicalAssistant:
def __init__(self):
"""
Initialize the medical assistant with CPU-friendly settings.
We use a base model instead of a quantized version to ensure CPU compatibility.
"""
try:
logger.info("Starting model initialization...")
            # Use a standard (unquantized) checkpoint instead of the earlier quantized GGUF build;
            # it is larger but more compatible with CPU-only environments
self.model_name = "meta-llama/Llama-2-7b-chat-hf"
self.max_length = 2048
# First load the tokenizer as it's lighter on memory
logger.info("Loading tokenizer...")
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name,
token=os.getenv('HUGGING_FACE_TOKEN'), # Add your token in Space settings
trust_remote_code=True
)
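            # Assumption: HUGGING_FACE_TOKEN is set as a secret in the Space
            # settings (or exported locally, e.g. `export HUGGING_FACE_TOKEN=hf_...`)
            # so the gated Llama-2 checkpoint can be downloaded.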
# Handle padding token
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
logger.info("Tokenizer loaded successfully")
# Load model with CPU-friendly settings
logger.info("Loading model - this may take a few minutes...")
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
token=os.getenv('HUGGING_FACE_TOKEN'),
device_map="auto", # This will default to CPU if no GPU is available
torch_dtype=torch.float32, # Standard precision for CPU
low_cpu_mem_usage=True, # Optimize memory usage
offload_folder="offload" # Enable disk offloading for memory management
)
            # The model is already on the CPU; just release any stray GPU memory
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
logger.info("Model loaded successfully!")
except Exception as e:
logger.error(f"Initialization failed: {str(e)}")
logger.error(traceback.format_exc())
raise
def generate_response(self, message: str, chat_history: List[Dict] = None) -> str:
"""
Generate a response directly using the model instead of a pipeline.
This gives us more control over the generation process.
"""
try:
logger.info("Preparing message for generation")
# Create a medical context-aware prompt
system_prompt = """You are a medical AI assistant. Provide accurate,
professional medical guidance. Always recommend consulting healthcare
providers for specific medical advice."""
            # Format the current turn (chat_history is accepted but not yet folded into the prompt)
prompt = f"{system_prompt}\n\nUser: {message}\nAssistant:"
# Tokenize the input
inputs = self.tokenizer(
prompt,
return_tensors="pt",
padding=True,
truncation=True,
max_length=self.max_length
).to('cpu') # Ensure inputs are on CPU
logger.info("Generating response")
# Generate with conservative settings for CPU
with torch.no_grad(): # Disable gradient computation to save memory
outputs = self.model.generate(
**inputs,
max_new_tokens=256, # Reduced for CPU efficiency
do_sample=True,
temperature=0.7,
top_p=0.95,
pad_token_id=self.tokenizer.pad_token_id,
repetition_penalty=1.1
)
# Decode and clean up the response
response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
response = response.split("Assistant:")[-1].strip()
logger.info("Response generated successfully")
return response
except Exception as e:
logger.error(f"Error during response generation: {str(e)}")
logger.error(traceback.format_exc())
return f"I apologize, but I encountered an error: {str(e)}"
# Global assistant instance, created lazily on the first chat request
assistant = None
def initialize_assistant():
"""Initialize the assistant with proper error handling"""
global assistant
try:
logger.info("Attempting to initialize assistant")
assistant = MedicalAssistant()
logger.info("Assistant initialized successfully")
return True
except Exception as e:
logger.error(f"Failed to initialize assistant: {str(e)}")
logger.error(traceback.format_exc())
return False
def chat_response(message: str, history: List[Dict]):
"""Handle chat interactions with error recovery"""
global assistant
if assistant is None:
logger.info("Assistant not initialized, attempting initialization")
if not initialize_assistant():
return "I apologize, but I'm currently unavailable. Please try again later."
try:
return assistant.generate_response(message, history)
except Exception as e:
logger.error(f"Error in chat response: {str(e)}")
logger.error(traceback.format_exc())
return f"I encountered an error: {str(e)}"
# Create the Gradio interface
demo = gr.ChatInterface(
fn=chat_response,
title="Medical Assistant (CPU Version)",
description="""This medical assistant provides guidance and information
about health-related queries. Please note that response
generation may take longer as this is running in CPU mode.""",
examples=[
"What are the symptoms of malaria?",
"How can I prevent type 2 diabetes?",
"What should I do for a mild headache?"
]
)
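# On CPU, a single response can take a while to generate. If requests start
# timing out, one option (a sketch, assuming a Gradio version that supports
# request queuing) is to enable the queue before launching:
#
#     demo.queue().launch()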
# Launch the interface
if __name__ == "__main__":
logger.info("Starting the application")
demo.launch()