# import os
# import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch
# from typing import List, Dict
# import logging
#
# # Set up logging to help us debug model loading and inference
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)
#
# class MedicalAssistant:
#     def __init__(self):
#         """Initialize the medical assistant with model and tokenizer"""
#         try:
#             logger.info("Starting model initialization...")
#
#             # Model configuration - adjust these based on your available compute
#             self.model_name = "mradermacher/Llama3-Med42-8B-GGUF"
#             self.max_length = 1048
#             self.device = "cuda" if torch.cuda.is_available() else "cpu"
#             logger.info(f"Using device: {self.device}")
#
#             # Load tokenizer first - this is typically faster and can catch issues early
#             logger.info("Loading tokenizer...")
#             self.tokenizer = AutoTokenizer.from_pretrained(
#                 self.model_name,
#                 padding_side="left",
#                 trust_remote_code=True
#             )
#
#             # Set padding token if not set
#             if self.tokenizer.pad_token is None:
#                 self.tokenizer.pad_token = self.tokenizer.eos_token
#
#             # Load model with memory optimizations
#             logger.info("Loading model...")
#             self.model = AutoModelForCausalLM.from_pretrained(
#                 self.model_name,
#                 torch_dtype=torch.float16,
#                 device_map="auto",
#                 load_in_8bit=True,
#                 trust_remote_code=True
#             )
#
#             logger.info("Model initialization completed successfully!")
#         except Exception as e:
#             logger.error(f"Error during initialization: {str(e)}")
#             raise
#
#     def generate_response(self, message: str, chat_history: List[Dict] = None) -> str:
#         """Generate a response to the user's message"""
#         try:
#             # Prepare the prompt
#             system_prompt = """You are a medical AI assistant. Respond to medical queries
#             professionally and accurately. If you're unsure, always recommend consulting
#             with a healthcare provider."""
#
#             # Combine system prompt, chat history, and current message
#             full_prompt = f"{system_prompt}\n\nUser: {message}\nAssistant:"
#
#             # Tokenize input
#             inputs = self.tokenizer(
#                 full_prompt,
#                 return_tensors="pt",
#                 padding=True,
#                 truncation=True,
#                 max_length=self.max_length
#             ).to(self.device)
#
#             # Generate response
#             with torch.no_grad():
#                 outputs = self.model.generate(
#                     **inputs,
#                     max_new_tokens=512,
#                     do_sample=True,
#                     temperature=0.7,
#                     top_p=0.95,
#                     pad_token_id=self.tokenizer.pad_token_id,
#                     repetition_penalty=1.1
#                 )
#
#             # Decode and clean up response
#             response = self.tokenizer.decode(
#                 outputs[0],
#                 skip_special_tokens=True
#             )
#
#             # Extract just the assistant's response
#             response = response.split("Assistant:")[-1].strip()
#             return response
#         except Exception as e:
#             logger.error(f"Error during response generation: {str(e)}")
#             return f"I apologize, but I encountered an error. Please try again."
#
# # Initialize the assistant
# assistant = None
#
# def initialize_assistant():
#     """Initialize the assistant and handle any errors"""
#     global assistant
#     try:
#         assistant = MedicalAssistant()
#         return True
#     except Exception as e:
#         logger.error(f"Failed to initialize assistant: {str(e)}")
#         return False
#
# def chat_response(message: str, history: List[Dict]):
#     """Handle chat messages and return responses"""
#     global assistant
#
#     # Check if assistant is initialized
#     if assistant is None:
#         if not initialize_assistant():
#             return "I apologize, but I'm currently unavailable. Please try again later."
#
#     try:
#         return assistant.generate_response(message, history)
#     except Exception as e:
#         logger.error(f"Error in chat response: {str(e)}")
#         return "I encountered an error. Please try again."
#
# # Create Gradio interface
# demo = gr.ChatInterface(
#     fn=chat_response,
#     title="Medical Assistant (Test Version)",
#     description="""This is a test version of the medical assistant.
#     Please use it to verify basic functionality.""",
#     examples=[
#         "What are the symptoms of malaria?",
#         "How can I prevent type 2 diabetes?",
#         "What should I do for a mild headache?"
#     ],
#     # retry_btn=None,
#     # undo_btn=None,
#     # clear_btn="Clear"
# )
#
# # Launch the interface
# if __name__ == "__main__":
#     demo.launch()
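
# --- The block above is an earlier GPU/8-bit draft, kept commented out for reference.
# --- The active CPU-only version of the app follows below.
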
import os
import gradio as gr
from transformers import AutoTokenizer, pipeline
import torch
from typing import List, Dict
import logging
import traceback

# Set up logging to help us understand what's happening in our application
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class MedicalAssistant:
    def __init__(self):
        """
        Initialize a basic medical assistant for CPU-only environments.
        This version relies on standard model loading, with no extra quantization
        flags, for maximum compatibility.
        """
        try:
            logger.info("Starting basic model initialization...")

            # Define our model configuration
            self.model_name = "emircanerol/Llama3-Med42-8B-4bit"
            self.max_length = 2048

            # First load the tokenizer since it's lighter on memory
            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                token=os.getenv('HUGGING_FACE_TOKEN')
            )

            # Handle padding token setup: Llama-family tokenizers usually ship without
            # a dedicated pad token, so reuse the end-of-sequence token for padding
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Initialize pipeline with basic CPU settings
            logger.info("Initializing CPU-based pipeline...")
            self.pipe = pipeline(
                "text-generation",
                model=self.model_name,
                token=os.getenv('HUGGING_FACE_TOKEN'),
                device_map="cpu",           # Explicitly use CPU
                torch_dtype=torch.float32,  # Use standard precision
                use_safetensors=True        # Enable safetensors for better memory handling
                # Removed all quantization settings
            )
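
            # Note on the pipeline above (assumption, not from the original code): the
            # first call downloads the model weights from the Hugging Face Hub, and CPU
            # generation for an 8B-parameter model can be slow per reply; the conservative
            # generation settings further down are meant to keep latency manageable.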
logger.info("Medical Assistant initialized successfully in basic CPU mode!") | |
except Exception as e: | |
logger.error(f"Initialization failed: {str(e)}") | |
logger.error(traceback.format_exc()) | |
raise | |

    def generate_response(self, message: str, chat_history: List[Dict] = None) -> str:
        """
        Generate a response using basic CPU-friendly settings.
        This method favours stability over speed, using conservative parameters.
        """
        try:
            logger.info("Preparing message for generation")

            # Create our medical context prompt
            system_prompt = """You are a medical AI assistant trained on medical knowledge.
            Provide accurate, professional medical guidance while acknowledging limitations.
            Always recommend consulting healthcare providers for specific medical advice."""

            # Format the conversation for the model: system prompt first, then any
            # recent history, then the current user message last
            messages = [{"role": "system", "content": system_prompt}]

            # Add recent chat history if available
            if chat_history:
                # Only keep recent history to manage memory
                recent_history = chat_history[-2:]  # Keep the last 2 turns
                for chat in recent_history:
                    messages.append({
                        "role": "user" if chat["role"] == "user" else "assistant",
                        "content": chat["content"]
                    })

            # The current message always goes last
            messages.append({"role": "user", "content": message})

            logger.info("Generating response with basic settings")

            # Generate with conservative parameters
            outputs = self.pipe(
                messages,
                max_new_tokens=100,  # Conservative token limit
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                num_beams=1,         # Single beam for simplicity
                pad_token_id=self.tokenizer.pad_token_id
            )

            # With chat-style input, the pipeline returns the full message list;
            # the newly generated assistant turn is the last entry
            response = outputs[0]["generated_text"][-1]["content"].strip()

            logger.info("Response generated successfully")
            return response

        except Exception as e:
            logger.error(f"Error during response generation: {str(e)}")
            logger.error(traceback.format_exc())
            return f"I apologize, but I encountered an error: {str(e)}"

# Initialize our assistant
assistant = None

def initialize_assistant():
    """Initialize the assistant with careful error handling"""
    global assistant
    try:
        logger.info("Attempting to initialize basic CPU assistant")
        assistant = MedicalAssistant()
        logger.info("Assistant initialized successfully")
        return True
    except Exception as e:
        logger.error(f"Failed to initialize assistant: {str(e)}")
        logger.error(traceback.format_exc())
        return False
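
# Optional smoke test (not part of the original app): uncomment to exercise the
# assistant from the command line before wiring up the UI.
# if initialize_assistant():
#     print(assistant.generate_response("What are the symptoms of malaria?"))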

def chat_response(message: str, history: List[Dict]):
    """Handle chat interactions with proper error recovery"""
    global assistant

    if assistant is None:
        logger.info("Assistant not initialized, attempting initialization")
        if not initialize_assistant():
            return "I apologize, but I'm currently unavailable. Please try again later."

    try:
        return assistant.generate_response(message, history)
    except Exception as e:
        logger.error(f"Error in chat response: {str(e)}")
        logger.error(traceback.format_exc())
        return f"I encountered an error: {str(e)}"

# Create our Gradio interface
demo = gr.ChatInterface(
    fn=chat_response,
    type="messages",  # Pass history as role/content dicts, matching generate_response
    title="Medical Assistant (Basic CPU Version)",
    description="""This medical assistant provides general medical guidance using a basic CPU configuration.
    Responses may take longer, but the setup favours stability and reliability.""",
    examples=[
        "What are the symptoms of malaria?",
        "How can I prevent type 2 diabetes?",
        "What should I do for a mild headache?"
    ]
)

# Launch our interface
if __name__ == "__main__":
    logger.info("Starting the basic CPU application")
    demo.launch()
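
# To try this locally (assumption: you have a Hugging Face token with access to the
# model; the variable name matches the os.getenv() calls above):
#   export HUGGING_FACE_TOKEN=hf_xxxxxxxxxxxxxxxx
#   python app.py   # or whatever this file is named in your Space
# Then open the local Gradio URL printed to the console.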