import torch
import gradio as gr
from transformers import Blip2ForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# Hub id of the fine-tuned PEFT adapter. Its stored config is fetched and the
# base-model path is then overridden to point at this same repository.
# NOTE(review): the processor below comes from "Salesforce/blip2-opt-2.7b" while
# the base weights are resolved from the adapter repo — confirm that repo can
# really be loaded as a full BLIP-2 checkpoint.
peft_model_id = "Prasi21/blip2-opt-2.7b-strep-throat-caption-adapters3"
config = PeftConfig.from_pretrained(peft_model_id)
config.base_model_name_or_path = peft_model_id

# 8-bit weight loading (bitsandbytes) to reduce the memory needed for the model.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the quantized base model (device placement left to device_map="auto")
# and wrap it with the fine-tuned PEFT adapter in one step.
model = PeftModel.from_pretrained(
    Blip2ForConditionalGeneration.from_pretrained(
        config.base_model_name_or_path,
        device_map="auto",
        quantization_config=quantization_config,
    ),
    peft_model_id,
)

# Processor used to turn images into model inputs and token ids back into text.
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Define the prediction function
def predict(image):
    """Generate a caption for a throat image with the fine-tuned BLIP-2 model.

    Args:
        image: The PIL image delivered by the Gradio image input, or ``None``
            when the form is submitted without an upload.

    Returns:
        The decoded caption (special tokens stripped) for the image, or a
        short hint asking for an upload when ``image`` is ``None``.
    """
    # Gradio hands over None when nothing was uploaded; answer with a hint
    # instead of letting the processor fail on a missing image.
    if image is None:
        return "Please upload an image."

    # Preprocess the image and move the tensors to the device the model lives
    # on. No global `device` exists in this module, so ask the model itself.
    inputs = processor(images=image, return_tensors="pt").to(
        model.device, torch.float16
    )

    # Token id passed to generate() as the end-of-sequence marker.
    # NOTE(review): presumably the id of a sentence-ending token in the
    # tokenizer's vocabulary — confirm against the processor's tokenizer.
    stop_token_id = 13

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_length=100,
            eos_token_id=stop_token_id,
        )

    captions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    # One image in, so the first (only) decoded sequence is the caption.
    return captions[0]

# Build the web UI: one uploaded image (handed to predict as a PIL image) goes
# in, and the generated caption comes back in a text box.
demo = gr.Interface(
    predict,
    gr.Image(type="pil"),
    gr.Textbox(),
    title="Strep Throat Image Assessment",
    description=(
        "Upload an image of a throat and receive a medical assessment caption based on the model's output."
    ),
)

# Start serving the interface.
demo.launch()