import streamlit as st
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch


@st.cache_resource
def load_model():
    """Load the BLIP-2 base model once, attach the fine-tuned adapter,
    and move it to GPU if one is available (fp16 on GPU, fp32 on CPU)."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.float16 if device.type == "cuda" else torch.float32
    model = Blip2ForConditionalGeneration.from_pretrained(
        "ybelkada/blip2-opt-2.7b-fp16-sharded", torch_dtype=dtype
    )
    model.load_adapter("blip-cpu-model")  # local fine-tuned adapter weights
    processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
    model.to(device)
    return model, processor, device


model, processor, device = load_model()

st.title("Image Captioning with Fine-Tuned BLIP-2 Model")

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Alternative: delegate captioning to a separate API service
    # (requires `import requests`):
    # files = {"file": uploaded_file.getvalue()}
    # response = requests.post("http://0.0.0.0:8502/generate-caption/", files=files)
    # caption = response.json().get("caption")

    # Run the model in-process; cast inputs to the model's device and dtype.
    inputs = processor(images=image, return_tensors="pt").to(device, model.dtype)
    with torch.no_grad():
        caption_ids = model.generate(**inputs, max_length=128)
    caption = processor.decode(caption_ids[0], skip_special_tokens=True)

    st.write("Generated Caption:")
    st.write(f"**{caption}**")
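
# Usage sketch, assuming this script is saved as app.py and the fine-tuned
# adapter directory `blip-cpu-model` sits alongside it:
#
#   streamlit run app.py
#
# Streamlit serves the UI at http://localhost:8501 by default.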