import streamlit as st
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch


@st.cache_resource
def load_model():
    """Load the BLIP-2 base model once, attach the fine-tuned adapter,
    and move it to GPU if one is available (fp16 on GPU, fp32 on CPU)."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.float16 if device.type == "cuda" else torch.float32
    model = Blip2ForConditionalGeneration.from_pretrained(
        "ybelkada/blip2-opt-2.7b-fp16-sharded", torch_dtype=dtype
    )
    model.load_adapter("blip-cpu-model")  # local fine-tuned adapter weights
    processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
    model.to(device)
    return model, processor, device


model, processor, device = load_model()

st.title("Image Captioning with Fine-Tuned BLIP-2 Model")

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Alternative: delegate captioning to a separate API service
    # (requires `import requests`):
    # files = {"file": uploaded_file.getvalue()}
    # response = requests.post("http://0.0.0.0:8502/generate-caption/", files=files)
    # caption = response.json().get("caption")

    # Run the model in-process; cast inputs to the model's device and dtype.
    inputs = processor(images=image, return_tensors="pt").to(device, model.dtype)
    with torch.no_grad():
        caption_ids = model.generate(**inputs, max_length=128)
    caption = processor.decode(caption_ids[0], skip_special_tokens=True)

    st.write("Generated Caption:")
    st.write(f"**{caption}**")
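
# Usage sketch, assuming this script is saved as app.py and the fine-tuned
# adapter directory `blip-cpu-model` sits alongside it:
#
#   streamlit run app.py
#
# Streamlit serves the UI at http://localhost:8501 by default.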