"""Streamlit app that generates a caption for an uploaded image.

The app loads a ViT-GPT2 vision-encoder-decoder model, lets the user upload
a JPG/PNG image, and displays the caption produced by the model.
"""

import streamlit as st
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2Tokenizer
import torch
from PIL import Image

# Hugging Face Hub identifiers for the captioning model and its tokenizer.
MODEL_ID = "ashok2216/vit-gpt2-image-captioning_COCO_FineTuned"
TOKENIZER_ID = "gpt2"


@st.cache_resource
def load_captioning_components():
    """Load the captioning model, image processor and tokenizer once.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects as a shared resource avoids reloading the model
    weights on each rerun.

    Returns:
        A ``(model, processor, tokenizer)`` tuple.
    """
    loaded_model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
    loaded_processor = ViTImageProcessor.from_pretrained(MODEL_ID)
    loaded_tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_ID)
    return loaded_model, loaded_processor, loaded_tokenizer


# Load the model and tokenizer (cached across reruns).
model, processor, tokenizer = load_captioning_components()

# Streamlit app title
st.title("Image Captioning with ViT-GPT2 Model")
st.write("Upload an image, and the model will generate a descriptive caption.")

# File uploader for image input
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # Load and display the uploaded image
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Uploaded PNGs may be RGBA, palettized or grayscale; normalize to
    # 3-channel RGB so the processor always receives the layout it expects.
    rgb_image = image.convert("RGB")

    # Preprocess the image for the model
    inputs = processor(images=rgb_image, return_tensors="pt")
    pixel_values = inputs.pixel_values

    # Generate the caption
    with st.spinner("Generating caption..."):
        output = model.generate(pixel_values)
        caption = tokenizer.decode(output[0], skip_special_tokens=True)

    # Display the generated caption
    st.success("Generated Caption:")
    st.write(caption)