# NOTE: removed Hugging Face Spaces page residue ("Spaces: / Sleeping") that was
# captured by the scrape — it was not part of the program.
import streamlit as st
import torch
from PIL import Image
from transformers import GPT2Tokenizer, ViTImageProcessor, VisionEncoderDecoderModel

# Fine-tuned ViT-GPT2 checkpoint used for both the encoder-decoder model and
# its image processor.
MODEL_ID = "ashok2216/vit-gpt2-image-captioning_COCO_FineTuned"


@st.cache_resource
def _load_captioning_assets():
    """Load and cache the captioning model, image processor, and tokenizer.

    Streamlit reruns the whole script on every widget interaction; without
    ``st.cache_resource`` the (large) model weights would be reloaded from the
    Hub on each rerun.

    Returns:
        tuple: (model, processor, tokenizer)
    """
    model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
    model.eval()  # inference only — disables dropout etc.
    processor = ViTImageProcessor.from_pretrained(MODEL_ID)
    # NOTE(review): tokenizer is the base "gpt2" one, as in the original code —
    # this assumes the fine-tune kept the stock GPT-2 vocabulary; confirm
    # against the model card.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    return model, processor, tokenizer


model, processor, tokenizer = _load_captioning_assets()

# Streamlit app title
st.title("Image Captioning with ViT-GPT2 Model")
st.write("Upload an image, and the model will generate a descriptive caption.")

# File uploader for image input
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # Load and display the uploaded image. Convert to RGB so that grayscale
    # or RGBA uploads don't break the ViT image processor, which expects
    # 3-channel input.
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Preprocess the image for the model
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # Generate the caption
    with st.spinner("Generating caption..."):
        with torch.no_grad():  # no gradients needed at inference time
            output = model.generate(pixel_values)
        caption = tokenizer.decode(output[0], skip_special_tokens=True)

    # Display the generated caption
    st.success("Generated Caption:")
    st.write(caption)