# Image-to-Text / app.py
# ashok2216's picture
# Create app.py
# bb8c731 verified
# raw
# history blame
# 1.32 kB
import streamlit as st
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2Tokenizer
import torch
from PIL import Image
@st.cache_resource
def _load_captioning_components():
    """Load the captioning model, image processor and tokenizer once.

    Streamlit re-executes this script from the top on every user
    interaction, so loading the weights directly at module level would
    repeat the expensive ``from_pretrained`` calls on each rerun.
    ``st.cache_resource`` keeps the returned objects alive across reruns
    and sessions.

    Returns:
        A ``(model, processor, tokenizer)`` tuple.
    """
    checkpoint = "ashok2216/vit-gpt2-image-captioning_COCO_FineTuned"
    captioning_model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
    image_processor = ViTImageProcessor.from_pretrained(checkpoint)
    # The tokenizer is taken from the base "gpt2" checkpoint, as in the
    # original script, not from the fine-tuned repository.
    caption_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    return captioning_model, image_processor, caption_tokenizer


# Module-level names are kept so the rest of the script is unchanged.
model, processor, tokenizer = _load_captioning_components()
# Page header: title and a short usage hint.
st.title("Image Captioning with ViT-GPT2 Model")
st.write("Upload an image, and the model will generate a descriptive caption.")

# File uploader for image input; returns None until the user picks a file.
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"])

if uploaded_file is not None:
    # Load the upload and force 3-channel RGB: PNGs may be RGBA, palette
    # or greyscale, which the ViT image processor cannot normalise.
    try:
        image = Image.open(uploaded_file).convert("RGB")
    except OSError:
        # PIL raises an OSError subclass for corrupt or unrecognised files.
        st.error("Could not read the uploaded file as an image.")
        st.stop()

    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Preprocess the image into the tensor the encoder expects.
    inputs = processor(images=image, return_tensors="pt")
    pixel_values = inputs.pixel_values

    # Generate token ids and decode the first sequence into text.
    with st.spinner("Generating caption..."):
        output = model.generate(pixel_values)
        caption = tokenizer.decode(output[0], skip_special_tokens=True)

    # Display the generated caption.
    st.success("Generated Caption:")
    st.write(caption)