# Raj086's picture
# Upload 2 files
# fe52fb1 verified
import io

import streamlit as st
import torch
from gtts import gTTS
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2TokenizerFast
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = VisionEncoderDecoderModel.from_pretrained('nlpconnect/vit-gpt2-image-captioning').to(device)
tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
def get_caption(model, image_processor, tokenizer, image_path):
    """Generate a text caption for an image.

    Args:
        model: Vision encoder-decoder model used to generate caption tokens.
        image_processor: Processor that turns a PIL image into model inputs.
        tokenizer: Tokenizer used to decode the generated token ids.
        image_path: Path or file-like object accepted by ``PIL.Image.open``.

    Returns:
        The decoded caption for the image as a string.
    """
    # Normalise to 3-channel RGB: PNG uploads may be RGBA, palette or
    # grayscale, which do not match the 3-channel input the processor expects.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    # Preprocess the image and move the tensors to wherever the model lives,
    # rather than relying on a module-level device variable.
    inputs = image_processor(image, return_tensors='pt').to(model.device)

    # Generate the caption token ids.
    output = model.generate(**inputs)

    # Decode the first (and only) sequence in the batch.
    caption = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    return caption
# Streamlit page: upload an image, show it, caption it, and read the caption aloud.
st.title('Vision Transformers (ViT) in Image Captioning Using Pretrained ViT Models')
uploaded_image = st.file_uploader('Upload an Image', type=['png', 'jpg', 'jpeg'])

if uploaded_image is not None:
    st.image(uploaded_image)
    caption = get_caption(model, image_processor, tokenizer, uploaded_image)
    st.header(caption)

    # Skip speech synthesis for an empty caption; gTTS rejects empty text.
    if caption.strip():
        # Synthesize into memory instead of a fixed 'caption.mp3' on disk, so
        # concurrent sessions cannot overwrite each other's audio file.
        audio_buffer = io.BytesIO()
        gTTS(caption, lang='en', slow=True).write_to_fp(audio_buffer)
        st.audio(audio_buffer.getvalue(), format='audio/mp3', autoplay=True)
else:
    st.error('No Image Uploaded !')