# Spaces: Sleeping
import io

import streamlit as st
import torch
from gtts import gTTS
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, GPT2TokenizerFast
# Hugging Face checkpoint that provides the model, tokenizer and image processor.
_CHECKPOINT = 'nlpconnect/vit-gpt2-image-captioning'


@st.cache_resource
def _load_captioning_components(checkpoint, target_device):
    """Load the captioning model, tokenizer and image processor once.

    Streamlit re-executes the whole script on every widget interaction;
    ``st.cache_resource`` keeps the loaded objects alive between reruns so
    the checkpoint is not re-read (or re-downloaded) each time.

    Args:
        checkpoint: Hub id or local path passed to ``from_pretrained``.
        target_device: Torch device string the model is moved to.

    Returns:
        A ``(model, tokenizer, image_processor)`` tuple.
    """
    loaded_model = VisionEncoderDecoderModel.from_pretrained(checkpoint).to(target_device)
    loaded_tokenizer = GPT2TokenizerFast.from_pretrained(checkpoint)
    loaded_processor = ViTImageProcessor.from_pretrained(checkpoint)
    return loaded_model, loaded_tokenizer, loaded_processor


# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, tokenizer, image_processor = _load_captioning_components(_CHECKPOINT, device)
def get_caption(model, image_processor, tokenizer, image_path):
    """Generate a caption for a single image.

    Args:
        model: Captioning model; called as ``model.generate(**inputs)``.
        image_processor: Callable that turns a PIL image into model inputs
            (invoked with ``return_tensors='pt'``).
        tokenizer: Tokenizer whose ``batch_decode`` converts the generated
            ids back into text.
        image_path: Anything ``PIL.Image.open`` accepts: a filesystem path
            or a binary file-like object (e.g. a Streamlit upload).

    Returns:
        The decoded caption of the first generated sequence.

    Note:
        Inputs are moved to the module-level ``device`` before generation.
    """
    # Uploads may be RGBA/palette PNGs or grayscale JPEGs. Converting to RGB
    # gives the processor a consistent 3-channel image; ``convert`` returns
    # a loaded copy, so the source handle can be released immediately.
    with Image.open(image_path) as image:
        rgb_image = image.convert('RGB')

    # Preprocess the image into tensors on the target device.
    inputs = image_processor(rgb_image, return_tensors='pt').to(device)

    # Generate caption token ids.
    output_ids = model.generate(**inputs)

    # Decode the ids; only one image was passed, so take the first result.
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
# --- Streamlit page: upload an image, show its caption, and read it aloud ---
st.title('Vision Transformers (ViT) in Image Captioning Using Pretrained ViT Models')

uploaded_image = st.file_uploader('Upload an Image', type=['png', 'jpg', 'jpeg'])

if uploaded_image is not None:
    st.image(uploaded_image)
    caption = get_caption(model, image_processor, tokenizer, uploaded_image)
    st.header(caption)

    # Skip speech synthesis for a blank caption: gTTS refuses empty text.
    if caption.strip():
        # Synthesize into memory instead of a fixed 'caption.mp3' on disk, so
        # concurrent sessions cannot overwrite each other's audio and no
        # writable working directory is required.
        audio_buffer = io.BytesIO()
        gTTS(caption, lang='en', slow=True).write_to_fp(audio_buffer)
        st.audio(audio_buffer.getvalue(), format='audio/mpeg', autoplay=True)
else:
    st.error('No Image Uploaded !')