Spaces:
Sleeping
Sleeping
import tempfile | |
import gradio as gr | |
from gtts import gTTS | |
import inference_script | |
import vit_gpt2 | |
import os | |
import warnings | |
warnings.filterwarnings('ignore') | |
def process_image_and_generate_output(image, model_selection):
    """Caption an image with the chosen model and speak the caption.

    Args:
        image: The image received from the UI, or ``None`` when nothing was
            provided. It is forwarded unchanged to the selected captioning
            backend.
        model_selection: Name of the captioning backend, either
            ``'Basic Model'`` or ``'ViT-GPT2'``.

    Returns:
        A ``(text, audio)`` tuple. ``text`` is the predicted caption, or a
        short message when the image is missing or the model name is not
        recognised. ``audio`` is the raw MP3 bytes of the spoken caption, or
        ``None`` when there is nothing to speak.
    """
    if image is None:
        return "Please select an image", None

    if model_selection == 'Basic Model':
        # Author's note: trained for only 15 epochs without hyperparameter
        # tuning, utilizing Inception v3.
        result = inference_script.evaluate(image)
        # Drop the last token of the joined prediction (presumably an
        # end-of-sequence marker -- confirm against inference_script).
        pred_caption = ' '.join(result).rsplit(' ', 1)[0]
        # Remove unknown-word placeholders, then collapse the doubled or
        # leading spaces that their removal leaves behind.
        pred_caption = ' '.join(pred_caption.replace('<unk>', '').split())
    elif model_selection == 'ViT-GPT2':
        # Author's note: SOTA model for image captioning.
        result = vit_gpt2.predict_step(image)
        pred_caption = result[0]
    else:
        return "Invalid model selection", None

    # gTTS rejects empty text, so return the caption without audio instead
    # of failing the whole request.
    if not pred_caption.strip():
        return pred_caption, None

    # Generate speech from the caption.
    tts = gTTS(text=pred_caption, lang='en', slow=False)

    # Reserve a uniquely named path and close our handle before gTTS writes
    # to that path by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
        audio_file_path = temp_audio.name

    try:
        tts.save(audio_file_path)
        with open(audio_file_path, "rb") as f:
            audio_content = f.read()
    finally:
        # Always remove the temporary file, even if synthesis or the
        # read-back fails, so failed requests do not leak files.
        os.unlink(audio_file_path)

    return pred_caption, audio_content
# Example rows for the UI: every bundled sample image is paired with each
# model, ViT-GPT2 first, so the two captions can be compared side by side.
sample_images = [
    [
        os.path.join(os.path.dirname(__file__), f"sample_images/{image_number}.jpg"),
        model_name,
    ]
    for image_number in (1, 2, 3)
    for model_name in ("ViT-GPT2", "Basic Model")
]
# Image component through which the user supplies the picture to caption.
image_input = gr.Image(label="Upload Image")

# Radio buttons for choosing which captioning model handles the image.
model_selection_input = gr.Radio(
    choices=["Basic Model", "ViT-GPT2"],
    label="Choose Model",
)

# Wire the inputs to the captioning function; the outputs are the caption
# text and its spoken audio. The example rows come from sample_images.
iface = gr.Interface(
    fn=process_image_and_generate_output,
    inputs=[image_input, model_selection_input],
    outputs=["text", "audio"],
    examples=sample_images,
    cache_examples=True,
    allow_flagging='never',
    title="Eye For Blind | Image Captioning & TTS",
    description="To be added",
)

# Start the web app as soon as this module is executed.
iface.launch()