import os
import tempfile
import warnings

import gradio as gr
from gtts import gTTS

import inference_script
import vit_gpt2

warnings.filterwarnings('ignore')

# Labels of the selectable models. The value passed as ``model_selection`` is
# compared against these strings verbatim, so they must stay identical to the
# choices offered by the model selector.
_BASIC_MODEL_LABEL = ('Basic Model (Trained only for 15 epochs without any hyperparameter tuning, utilizing '
                      'inception v3)')
_VIT_GPT2_LABEL = 'ViT-GPT2 (SOTA model for Image captioning)'


def _synthesize_speech(text):
    """Return the bytes of an MP3 file in which gTTS speaks *text* in English."""
    # Only reserve a unique path here and close the handle right away: a named
    # temporary file that is still open cannot be reopened by name on every
    # platform, and gTTS writes to the path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
        audio_file_path = temp_audio.name
    try:
        gTTS(text=text, lang='en', slow=False).save(audio_file_path)
        with open(audio_file_path, "rb") as f:
            return f.read()
    finally:
        # Remove the temporary file even when synthesis or reading fails.
        os.unlink(audio_file_path)


def process_image_and_generate_output(image, model_selection):
    """Caption *image* with the selected model and read the caption aloud.

    Args:
        image: The image to caption; it is handed unchanged to the selected
            model. ``None`` means no image was provided.
        model_selection: Label of the model to use; must equal one of the two
            model labels defined above.

    Returns:
        A ``(text, audio)`` pair. On success ``text`` is the predicted caption
        and ``audio`` is the MP3 content as ``bytes``. When the image is
        missing, the model label is unknown, or the caption is empty, ``text``
        is an explanatory message and ``audio`` is ``None``.
    """
    if image is None:
        return "Please select an image", None

    if model_selection == _BASIC_MODEL_LABEL:
        result = inference_script.evaluate(image)
        # Join the predicted tokens and drop the last one.
        # NOTE(review): the last token is presumably an end-of-sequence
        # marker -- confirm against inference_script.evaluate.
        pred_caption = ' '.join(result).rsplit(' ', 1)[0]
        # NOTE(review): the original followed this with
        # ``pred_caption.replace('', '')``, which is a no-op. A special token
        # was probably lost from that literal; restore it here if the basic
        # model emits tokens that must be stripped from the caption.
    elif model_selection == _VIT_GPT2_LABEL:
        result = vit_gpt2.predict_step(image)
        pred_caption = result[0]
    else:
        return "Invalid model selection", None

    # gTTS refuses empty text, so report the problem instead of crashing.
    if not pred_caption.strip():
        return "No caption could be generated", None

    # Generate speech from the caption
    audio_content = _synthesize_speech(pred_caption)
    return pred_caption, audio_content


# Sample images shown as clickable examples: one single-element row per image,
# covering sample_images/1.jpg through sample_images/6.jpg next to this file.
_HERE = os.path.dirname(__file__)
sample_images = [
    [os.path.join(_HERE, f"sample_images/{index}.jpg")]
    for index in range(1, 7)
]

# Image upload component
image_input = gr.Image(label="Upload Image")

# Radio group for choosing the captioning model
model_selection_input = gr.Radio(
    choices=[
        "Basic Model (Trained only for 15 epochs without any hyperparameter tuning, "
        "utilizing inception v3)",
        "ViT-GPT2 (SOTA model for Image captioning)",
    ],
    label="Choose Model",
)

# Wire the captioning function to its inputs (image + model choice) and its
# outputs (caption text + spoken audio), with the sample images as examples.
iface = gr.Interface(
    title="Eye For Blind | Image Captioning & TTS",
    description="To be added",
    fn=process_image_and_generate_output,
    inputs=[image_input, model_selection_input],
    outputs=["text", "audio"],
    examples=sample_images,
    allow_flagging='never',
)

# Start the web UI as soon as the module is executed.
iface.launch()