import tempfile
import gradio as gr
from gtts import gTTS
import inference_script
import vit_gpt2
import os
import warnings
warnings.filterwarnings('ignore')
# Define problem statement
problem_statement = """
### Problem Statement
This project aims to develop a deep learning model that verbally describes image content for the visually impaired, using caption generation with an attention mechanism trained on the Flickr8K dataset. Inspired by the "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention" paper, the model uses a CNN-RNN architecture to extract image features and generate captions, facilitating accessibility. The Kaggle dataset comprises 8,000 images, each paired with five descriptive captions, enabling a comprehensive understanding of image content.
"""
# Define solution overview
solution_overview = """
### Solution Overview
The basic model, **trained for only 20 epochs without extensive hyperparameter tuning,** primarily explores integrating an attention mechanism into an Encoder-Decoder architecture built with model subclassing. To improve inference quality, the ViT-GPT2 architecture is also integrated. [Visit the Kaggle notebook](https://www.kaggle.com/code/krishna2308/eye-for-blind) for implementation details.
"""
# Define real-life scenario application
real_life_scenario = """
### Real-life Scenario Application
While the current implementation may not support real-time processing, the potential for future development is vast. Imagine a visually impaired individual wearing smart glasses equipped with a camera. As they move around, the camera captures live footage of their surroundings, which is processed in real time by the image captioning model integrated into the glasses. The generated spoken descriptions can be streamed directly to the user's earpiece, providing instant audio feedback about their environment.
"""
def process_image_and_generate_output(image, model_selection):
    if image is None:
        return "Please select an image", None

    if model_selection == "Basic Model (Results won't be good)":
        # Attention-based encoder-decoder: join the predicted tokens, drop the
        # trailing <end> token, and strip any <unk> tokens
        result = inference_script.evaluate(image)
        pred_caption = ' '.join(result).rsplit(' ', 1)[0]
        pred_caption = pred_caption.replace('<unk>', '')
    elif model_selection == 'ViT-GPT2':
        result = vit_gpt2.predict_step(image)
        pred_caption = result[0]
    else:
        return "Invalid model selection", None

    # Generate speech from the caption
    tts = gTTS(text=pred_caption, lang='en', slow=False)
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
        audio_file_path = temp_audio.name
        tts.save(audio_file_path)

    # Read the audio file
    with open(audio_file_path, "rb") as f:
        audio_content = f.read()

    # Clean up the temporary audio file
    os.unlink(audio_file_path)

    return pred_caption, audio_content
sample_images = [
    [os.path.join(os.path.dirname(__file__), "sample_images/1.jpg"), "ViT-GPT2"],
    [os.path.join(os.path.dirname(__file__), "sample_images/1.jpg"), "Basic Model (Results won't be good)"],
    [os.path.join(os.path.dirname(__file__), "sample_images/3.jpg"), "ViT-GPT2"],
    [os.path.join(os.path.dirname(__file__), "sample_images/3.jpg"), "Basic Model (Results won't be good)"]
]
# Image upload input
image_input = gr.Image(label="Upload Image")
# Radio buttons to choose the captioning model
model_selection_input = gr.Radio(
    ["Basic Model (Results won't be good)", "ViT-GPT2"],
    label="Choose Model")
iface = gr.Interface(fn=process_image_and_generate_output,
                     inputs=[image_input, model_selection_input],
                     outputs=[gr.Text(label="Caption"), gr.Audio(label="Audio")],
                     examples=sample_images,
                     allow_flagging='never',
                     title="Eye For Blind | Image Captioning & TTS Demo",
                     description=f"{problem_statement}\n\n{solution_overview}\n\n{real_life_scenario}")
iface.launch()