Merge branch 'main' of https://huggingface.co/spaces/krishnapal2308/eye_for_blind
app.py CHANGED
@@ -11,13 +11,13 @@ warnings.filterwarnings('ignore')
 # Define problem statement
 problem_statement = """
 ### Problem Statement
-
+This project aims to develop a deep learning model to verbally describe image contents for the visually impaired using caption generation with an attention mechanism on the Flickr8K dataset. Inspired by the "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention" paper, the model utilizes a CNN-RNN architecture to extract image features and generate captions, facilitating accessibility. The Kaggle dataset comprises 8,000 images, each paired with five descriptive captions, enabling comprehensive understanding of image content.
 """
 
 # Define solution overview
 solution_overview = """
 ### Solution Overview
-The basic model, trained for
+The basic model, trained for only 20 epochs without extensive hyperparameter tuning, primarily focuses on exploring the integration of the attention mechanism with the Encoder-Decoder architecture for image processing utilizing subclassing. To improve inference quality, the ViT-GPT2 architecture is integrated. [Visit the Kaggle notebook](https://www.kaggle.com/code/krishna2308/eye-for-blind) for implementation details.
 """
 
 # Define real-life scenario application
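The solution overview above alludes to an attention mechanism integrated into the Encoder-Decoder via subclassing, but that model code sits outside this diff. As a rough illustration only, a "Show, Attend and Tell"-style additive attention layer written with TensorFlow subclassing typically looks like the sketch below; all names and shapes here are illustrative assumptions, not the Space's actual code.

```python
import tensorflow as tf

class BahdanauAttention(tf.keras.Model):
    """Additive attention over CNN feature locations (illustrative sketch)."""

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)  # projects image features
        self.W2 = tf.keras.layers.Dense(units)  # projects decoder hidden state
        self.V = tf.keras.layers.Dense(1)       # scores each spatial location

    def call(self, features, hidden):
        # features: (batch, num_locations, embedding_dim) CNN feature map
        # hidden:   (batch, units) current decoder state
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        # Weighted sum of feature locations -> context vector for the decoder
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights
```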
@@ -30,7 +30,7 @@ While this current implementation may not support real-time processing, the pote
 def process_image_and_generate_output(image, model_selection):
     if image is None:
         return "Please select an image", None
-    if model_selection ==
+    if model_selection == "Basic Model (Results won't be good)":
         result = inference_script.evaluate(image)
         pred_caption = ' '.join(result).rsplit(' ', 1)[0]
         pred_caption = pred_caption.replace('<unk>', '')
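Only the basic-model branch of `process_image_and_generate_output` appears in this hunk. For reference, a minimal sketch of what the ViT-GPT2 branch could look like, assuming the widely used `nlpconnect/vit-gpt2-image-captioning` checkpoint from `transformers` (the Space's actual model loading is not shown in this diff):

```python
# Hypothetical sketch of the ViT-GPT2 path; the checkpoint choice is an assumption.
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image

model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

def caption_with_vit_gpt2(image: Image.Image) -> str:
    # Encode the image into pixel values, then beam-search a caption.
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    output_ids = model.generate(pixel_values, max_length=16, num_beams=4)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
```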
@@ -57,16 +57,16 @@ def process_image_and_generate_output(image, model_selection):
 
 sample_images = [
     [os.path.join(os.path.dirname(__file__), "sample_images/1.jpg"), "ViT-GPT2"],
-    [os.path.join(os.path.dirname(__file__), "sample_images/1.jpg"),
+    [os.path.join(os.path.dirname(__file__), "sample_images/1.jpg"), "Basic Model (Results won't be good)"],
     [os.path.join(os.path.dirname(__file__), "sample_images/3.jpg"), "ViT-GPT2"],
-    [os.path.join(os.path.dirname(__file__), "sample_images/3.jpg"),
+    [os.path.join(os.path.dirname(__file__), "sample_images/3.jpg"), "Basic Model (Results won't be good)"]
 ]
 
 # Create a dropdown to select sample image
 image_input = gr.Image(label="Upload Image")
 
 # Create a dropdown to choose the model
-model_selection_input = gr.Radio(["Basic Model",
+model_selection_input = gr.Radio(["Basic Model (Results won't be good)",
                                   "ViT-GPT2"],
                                  label="Choose Model")
 
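The hunk declares the input components and example rows but not the `gr.Interface` wiring itself. A minimal sketch of how these pieces would typically be connected, continuing app.py's own names; the output components and title are assumptions, since they fall outside the diff:

```python
# Assumed wiring; the diff does not show the Interface construction or outputs.
interface = gr.Interface(
    fn=process_image_and_generate_output,
    inputs=[image_input, model_selection_input],
    outputs=[gr.Textbox(label="Caption"), gr.Audio(label="Spoken Caption")],
    examples=sample_images,  # each row pre-fills [image, model choice]
    title="Eye For Blind",
)
interface.launch()
```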