Commit 73d4923 by krishnapal2308
Parent(s): decea29
Adding all the files.
Files changed:
- .gitmodules +3 -0
- .idea/.gitignore +3 -0
- .idea/eye_for_blind.iml +8 -0
- .idea/inspectionProfiles/Project_Default.xml +78 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +7 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +7 -0
- __pycache__/inference_script.cpython-310.pyc +0 -0
- __pycache__/vit_gpt2.cpython-310.pyc +0 -0
- app.py +50 -0
- efb-requirements/decoder_50epoch_weights.h5 +3 -0
- efb-requirements/encoder_50epoch_weights.h5 +3 -0
- efb-requirements/inception_v3.h5 +3 -0
- efb-requirements/tokenizer.json +0 -0
- flagged/image/c272d1784bee86f02dd8/99679241_adc853a5c0.jpg +0 -0
- flagged/log.csv +2 -0
- inference_script.py +140 -0
- test.py +50 -0
- vit-gpt2-image-captioning +1 -0
- vit_gpt2.py +26 -0
.gitmodules
ADDED
@@ -0,0 +1,3 @@
+[submodule "vit-gpt2-image-captioning"]
+	path = vit-gpt2-image-captioning
+	url = https://huggingface.co/nlpconnect/vit-gpt2-image-captioning
.idea/.gitignore
ADDED
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
.idea/eye_for_blind.iml
ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
.idea/inspectionProfiles/Project_Default.xml
ADDED
@@ -0,0 +1,78 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="HtmlUnknownTag" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="myValues">
+        <value>
+          <list size="7">
+            <item index="0" class="java.lang.String" itemvalue="nobr" />
+            <item index="1" class="java.lang.String" itemvalue="noembed" />
+            <item index="2" class="java.lang.String" itemvalue="comment" />
+            <item index="3" class="java.lang.String" itemvalue="noscript" />
+            <item index="4" class="java.lang.String" itemvalue="embed" />
+            <item index="5" class="java.lang.String" itemvalue="script" />
+            <item index="6" class="java.lang.String" itemvalue="style" />
+          </list>
+        </value>
+      </option>
+      <option name="myCustomValuesEnabled" value="true" />
+    </inspection_tool>
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="42">
+            <item index="0" class="java.lang.String" itemvalue="absl-py" />
+            <item index="1" class="java.lang.String" itemvalue="networkx" />
+            <item index="2" class="java.lang.String" itemvalue="huggingface-hub" />
+            <item index="3" class="java.lang.String" itemvalue="PyYAML" />
+            <item index="4" class="java.lang.String" itemvalue="gast" />
+            <item index="5" class="java.lang.String" itemvalue="MarkupSafe" />
+            <item index="6" class="java.lang.String" itemvalue="numpy" />
+            <item index="7" class="java.lang.String" itemvalue="pyasn1" />
+            <item index="8" class="java.lang.String" itemvalue="requests" />
+            <item index="9" class="java.lang.String" itemvalue="Jinja2" />
+            <item index="10" class="java.lang.String" itemvalue="fsspec" />
+            <item index="11" class="java.lang.String" itemvalue="pyasn1-modules" />
+            <item index="12" class="java.lang.String" itemvalue="safetensors" />
+            <item index="13" class="java.lang.String" itemvalue="certifi" />
+            <item index="14" class="java.lang.String" itemvalue="keras" />
+            <item index="15" class="java.lang.String" itemvalue="urllib3" />
+            <item index="16" class="java.lang.String" itemvalue="itsdangerous" />
+            <item index="17" class="java.lang.String" itemvalue="Markdown" />
+            <item index="18" class="java.lang.String" itemvalue="sympy" />
+            <item index="19" class="java.lang.String" itemvalue="Flask" />
+            <item index="20" class="java.lang.String" itemvalue="blinker" />
+            <item index="21" class="java.lang.String" itemvalue="tokenizers" />
+            <item index="22" class="java.lang.String" itemvalue="libclang" />
+            <item index="23" class="java.lang.String" itemvalue="transformers" />
+            <item index="24" class="java.lang.String" itemvalue="google-auth-oauthlib" />
+            <item index="25" class="java.lang.String" itemvalue="Werkzeug" />
+            <item index="26" class="java.lang.String" itemvalue="h5py" />
+            <item index="27" class="java.lang.String" itemvalue="tensorboard-data-server" />
+            <item index="28" class="java.lang.String" itemvalue="packaging" />
+            <item index="29" class="java.lang.String" itemvalue="torch" />
+            <item index="30" class="java.lang.String" itemvalue="click" />
+            <item index="31" class="java.lang.String" itemvalue="tqdm" />
+            <item index="32" class="java.lang.String" itemvalue="termcolor" />
+            <item index="33" class="java.lang.String" itemvalue="regex" />
+            <item index="34" class="java.lang.String" itemvalue="mpmath" />
+            <item index="35" class="java.lang.String" itemvalue="typing_extensions" />
+            <item index="36" class="java.lang.String" itemvalue="cachetools" />
+            <item index="37" class="java.lang.String" itemvalue="charset-normalizer" />
+            <item index="38" class="java.lang.String" itemvalue="grpcio" />
+            <item index="39" class="java.lang.String" itemvalue="gTTS" />
+            <item index="40" class="java.lang.String" itemvalue="google-auth" />
+            <item index="41" class="java.lang.String" itemvalue="idna" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N801" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
.idea/misc.xml
ADDED
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.10" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
+</project>
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/eye_for_blind.iml" filepath="$PROJECT_DIR$/.idea/eye_for_blind.iml" />
+    </modules>
+  </component>
+</project>
.idea/vcs.xml
ADDED
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+    <mapping directory="$PROJECT_DIR$/vit-gpt2-image-captioning" vcs="Git" />
+  </component>
+</project>
__pycache__/inference_script.cpython-310.pyc
ADDED
Binary file (4.54 kB)
__pycache__/vit_gpt2.cpython-310.pyc
ADDED
Binary file (1.1 kB)
app.py
ADDED
@@ -0,0 +1,50 @@
+import base64
+import tempfile
+import numpy as np
+import gradio as gr
+from gtts import gTTS
+import inference_script
+import vit_gpt2
+import os
+import warnings
+warnings.filterwarnings('ignore')
+
+
+def process_image_and_generate_output(image, model_selection):
+
+    if model_selection == ('Basic Model (Trained only for 15 epochs without any hyperparameter tuning, utilizing '
+                           'inception v3)'):
+        result = inference_script.evaluate(image)
+        pred_caption = ' '.join(result).rsplit(' ', 1)[0]
+        pred_caption = pred_caption.replace('<unk>', '')
+    elif model_selection == 'ViT-GPT2 (SOTA model for Image captioning)':
+        result = vit_gpt2.predict_step(image)
+        pred_caption = result[0]
+    else:
+        return "Invalid model selection", None
+
+    # Generate speech from the caption
+    tts = gTTS(text=pred_caption, lang='en', slow=False)
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
+        audio_file_path = temp_audio.name
+        tts.save(audio_file_path)
+
+    # Read the audio file
+    with open(audio_file_path, "rb") as f:
+        audio_content = f.read()
+
+    # Clean up the temporary audio file
+    os.unlink(audio_file_path)
+    return pred_caption, audio_content
+
+
+iface = gr.Interface(fn=process_image_and_generate_output,
+                     inputs=["image", gr.Radio(["Basic Model (Trained only for 15 epochs without any hyperparameter "
+                                                "tuning, utilizing inception v3)", "ViT-GPT2 (SOTA model for Image "
+                                                "captioning)"], label="Choose "
+                                                                      "Model")],
+                     outputs=["text", "audio"],
+                     title="Eye For Blind | Image Captioning & TTS",
+                     description="To be added")
+
+iface.launch()
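Note (not part of the commit): the caption-to-speech step in app.py can be exercised on its own. A minimal sketch, assuming network access for gTTS; the caption string here is made up for illustration:

import os
import tempfile
from gtts import gTTS

# Synthesize a caption to MP3 bytes, mirroring app.py's TTS step.
tts = gTTS(text="a dog runs across the grass", lang="en", slow=False)
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
    audio_path = tmp.name
    tts.save(audio_path)
with open(audio_path, "rb") as f:
    audio_bytes = f.read()
os.unlink(audio_path)  # clean up the temporary file, as app.py does
print(len(audio_bytes), "bytes of MP3 audio")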
efb-requirements/decoder_50epoch_weights.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:974619e48b783c3ba43f38c6e00cff2948d2be3ad46487ad7b48a612f3ab72e2
+size 24349812
efb-requirements/encoder_50epoch_weights.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7552a644a862a01215983b7906696c11427eda4f18efe8cd6ab5cae555105b9
+size 2109280
efb-requirements/inception_v3.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c192f732860a2d28ec83c054d7e656717ff027693fb2d3ad2eded243f73cba5f
+size 88104664
efb-requirements/tokenizer.json
ADDED
The diff for this file is too large to render.
flagged/image/c272d1784bee86f02dd8/99679241_adc853a5c0.jpg
ADDED
Binary file (JPEG image)
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
+image,Choose Model,output 0,output 1,flag,username,timestamp
+"{""path"":""flagged\\image\\c272d1784bee86f02dd8\\99679241_adc853a5c0.jpg"",""url"":""http://127.0.0.1:7860/file=C:\\Users\\krish\\AppData\\Local\\Temp\\gradio\\28021d595e5e6c88962f4a2deef9ec5c14e5a8a1\\99679241_adc853a5c0.jpg"",""size"":128165,""orig_name"":""99679241_adc853a5c0.jpg"",""mime_type"":""""}",vit_gpt2,,,,,2024-02-08 20:15:09.773370
inference_script.py
ADDED
@@ -0,0 +1,140 @@
+import numpy as np
+
+import tensorflow as tf
+import keras
+from keras.models import Model
+
+
+class Encoder(Model):
+    def __init__(self, embed_dim):
+        super(Encoder, self).__init__()
+        self.dense = tf.keras.layers.Dense(embed_dim)
+
+    def call(self, features):
+        features = self.dense(features)
+        features = tf.keras.activations.relu(features)
+
+        return features
+
+
+class Attention_model(Model):
+    def __init__(self, units):
+        super(Attention_model, self).__init__()
+        self.W1 = tf.keras.layers.Dense(units)
+        self.W2 = tf.keras.layers.Dense(units)
+        self.V = tf.keras.layers.Dense(1)
+        self.units = units
+
+    def call(self, features, hidden):
+        hidden_with_time_axis = hidden[:, tf.newaxis]
+
+        score = tf.keras.activations.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
+
+        attention_weights = tf.keras.activations.softmax(self.V(score), axis=1)
+
+        context_vector = attention_weights * features
+
+        context_vector = tf.reduce_sum(context_vector, axis=1)
+
+        return context_vector, attention_weights
+
+
+class Decoder(Model):
+    def __init__(self, embed_dim, units, vocab_size):
+        super(Decoder, self).__init__()
+        self.units = units
+        self.attention = Attention_model(self.units)
+        self.embed = tf.keras.layers.Embedding(vocab_size, embed_dim)
+        self.gru = tf.keras.layers.GRU(self.units, return_sequences=True, return_state=True,
+                                       recurrent_initializer='glorot_uniform')
+        self.d1 = tf.keras.layers.Dense(self.units)
+        self.d2 = tf.keras.layers.Dense(vocab_size)
+
+    def call(self, x, features, hidden):
+        context_vector, attention_weights = self.attention(features, hidden)
+        embed = self.embed(x)
+        embed = tf.concat([tf.expand_dims(context_vector, 1), embed], axis=-1)
+        output, state = self.gru(embed)
+        output = self.d1(output)
+        output = tf.reshape(output, (-1, output.shape[2]))
+        output = self.d2(output)
+
+        return output, state, attention_weights
+
+    def init_state(self, batch_size):
+        return tf.zeros((batch_size, self.units))
+
+    def reset_state(self, batch_size):
+        return tf.zeros((batch_size, self.units))
+
+
+# Loading the tokenizer
+with open("efb-requirements/tokenizer.json", 'r', encoding='utf-8') as f:
+    loaded_tokenizer_json = f.read()
+
+tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(loaded_tokenizer_json)
+
+
+def load_and_process_image(image, target_size=(299, 299)):
+    img = tf.convert_to_tensor(image)
+    img = tf.cast(img, tf.uint8)
+    img = tf.image.resize(img, target_size)
+    img = tf.keras.applications.inception_v3.preprocess_input(img)
+    return img
+
+
+image_features_extract_model = keras.models.load_model("efb-requirements/inception_v3.h5")
+
+embedding_dim = 256
+units = 512
+vocab_size = 5001
+
+encoder = Encoder(embedding_dim)
+decoder = Decoder(embedding_dim, units, vocab_size)
+
+# Creating dummy inputs
+dummy_img_input = tf.ones((32, 64, 2048))
+features = encoder(dummy_img_input)
+hidden = decoder.init_state(32)
+dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * 32, 1)
+dec = decoder(dec_input, features, hidden)
+
+# Loading saved models
+encoder.load_weights("efb-requirements/encoder_50epoch_weights.h5")
+decoder.load_weights("efb-requirements/decoder_50epoch_weights.h5")
+
+
+def evaluate(image):
+    max_length = 39
+    attention_plot = np.zeros((max_length, 64))
+
+    hidden = decoder.reset_state(batch_size=1)
+    # processing the input image to desired format before extracting features
+    temp_input = tf.expand_dims(load_and_process_image(image), 0)
+
+    img_tensor_val = image_features_extract_model(temp_input)
+    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))
+
+    cnn_features = encoder(img_tensor_val)
+
+    decoder_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
+
+    result = []
+    predictions = ''
+    for i in range(max_length):
+        predictions, hidden, attention_weights = decoder(decoder_input, cnn_features, hidden)
+
+        attention_plot[i] = tf.reshape(attention_weights, (-1,)).numpy()
+
+        predicted_id = tf.argmax(predictions[0]).numpy()
+        result.append(tokenizer.index_word[predicted_id])
+
+        if tokenizer.index_word[predicted_id] == '<end>':
+            # return result, attention_plot, predictions
+            return result
+
+        decoder_input = tf.expand_dims([predicted_id], 0)
+
+    attention_plot = attention_plot[:len(result), :]
+    # return result, attention_plot, predictions
+    return result
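Note (not part of the commit): evaluate() expects an RGB image array and returns a token list that usually ends with '<end>'. A minimal sketch of how app.py consumes it, assuming a local sample.jpg and the weight files under efb-requirements/ are present:

import numpy as np
from PIL import Image

import inference_script  # loads InceptionV3 and the encoder/decoder weights at import time

img = np.array(Image.open("sample.jpg").convert("RGB"))
tokens = inference_script.evaluate(img)       # e.g. ['a', 'dog', 'runs', '<end>']
caption = " ".join(tokens).rsplit(" ", 1)[0]  # drop the trailing '<end>' token
caption = caption.replace("<unk>", "")        # hide out-of-vocabulary tokens
print(caption)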
test.py
ADDED
@@ -0,0 +1,50 @@
+import base64
+import tempfile
+import numpy as np
+import gradio as gr
+from gtts import gTTS
+import inference_script
+import vit_gpt2
+import os
+import warnings
+warnings.filterwarnings('ignore')
+
+
+def process_image_and_generate_output(image, model_selection):
+
+    if model_selection == ('Basic Model (Trained only for 15 epochs without any hyperparameter tuning, utilizing '
+                           'inception v3)'):
+        result = inference_script.evaluate(image)
+        pred_caption = ' '.join(result).rsplit(' ', 1)[0]
+        pred_caption = pred_caption.replace('<unk>', '')
+    elif model_selection == 'ViT-GPT2 (SOTA model for Image captioning)':
+        result = vit_gpt2.predict_step(image)
+        pred_caption = result[0]
+    else:
+        return "Invalid model selection", None
+
+    # Generate speech from the caption
+    tts = gTTS(text=pred_caption, lang='en', slow=False)
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
+        audio_file_path = temp_audio.name
+        tts.save(audio_file_path)
+
+    # Read the audio file
+    with open(audio_file_path, "rb") as f:
+        audio_content = f.read()
+
+    # Clean up the temporary audio file
+    os.unlink(audio_file_path)
+    return pred_caption, audio_content
+
+
+iface = gr.Interface(fn=process_image_and_generate_output,
+                     inputs=["image", gr.Radio(["Basic Model (Trained only for 15 epochs without any hyperparameter "
+                                                "tuning, utilizing inception v3)", "ViT-GPT2 (SOTA model for Image "
+                                                "captioning)"], label="Choose "
+                                                                      "Model")],
+                     outputs=["text", "audio"],
+                     title="Eye For Blind | Image Captioning & TTS",
+                     description="Generate a caption for the uploaded image and convert it to speech.")
+
+iface.launch()
vit-gpt2-image-captioning
ADDED
@@ -0,0 +1 @@
+Subproject commit dc68f91c06a1ba6f15268e5b9c13ae7a7c514084
vit_gpt2.py
ADDED
@@ -0,0 +1,26 @@
+import tensorflow as tf
+from transformers import TFVisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+from PIL import Image
+
+model = TFVisionEncoderDecoderModel.from_pretrained("vit-gpt2-image-captioning", from_pt=True)
+feature_extractor = ViTImageProcessor.from_pretrained("vit-gpt2-image-captioning")
+tokenizer = AutoTokenizer.from_pretrained("vit-gpt2-image-captioning")
+
+max_length = 16
+num_beams = 4
+gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+
+
+def predict_step(img_array):
+    i_image = Image.fromarray(img_array)
+
+    if i_image.mode != "RGB":
+        i_image = i_image.convert(mode="RGB")
+
+    pixel_values = feature_extractor(images=i_image, return_tensors="tf", do_normalize=True).pixel_values
+
+    output_ids = model.generate(pixel_values, **gen_kwargs)
+
+    pred = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+    pred = [p.strip() for p in pred]
+    return pred
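Note (not part of the commit): predict_step() takes a NumPy image array (the format Gradio's "image" input provides) and returns a list of caption strings. A minimal sketch, assuming the vit-gpt2-image-captioning submodule has been checked out and a local sample.jpg exists:

import numpy as np
from PIL import Image

import vit_gpt2  # loads the ViT-GPT2 model from the local submodule at import time

img = np.array(Image.open("sample.jpg"))
captions = vit_gpt2.predict_step(img)
print(captions[0])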