chaouch committed · Commit 20c0d70 · 1 Parent(s): e74749f
app.py ADDED
@@ -0,0 +1,189 @@
+ import gradio as gr
+ from transformers import pipeline
+ import numpy as np
+ import pytesseract
+ import cv2
+ from PIL import Image
+ from evaluate import load
+ import librosa
+
+ asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+ wer = load("wer")
+
+
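+ # Overview (summary of the code below): extract_text() OCRs the page image with Tesseract,
+ # transcribe() runs Whisper on the audio, split_to_l() chunks the OCR text to the
+ # transcription's word count, match() scores each chunk with 1 - WER, and process_image()
+ # highlights the best-matching region of the page.
+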
+ def extract_text(image):
+     # OCR the image and keep every detected word with a usable confidence score.
+     # Note: Tesseract reports confidences on a 0-100 scale (-1 for non-text boxes).
+     result = pytesseract.image_to_data(image, output_type='dict')
+     n_boxes = len(result['level'])
+     data = {}
+     k = 0
+     for i in range(n_boxes):
+         if result['conf'][i] >= 0.3 and result['text'][i] != '' and result['conf'][i] != -1:
+             data[k] = {}
+             (x, y, w, h) = (result['left'][i], result['top'][i],
+                             result['width'][i], result['height'][i])
+             data[k]["coordinates"] = (x, y, w, h)
+             # Index with i (the current OCR box), not k (the output key).
+             text, conf = result['text'][i], result['conf'][i]
+             data[k]["text"] = text
+             data[k]["conf"] = conf
+             k += 1
+     return data
+
+
+ def draw_rectangle(image, x, y, w, h, color=(0, 0, 255), thickness=2):
+     image_array = np.array(image)
+     image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
+     cv2.rectangle(image_array, (x, y), (x + w, y + h), color, thickness)
+     return Image.fromarray(cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB))
+
+
+ def transcribe(audio):
+     if isinstance(audio, str):  # audio is a file path
+         y, sr = librosa.load(audio)
+     elif isinstance(audio, tuple) and len(audio) == 2:  # audio is (sampling_rate, raw_audio)
+         sr, y = audio
+         y = y.astype(np.float32)
+     else:
+         raise ValueError("Invalid input. Audio should be a file path or a tuple of (sampling_rate, raw_audio).")
+
+     # Normalize the waveform to [-1, 1] before feeding it to Whisper.
+     y /= np.max(np.abs(y))
+
+     transcribed_text = asr({"sampling_rate": sr, "raw": y})["text"]
+
+     return transcribed_text
+
+
+ def clean_transcription(transcription):
+     # Collapse immediately repeated words (a common ASR artifact). Currently not called by run().
+     text = transcription.lower()
+     words = text.split()
+     cleaned_words = [words[0]]
+     for word in words[1:]:
+         if word != cleaned_words[-1]:
+             cleaned_words.append(word)
+     return ' '.join(cleaned_words)
+
+
+ def match(reference, spoken):
+     wer_score = wer.compute(references=[reference], predictions=[spoken])
+     score = 1 - wer_score
+     return score
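+
+ # Sanity check for match() (not executed): WER = (substitutions + deletions + insertions)
+ # divided by the number of reference words, so identical strings score 1.0 and one wrong
+ # word out of five scores 0.8, e.g.
+ #   match("a king and three sisters", "a king and tree sisters") -> 0.8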
+
+
+ def split_to_l(text, answer):
+     # Split the OCR text into chunks with the same word count as the transcription.
+     l = len(answer.split(" "))
+     text_words = text.split(" ")
+     chunks = []
+     indices = []
+     for i in range(0, len(text_words), l):
+         chunk = " ".join(text_words[i: i + l])
+         chunks.append(chunk)
+         indices.append(i)
+     return chunks, indices, l
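+
+ # Example (not executed): split_to_l("a king and three sisters once", "a king and")
+ # chunks the text into 3-word pieces -> (["a king and", "three sisters once"], [0, 3], 3).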
+
+
+ def reindex_data(data, index, l):
+     reindexed_data = {}
+     for i in range(l):
+         original_index = index + i
+         reindexed_data[i] = data[original_index]
+     return reindexed_data
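+
+ # Example (not executed): reindex_data(data, 3, 2) -> {0: data[3], 1: data[4]},
+ # i.e. the l consecutive OCR boxes starting at the matched chunk's first word.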
+
+
+ def process_image(im, data):
+     im_array = np.array(im)
+     hg, wg, _ = im_array.shape
+     # Bounding box of the matched words: min/max corners plus the average word size.
+     text_y = np.max([data[i]["coordinates"][1] for i in range(len(data))])
+     text_x = np.max([data[i]["coordinates"][0] for i in range(len(data))])
+     text_start_x = np.min([data[i]["coordinates"][0] for i in range(len(data))])
+     text_start_y = np.min([data[i]["coordinates"][1] for i in range(len(data))])
+     max_height = int(np.mean([data[i]["coordinates"][3] for i in range(len(data))]))
+     max_width = int(np.mean([data[i]["coordinates"][2] for i in range(len(data))]))
+     text = [data[i]["text"] for i in range(len(data))]
+     # Black canvas the size of the page; only the matched text region is copied over.
+     wall = np.zeros((hg, wg, 3), np.uint8)
+
+     wall[text_start_y:text_y + max_height, text_start_x:text_x + max_width] = \
+         im_array[text_start_y:text_y + max_height,
+                  text_start_x:text_x + max_width, :]
+
+     for i in range(1, len(data)):
+         x, y, w, h = data[i]["coordinates"]
+         wall = draw_rectangle(wall, x, y, w, h)
+     return wall
+
+
+ def run(stream, image):
+     # OCR the page, transcribe the audio, then highlight the OCR chunk that best matches
+     # the transcription (first chunk whose 1 - WER score exceeds 0.1).
+     data = extract_text(image)
+     im_text_ = [data[i]["text"] for i in range(len(data))]
+     im_text = " ".join(im_text_)
+     trns_text = transcribe(stream)
+     chunks, index, l = split_to_l(im_text, trns_text)
+     im_array = np.array(Image.open(image))
+     data2 = None
+     for i in range(len(chunks)):
+         if match(chunks[i], trns_text) > 0.1:
+             data2 = reindex_data(data, index[i], l)
+             break
+     if data2 is not None:
+         return process_image(im_array, data2)
+     else:
+         return im_array
+
+ demo = gr.Blocks()
+
+
+ demo1 = gr.Interface(
+     run,
+     [gr.Audio(sources=["microphone"], type="numpy"),
+      gr.Image(type="filepath", label="Image")],
+     gr.Image(type="pil", label="Output Image"),
+     examples=[["the-king-and-three-sisters-around-the-world-stories-for-children.png"]]
+ )
+ demo2 = gr.Interface(
+     run,
+     [gr.Audio(sources=["upload"]),
+      gr.Image(type="filepath", label="Image")],
+     gr.Image(type="pil", label="Output Image"),
+     examples=[["the-king-and-three-sisters-around-the-world-stories-for-children.png"]]
+ )
+ with demo:
+     gr.TabbedInterface([demo1, demo2],
+                        ["Microphone", "Audio File"])
+
+ demo.launch()
packages.txt ADDED
@@ -0,0 +1,2 @@
+ tesseract-ocr-all
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ jiwer
+ evaluate
+ transformers
+ pytesseract
+ opencv-contrib-python
+ numpy
+ torch
+ librosa
the-king-and-three-sisters-around-the-world-stories-for-children.png ADDED