chaouch committed
Commit 129cbf7 · 1 Parent(s): 334a9f9
Files changed (2):
  1. app.py +1 -166
  2. recite_module.py +264 -0
app.py CHANGED
@@ -1,140 +1,5 @@
  import gradio as gr
- from transformers import pipeline
- import numpy as np
- import pytesseract
- import cv2
- from PIL import Image
- from evaluate import load
- import librosa
-
- asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
- wer = load("wer")
-
-
- def extract_text(image):
-     result = pytesseract.image_to_data(image, output_type='dict')
-     n_boxes = len(result['level'])
-     data = {}
-     k = 0
-     for i in range(n_boxes):
-         if result['conf'][i] >= 0.3 and result['text'][i] != '' and result['conf'][i] != -1:
-             data[k] = {}
-             (x, y, w, h) = (result['left'][i], result['top']
-                             [i], result['width'][i], result['height'][i])
-             data[k]["coordinates"] = (x, y, w, h)
-             text, conf = result['text'][k], result['conf'][k]
-             data[k]["text"] = text
-             data[k]["conf"] = conf
-             k += 1
-     return data
-
-
- def draw_rectangle(image, x, y, w, h, color=(0, 0, 255), thickness=2):
-     image_array = np.array(image)
-     image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
-     cv2.rectangle(image_array, (x, y), (x + w, y + h), color, thickness)
-     return Image.fromarray(cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB))
-
-
- def transcribe(audio):
-     if isinstance(audio, str):  # If audio is a file path
-         y, sr = librosa.load(audio)
-     elif isinstance(audio, tuple) and len(audio) == 2:  # If audio is (sampling_rate, raw_audio)
-         sr, y = audio
-         y = y.astype(np.float32)
-     else:
-         raise ValueError("Invalid input. Audio should be a file path or a tuple of (sampling_rate, raw_audio).")
-
-     y /= np.max(np.abs(y))
-
-     # Call your ASR (Automatic Speech Recognition) function here
-     # For now, let's assume it's called 'asr'
-     transcribed_text = asr({"sampling_rate": sr, "raw": y})["text"]
-
-     return transcribed_text
-
-
- def clean_transcription(transcription):
-     text = transcription.lower()
-     words = text.split()
-     cleaned_words = [words[0]]
-     for word in words[1:]:
-         if word != cleaned_words[-1]:
-             cleaned_words.append(word)
-     return ' '.join(cleaned_words)
-
-
- def match(refence, spoken):
-     wer_score = wer.compute(references=[refence], predictions=[spoken])
-     score = 1 - wer_score
-     return score
-
-
- def split_to_l(text, answer):
-     l = len(answer.split(" "))
-     text_words = text.split(" ")
-     chunks = []
-     indices = []
-     for i in range(0, len(text_words), l):
-         chunk = " ".join(text_words[i: i + l])
-         chunks.append(chunk)
-         indices.append(i)
-     return chunks, indices, l
-
-
- def reindex_data(data, index, l):
-     reindexed_data = {}
-     for i in range(l):
-         original_index = index + i
-         reindexed_data[i] = data[original_index]
-     return reindexed_data
-
-
- def process_image(im, data):
-     im_array = np.array(im)
-     hg, wg, _ = im_array.shape
-     text_y = np.max([data[i]["coordinates"][1]
-                      for i in range(len(data))])
-     text_x = np.max([data[i]["coordinates"][0]
-                      for i in range(len(data))])
-     text_start_x = np.min([data[i]["coordinates"][0]
-                            for i in range(len(data))])
-     text_start_y = np.min([data[i]["coordinates"][1]
-                            for i in range(len(data))])
-     max_height = int(np.mean([data[i]["coordinates"][3]
-                               for i in range(len(data))]))
-     max_width = int(np.mean([data[i]["coordinates"][2]
-                              for i in range(len(data))]))
-     text = [data[i]["text"] for i in range(len(data))]
-     wall = np.zeros((hg, wg, 3), np.uint8)
-
-     wall[text_start_y:text_y + max_height, text_start_x:text_x + max_width] = \
-         im_array[text_start_y:text_y + max_height,
-                  text_start_x:text_x + max_width, :]
-
-     for i in range(1, len(data)):
-         x, y, w, h = data[i]["coordinates"]
-         wall = draw_rectangle(wall, x, y, w, h)
-     return wall
-
-
- def run(stream, image):
-     data = extract_text(image)
-     im_text_ = [data[i]["text"] for i in range(len(data))]
-     im_text = " ".join(im_text_)
-     trns_text = transcribe(stream)
-     chunks, index, l = split_to_l(im_text, trns_text)
-     im_array = np.array(Image.open(image))
-     data2 = None
-     for i in range(len(chunks)):
-         if match(chunks[i], trns_text) > 0.1:
-             data2 = reindex_data(data, index[i], l)
-             break
-     if data2 is not None:
-         return process_image(im_array, data2)
-     else:
-         return im_array
-
+ from recite_module import run
  demo = gr.Blocks()
 
 
@@ -155,33 +20,3 @@ with demo:
  ["Microphone", "Audio File"])
 
  demo.launch()
- """
- data = extract_text(im)
- im_text_ = [data[i]["text"] for i in range(len(data))]
- im_text = " ".join(im_text_)
- trns_text = transcribe_wav("tmpmucht0kh.wav")
- chunks, index, l = split_to_l(im_text, trns_text)
- im_array = np.array(Image.open(im))
- for i in range(len(chunks)):
-     if match(chunks[i], trns_text) > 0.5:
-         print(chunks[i])
-         print(match(chunks[i], trns_text))
-         print(index[i])
-         print(l)
-         print(im_array.shape)
-         print(fuse_rectangles(im_array, data, index[i], l))
-
- strem = "tmpq0eha4we.wav"
- im = "the-king-and-three-sisters-around-the-world-stories-for-children.png"
- text = "A KING AND THREE SISTERS"
- che_text = "A KING AND THREE SISTERS"
- print(match(text, che_text))
- data = extract_text(im)
- text_transcript = transcribe_wav(strem)
- print(text_transcript)
- im_text_ = [data[i]["text"] for i in range(len(data))]
- im_text = " ".join(im_text_)
- print(im_text)
- wall = run(strem, im)
- wall.show()"""
-
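With the helper functions moved out, app.py keeps only the UI and delegates all processing to recite_module.run. The Blocks body itself lies outside the changed hunks, so the snippet below is only a sketch of how run might be wired to the audio and image inputs, assuming a Gradio 4.x API; the component names and layout are illustrative, not taken from the commit.

    # Hypothetical wiring for app.py (illustrative names, not from the commit).
    import gradio as gr
    from recite_module import run

    demo = gr.Blocks()
    with demo:
        # The visible hunk shows a "Microphone" / "Audio File" choice,
        # so the audio input presumably accepts both sources.
        audio = gr.Audio(sources=["microphone", "upload"], type="filepath")
        image = gr.Image(type="pil", label="Text to recite")
        output = gr.Image(label="Highlighted match")
        gr.Button("Check recitation").click(run, inputs=[audio, image], outputs=output)

    demo.launch()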
recite_module.py ADDED
@@ -0,0 +1,264 @@
+ import gradio as gr
+ from transformers import pipeline
+ import numpy as np
+ import pytesseract
+ import cv2
+ from PIL import Image
+ from evaluate import load
+ import librosa
+
+ asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+ wer = load("wer")
+
+
+ def extract_text(image):
+     """
+     Extracts text from an image using OCR.
+     Args:
+         image (PIL.Image.Image): Input image.
+     Returns:
+         dict: Extracted text with confidence and coordinates.
+     Raises:
+         ValueError: If the input image is not a PIL Image object.
+     """
+     if not isinstance(image, Image.Image):
+         raise ValueError("Invalid input. Image should be a PIL Image object.")
+
+     result = pytesseract.image_to_data(image, output_type='dict')
+     n_boxes = len(result['level'])
+     data = {}
+     k = 0
+     for i in range(n_boxes):
+         if result['conf'][i] >= 0.3 and result['text'][i] != '' and result['conf'][i] != -1:
+             data[k] = {}
+             (x, y, w, h) = (result['left'][i], result['top']
+                             [i], result['width'][i], result['height'][i])
+             data[k]["coordinates"] = (x, y, w, h)
+             text, conf = result['text'][k], result['conf'][k]
+             data[k]["text"] = text
+             data[k]["conf"] = conf
+             k += 1
+     return data
+
+
+ def draw_rectangle(image, x, y, w, h, color=(0, 0, 255), thickness=2):
+     """
+     Draws a rectangle on the given image.
+     Args:
+         image (PIL.Image.Image): Input image.
+         x (int): x-coordinate of the top-left corner of the rectangle.
+         y (int): y-coordinate of the top-left corner of the rectangle.
+         w (int): Width of the rectangle.
+         h (int): Height of the rectangle.
+         color (tuple, optional): Color of the rectangle in RGB format.
+         thickness (int, optional): Thickness of the rectangle's border.
+     Returns:
+         PIL.Image.Image: Image with the rectangle drawn on it.
+     Raises:
+         ValueError: If the input image is not a PIL Image object.
+     """
+     if not isinstance(image, Image.Image):
+         raise ValueError("Invalid input. Image should be a PIL Image object.")
+
+     image_array = np.array(image)
+     image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
+     cv2.rectangle(image_array, (x, y), (x + w, y + h), color, thickness)
+     return Image.fromarray(cv2.cvtColor(image_array, cv2.COLOR_BGR2RGB))
+
+
+ def transcribe(audio):
+     """
+     Transcribes audio into text using ASR.
+     Parameters:
+         audio (str or tuple): Audio source.
+     Returns:
+         str: Transcribed text.
+     Raises:
+         ValueError: If the input audio is not valid.
+     """
+     if not isinstance(audio, (str, tuple)):
+         raise ValueError(
+             "Invalid input. Audio should be either a file path or a tuple of (sampling_rate, raw_audio).")
+
+     if isinstance(audio, str):  # If audio is a file path
+         y, sr = librosa.load(audio)
+     # If audio is (sampling_rate, raw_audio)
+     elif isinstance(audio, tuple) and len(audio) == 2:
+         sr, y = audio
+         y = y.astype(np.float32)
+     else:
+         raise ValueError(
+             "Invalid input. Audio should be a file path or a tuple of (sampling_rate, raw_audio).")
+
+     y /= np.max(np.abs(y))
+
+     transcribed_text = asr(
+         {"sampling_rate": sr, "raw": y}, language="en")["text"]
+
+     return transcribed_text
+
+
+ def clean_transcription(transcription):
+     """
+     Cleans the transcription by removing consecutive duplicate words.
+     Args:
+         transcription (str): Input transcription.
+     Returns:
+         str: Cleaned transcription.
+     Raises:
+         ValueError: If the input transcription is not a string.
+     """
+     if not isinstance(transcription, str):
+         raise ValueError("Invalid input. Transcription should be a string.")
+
+     text = transcription.lower()
+     words = text.split()
+     cleaned_words = [words[0]]
+     for word in words[1:]:
+         if word != cleaned_words[-1]:
+             cleaned_words.append(word)
+     return ' '.join(cleaned_words)
+
+
+ def match(refence, spoken):
+     """
+     Calculates the match score between a reference and spoken string.
+     Args:
+         reference (str): Reference string.
+         spoken (str): Spoken string.
+     Returns:
+         float: Match score between 0 and 1.
+     Raises:
+         ValueError: If either reference or spoken is not a string.
+     """
+     if not isinstance(refence, str) or not isinstance(spoken, str):
+         raise ValueError(
+             "Invalid input. Reference and spoken should be strings.")
+
+     if spoken == "":
+         return 0
+     wer_score = wer.compute(references=[refence], predictions=[spoken])
+     score = 1 - wer_score
+     return score
+
+
+ def split_to_l(text, answer):
+     """
+     Splits the given text into chunks of length 'l' based on the answer.
+     Args:
+         text (str): The input text to be split.
+         answer (str): The answer used to determine the chunk size.
+     Returns:
+         tuple: A tuple containing the chunks of text, the indices of the chunks, and the length of each chunk.
+     """
+     if not isinstance(text, str) or not isinstance(answer, str):
+         raise ValueError("Invalid input. Text and answer should be strings.")
+
+     l = len(answer.split(" "))
+     text_words = text.split(" ")
+     chunks = []
+     indices = []
+     for i in range(0, len(text_words), l):
+         chunk = " ".join(text_words[i: i + l])
+         chunks.append(chunk)
+         indices.append(i)
+     return chunks, indices, l
+
+
+ def reindex_data(data, index, l):
+     """
+     Reindexes a dictionary with keys ranging from 0 to l-1.
+     Args:
+         data (dict): Original dictionary.
+         index (int): Starting index for reindexing.
+         l (int): Length of the reindexed dictionary.
+     Returns:
+         dict: Reindexed dictionary.
+     Raises:
+         ValueError: If the input data is not a dictionary, or if index or l are not integers.
+     """
+     if not isinstance(data, dict) or not isinstance(index, int) or not isinstance(l, int):
+         raise ValueError(
+             "Invalid input. Data should be a dictionary, index and l should be integers.")
+
+     reindexed_data = {}
+     for i in range(l):
+         original_index = index + i
+         reindexed_data[i] = data[original_index]
+     return reindexed_data
+
+
+ def process_image(im, data):
+     """
+     Processes an image by extracting text regions.
+     Args:
+         im (PIL.Image.Image): Input image.
+         data (dict): Data containing information about text regions.
+     Returns:
+         numpy.ndarray: Processed image with text regions highlighted.
+     Raises:
+         ValueError: If the input image is not a PIL Image object or if the data is not a dictionary.
+     """
+     if not isinstance(im, Image.Image) or not isinstance(data, dict):
+         raise ValueError(
+             "Invalid input. Image should be a PIL Image object and data should be a dictionary.")
+
+     im_array = np.array(im)
+     hg, wg, _ = im_array.shape
+     text_y = np.max([data[i]["coordinates"][1]
+                      for i in range(len(data))])
+     text_x = np.max([data[i]["coordinates"][0]
+                      for i in range(len(data))])
+     text_start_x = np.min([data[i]["coordinates"][0]
+                            for i in range(len(data))])
+     text_start_y = np.min([data[i]["coordinates"][1]
+                            for i in range(len(data))])
+     max_height = int(np.mean([data[i]["coordinates"][3]
+                               for i in range(len(data))]))
+     max_width = int(np.mean([data[i]["coordinates"][2]
+                              for i in range(len(data))]))
+     wall = np.zeros((hg, wg, 3), np.uint8)
+
+     wall[text_start_y:text_y + max_height, text_start_x:text_x + max_width] = \
+         im_array[text_start_y:text_y + max_height,
+                  text_start_x:text_x + max_width, :]
+
+     for i in range(1, len(data)):
+         x, y, w, h = data[i]["coordinates"]
+         wall = draw_rectangle(wall, x, y, w, h)
+     return wall
+
+
+ def run(stream, image):
+     """
+     Processes an image and transcribes audio.
+     Args:
+         stream (str or tuple): Audio source.
+         image (PIL.Image.Image): Input image.
+     Returns:
+         numpy.ndarray or PIL.Image.Image: Processed image data.
+     Raises:
+         ValueError: If the input stream is not a valid type or if the input image is not a PIL Image object.
+     """
+     if not isinstance(stream, (str, tuple)):
+         raise ValueError(
+             "Invalid input. Stream should be either a file path or a tuple of (sampling_rate, raw_audio).")
+
+     if not isinstance(image, Image.Image):
+         raise ValueError("Invalid input. Image should be a PIL Image object.")
+
+     data = extract_text(image)
+     im_text_ = [data[i]["text"] for i in range(len(data))]
+     im_text = " ".join(im_text_)
+     trns_text = transcribe(stream)
+     chunks, index, l = split_to_l(im_text, trns_text)
+     im_array = np.array(Image.open(image))
+     data2 = None
+     for i in range(len(chunks)):
+         if match(chunks[i], trns_text) > 0.5:
+             data2 = reindex_data(data, index[i], l)
+             break
+     if data2 is not None:
+         return process_image(im_array, data2)
+     else:
+         return im_array
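For reference, a small hand-worked example of the chunking and WER-based matching that run relies on (toy strings chosen for illustration; they are not from the commit):

    # split_to_l chops the OCR text into chunks the same length as the transcript,
    # and match scores each chunk as 1 - WER against the transcript.
    from recite_module import split_to_l, match

    page_text = "a king and three sisters once upon a time"   # pretend OCR output (9 words)
    spoken = "a king and three sisters"                        # pretend ASR output (5 words)

    chunks, indices, l = split_to_l(page_text, spoken)
    # chunks  == ["a king and three sisters", "once upon a time"]
    # indices == [0, 5], l == 5

    match(chunks[0], spoken)   # 1 - WER = 1.0 (exact match)
    match(chunks[1], spoken)   # far lower, so run() would not highlight this chunk

Since the score is 1 - WER, it reaches 1.0 only when the spoken words exactly reproduce the chunk, and it drops by roughly 1/N for each word error against an N-word chunk.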