chaouch committed on
Commit
3a35ced
·
1 Parent(s): 129cbf7
__pycache__/recite_module.cpython-310.pyc ADDED
Binary file (8.75 kB). View file
 
app.py CHANGED
@@ -13,7 +13,7 @@ demo2 = gr.Interface(
13
  run,
14
  [gr.Audio(sources=["upload"]), gr.Image(
15
  type="filepath", label="Image")],
16
- gr.Image(type="pil", label="output Image")
17
  )
18
  with demo:
19
  gr.TabbedInterface([demo1, demo2],
 
13
  run,
14
  [gr.Audio(sources=["upload"]), gr.Image(
15
  type="filepath", label="Image")],
16
+ [gr.Image(type="pil", label="output Image")]
17
  )
18
  with demo:
19
  gr.TabbedInterface([demo1, demo2],
recite_module.py CHANGED
@@ -6,6 +6,8 @@ import cv2
6
  from PIL import Image
7
  from evaluate import load
8
  import librosa
 
 
9
 
10
  asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
11
  wer = load("wer")
@@ -21,8 +23,6 @@ def extract_text(image):
21
  Raises:
22
  ValueError: If the input image is not a PIL Image object.
23
  """
24
- if not isinstance(image, Image.Image):
25
- raise ValueError("Invalid input. Image should be a PIL Image object.")
26
 
27
  result = pytesseract.image_to_data(image, output_type='dict')
28
  n_boxes = len(result['level'])
@@ -57,8 +57,6 @@ def draw_rectangle(image, x, y, w, h, color=(0, 0, 255), thickness=2):
57
  Raises:
58
  ValueError: If the input image is not a PIL Image object.
59
  """
60
- if not isinstance(image, Image.Image):
61
- raise ValueError("Invalid input. Image should be a PIL Image object.")
62
 
63
  image_array = np.array(image)
64
  image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
@@ -93,7 +91,7 @@ def transcribe(audio):
93
  y /= np.max(np.abs(y))
94
 
95
  transcribed_text = asr(
96
- {"sampling_rate": sr, "raw": y}, language="en")["text"]
97
 
98
  return transcribed_text
99
 
@@ -137,7 +135,11 @@ def match(refence, spoken):
137
 
138
  if spoken == "":
139
  return 0
140
- wer_score = wer.compute(references=[refence], predictions=[spoken])
 
 
 
 
141
  score = 1 - wer_score
142
  return score
143
 
@@ -199,9 +201,6 @@ def process_image(im, data):
199
  Raises:
200
  ValueError: If the input image is not a PIL Image object or if the data is not a dictionary.
201
  """
202
- if not isinstance(im, Image.Image) or not isinstance(data, dict):
203
- raise ValueError(
204
- "Invalid input. Image should be a PIL Image object and data should be a dictionary.")
205
 
206
  im_array = np.array(im)
207
  hg, wg, _ = im_array.shape
@@ -244,9 +243,6 @@ def run(stream, image):
244
  raise ValueError(
245
  "Invalid input. Stream should be either a file path or a tuple of (sampling_rate, raw_audio).")
246
 
247
- if not isinstance(image, Image.Image):
248
- raise ValueError("Invalid input. Image should be a PIL Image object.")
249
-
250
  data = extract_text(image)
251
  im_text_ = [data[i]["text"] for i in range(len(data))]
252
  im_text = " ".join(im_text_)
@@ -255,10 +251,11 @@ def run(stream, image):
255
  im_array = np.array(Image.open(image))
256
  data2 = None
257
  for i in range(len(chunks)):
258
- if match(chunks[i], trns_text) > 0.5:
 
259
  data2 = reindex_data(data, index[i], l)
260
  break
261
  if data2 is not None:
262
  return process_image(im_array, data2)
263
  else:
264
- return im_array
 
6
  from PIL import Image
7
  from evaluate import load
8
  import librosa
9
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer
10
+
11
 
12
  asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
13
  wer = load("wer")
 
23
  Raises:
24
  ValueError: If the input image is not a PIL Image object.
25
  """
 
 
26
 
27
  result = pytesseract.image_to_data(image, output_type='dict')
28
  n_boxes = len(result['level'])
 
57
  Raises:
58
  ValueError: If the input image is not a PIL Image object.
59
  """
 
 
60
 
61
  image_array = np.array(image)
62
  image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
 
91
  y /= np.max(np.abs(y))
92
 
93
  transcribed_text = asr(
94
+ {"sampling_rate": sr, "raw": y})["text"]
95
 
96
  return transcribed_text
97
 
 
135
 
136
  if spoken == "":
137
  return 0
138
+ normalizer = BasicTextNormalizer()
139
+ spoken = clean_transcription(spoken)
140
+ predection = normalizer(spoken)
141
+ refence = normalizer(refence)
142
+ wer_score = wer.compute(references=[refence], predictions=[predection])
143
  score = 1 - wer_score
144
  return score
145
 
 
201
  Raises:
202
  ValueError: If the input image is not a PIL Image object or if the data is not a dictionary.
203
  """
 
 
 
204
 
205
  im_array = np.array(im)
206
  hg, wg, _ = im_array.shape
 
243
  raise ValueError(
244
  "Invalid input. Stream should be either a file path or a tuple of (sampling_rate, raw_audio).")
245
 
 
 
 
246
  data = extract_text(image)
247
  im_text_ = [data[i]["text"] for i in range(len(data))]
248
  im_text = " ".join(im_text_)
 
251
  im_array = np.array(Image.open(image))
252
  data2 = None
253
  for i in range(len(chunks)):
254
+ print(match(chunks[i], trns_text))
255
+ if match(chunks[i], trns_text) >= 0.10:
256
  data2 = reindex_data(data, index[i], l)
257
  break
258
  if data2 is not None:
259
  return process_image(im_array, data2)
260
  else:
261
+ return im_array