fix_doc
- __pycache__/recite_module.cpython-310.pyc +0 -0
- app.py +1 -1
- recite_module.py +11 -14
__pycache__/recite_module.cpython-310.pyc
ADDED
Binary file (8.75 kB)
app.py
CHANGED
@@ -13,7 +13,7 @@ demo2 = gr.Interface(
     run,
     [gr.Audio(sources=["upload"]), gr.Image(
         type="filepath", label="Image")],
-    gr.Image(type="pil", label="output Image")
+    [gr.Image(type="pil", label="output Image")]
 )
 with demo:
     gr.TabbedInterface([demo1, demo2],
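Note: gr.Interface accepts either a single component or a list for outputs, so wrapping the output image in a list mirrors the list already used for inputs. A minimal sketch of the corrected wiring, with a stub standing in for the real run from recite_module.py:

    import gradio as gr
    from PIL import Image

    def run(audio, image_path):
        # Stub: the real run() highlights the recited words on the image.
        return Image.open(image_path)

    demo2 = gr.Interface(
        run,
        [gr.Audio(sources=["upload"]), gr.Image(type="filepath", label="Image")],
        [gr.Image(type="pil", label="output Image")],
    )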
recite_module.py
CHANGED
@@ -6,6 +6,8 @@ import cv2
 from PIL import Image
 from evaluate import load
 import librosa
+from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+
 
 asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
 wer = load("wer")
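The new import is Whisper's lightweight text normalizer, used below in match() to make the WER comparison case- and punctuation-insensitive. A quick illustration of what it does:

    from transformers.models.whisper.english_normalizer import BasicTextNormalizer

    normalizer = BasicTextNormalizer()
    normalizer("Hello, World!")  # roughly "hello world": lowercased, punctuation stripped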
@@ -21,8 +23,6 @@ def extract_text(image):
     Raises:
         ValueError: If the input image is not a PIL Image object.
     """
-    if not isinstance(image, Image.Image):
-        raise ValueError("Invalid input. Image should be a PIL Image object.")
 
     result = pytesseract.image_to_data(image, output_type='dict')
     n_boxes = len(result['level'])
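For context, extract_text() builds on pytesseract's image_to_data, which with output_type='dict' returns parallel lists keyed by field name; 'level' is one such list, so its length gives the box count. A small sketch ("page.png" is a stand-in path):

    import pytesseract
    from PIL import Image

    result = pytesseract.image_to_data(Image.open("page.png"), output_type='dict')
    n_boxes = len(result['level'])
    for i in range(n_boxes):
        # Each field is a list indexed by box: the text plus its bounding rectangle.
        print(result['text'][i], result['left'][i], result['top'][i],
              result['width'][i], result['height'][i])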
@@ -57,8 +57,6 @@ def draw_rectangle(image, x, y, w, h, color=(0, 0, 255), thickness=2):
     Raises:
         ValueError: If the input image is not a PIL Image object.
     """
-    if not isinstance(image, Image.Image):
-        raise ValueError("Invalid input. Image should be a PIL Image object.")
 
     image_array = np.array(image)
     image_array = cv2.cvtColor(image_array, cv2.COLOR_RGB2BGR)
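The surviving context shows the PIL-to-OpenCV handoff: PIL arrays are RGB while OpenCV draws in BGR, hence the cvtColor call. A self-contained sketch of the round trip:

    import cv2
    import numpy as np
    from PIL import Image

    image = Image.new("RGB", (100, 100), "white")           # stand-in image
    arr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    cv2.rectangle(arr, (10, 10), (60, 40), (0, 0, 255), 2)  # red box, BGR order
    result = Image.fromarray(cv2.cvtColor(arr, cv2.COLOR_BGR2RGB))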
@@ -93,7 +91,7 @@ def transcribe(audio):
     y /= np.max(np.abs(y))
 
     transcribed_text = asr(
-        {"sampling_rate": sr, "raw": y}
+        {"sampling_rate": sr, "raw": y})["text"]
 
     return transcribed_text
 
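The fix matters because the ASR pipeline returns a dict, not a string; without ["text"] the function handed a dict to downstream string code. A sketch using a second of silence as stand-in audio:

    import numpy as np
    from transformers import pipeline

    asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")
    y = np.zeros(16000, dtype=np.float32)        # 1 s of silence at 16 kHz
    out = asr({"sampling_rate": 16000, "raw": y})
    out          # {'text': '...'}: a dict, hence the ["text"] indexing
    out["text"]  # the transcription string itself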
@@ -137,7 +135,11 @@ def match(refence, spoken):
 
     if spoken == "":
         return 0
-    wer_score = wer.compute(references=[refence], predictions=[spoken])
+    normalizer = BasicTextNormalizer()
+    spoken = clean_transcription(spoken)
+    predection = normalizer(spoken)
+    refence = normalizer(refence)
+    wer_score = wer.compute(references=[refence], predictions=[predection])
     score = 1 - wer_score
     return score
 
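With both strings normalized, the score is 1 minus the word error rate, so identical wording yields 1.0 regardless of case or punctuation. (WER can exceed 1 on insertion-heavy mistakes, so the score can go negative.) A sketch of the computation:

    from evaluate import load
    from transformers.models.whisper.english_normalizer import BasicTextNormalizer

    wer = load("wer")
    normalizer = BasicTextNormalizer()

    reference = normalizer("The quick brown fox.")
    prediction = normalizer("the quick brown fox")
    score = 1 - wer.compute(references=[reference], predictions=[prediction])
    # score == 1.0: the strings agree once normalized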
@@ -199,9 +201,6 @@ def process_image(im, data):
     Raises:
         ValueError: If the input image is not a PIL Image object or if the data is not a dictionary.
     """
-    if not isinstance(im, Image.Image) or not isinstance(data, dict):
-        raise ValueError(
-            "Invalid input. Image should be a PIL Image object and data should be a dictionary.")
 
     im_array = np.array(im)
     hg, wg, _ = im_array.shape
@@ -244,9 +243,6 @@ def run(stream, image):
         raise ValueError(
             "Invalid input. Stream should be either a file path or a tuple of (sampling_rate, raw_audio).")
 
-    if not isinstance(image, Image.Image):
-        raise ValueError("Invalid input. Image should be a PIL Image object.")
-
    data = extract_text(image)
    im_text_ = [data[i]["text"] for i in range(len(data))]
    im_text = " ".join(im_text_)
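Removing the isinstance(image, Image.Image) guard is consistent with app.py, where gr.Image(type="filepath") hands run() a path string rather than a PIL Image, so the guard rejected valid input (run later calls Image.open(image) itself). A hypothetical accept-both helper, not part of the commit, to show the distinction:

    from PIL import Image

    def open_if_path(image):
        # Accept either a filepath string or an already-open PIL Image.
        return Image.open(image) if isinstance(image, str) else image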
@@ -255,10 +251,11 @@ def run(stream, image):
     im_array = np.array(Image.open(image))
     data2 = None
     for i in range(len(chunks)):
-        if match(chunks[i], trns_text):
+        print(match(chunks[i], trns_text))
+        if match(chunks[i], trns_text) >= 0.10:
             data2 = reindex_data(data, index[i], l)
             break
     if data2 is not None:
         return process_image(im_array, data2)
     else:
-        return im_array
+        return im_array
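The loop now logs each chunk's score and picks the first chunk clearing a 0.10 similarity floor. The selection pattern, extracted as a standalone sketch (names are illustrative):

    def first_match(chunks, spoken, score_fn, threshold=0.10):
        # Return the index of the first chunk whose score clears the floor.
        for i, chunk in enumerate(chunks):
            if score_fn(chunk, spoken) >= threshold:
                return i
        return None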