Update app.py
app.py
CHANGED
@@ -6,9 +6,6 @@ import os
 import uuid
 import json
 
-import jieba
-
-import nemo.collections.asr as nemo_asr
 from nemo.collections.asr.models import ASRModel
 from nemo.utils import logging
 
@@ -17,17 +14,6 @@ from align import main, AlignmentConfig, ASSFileConfig
 
 SAMPLE_RATE = 16000
 
-# Pre-download and cache the model in disk space
-logging.setLevel(logging.ERROR)
-for tmp_model_name in [
-    "stt_en_fastconformer_hybrid_large_pc",
-    "stt_de_fastconformer_hybrid_large_pc",
-    "stt_es_fastconformer_hybrid_large_pc",
-    "stt_fr_conformer_ctc_large",
-    "stt_zh_citrinet_1024_gamma_0_25",
-]:
-    tmp_model = ASRModel.from_pretrained(tmp_model_name, map_location='cpu')
-    del tmp_model
 logging.setLevel(logging.INFO)
 
 
@@ -102,9 +88,7 @@ def delete_mp4s_except_given_filepath(filepath):
         os.remove(mp4_file)
 
 
-
-
-def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
+def align(Microphone, File_Upload, text, col1, col2, col3, split_on_newline, progress=gr.Progress()):
     # Create utt_id, specify output_video_filepath and delete any MP4s
     # that are not that filepath. These stray MP4s can be created
     # if a user refreshes or exits the page while this 'align' function is executing.
@@ -115,24 +99,15 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
     delete_mp4s_except_given_filepath(output_video_filepath)
 
     output_info = ""
+    ass_text=""
 
     progress(0, desc="Validating input")
 
-    # choose model
-    if lang in ["en", "de", "es"]:
-        model_name = f"stt_{lang}_fastconformer_hybrid_large_pc"
-    elif lang in ["fr"]:
-        model_name = f"stt_{lang}_conformer_ctc_large"
-    elif lang in ["zh"]:
-        model_name = f"stt_{lang}_citrinet_1024_gamma_0_25"
-
     # decide which of Mic / File_Upload is used as input & do error handling
     if (Microphone is not None) and (File_Upload is not None):
         raise gr.Error("Please use either the microphone or file upload input - not both")
-
     elif (Microphone is None) and (File_Upload is None):
         raise gr.Error("You have to either use the microphone or upload an audio file")
-
     elif Microphone is not None:
         file = Microphone
     else:
@@ -148,6 +123,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
 
     # loading model
     progress(0.1, desc="Loading speech recognition model")
+    model_name = "ayymen/stt_zgh_fastconformer_ctc_small"
     model = ASRModel.from_pretrained(model_name)
 
     if text: # check input text is not too long compared to audio
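Note: together with the removals above, this pins the app to a single Tamazight checkpoint instead of picking a model per language. ASRModel.from_pretrained with an org/name identifier fetches the checkpoint from the Hugging Face Hub on first use and reuses the local cache afterwards. A minimal sketch (the audio filename is a placeholder, not a file from this repo):

    from nemo.collections.asr.models import ASRModel

    # first call downloads and caches the checkpoint; later calls reuse the cache
    model = ASRModel.from_pretrained("ayymen/stt_zgh_fastconformer_ctc_small")

    # sanity check on one 16 kHz mono clip ("sample.wav" is a placeholder path)
    print(model.transcribe(["sample.wav"]))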
@@ -185,9 +161,9 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
             " transcription errors, and clicking 'Submit' again."
         )
 
-
-    if lang == "zh":
-        text = " ".join(jieba.cut(text))
+    # split text on new lines if requested
+    if split_on_newline:
+        text = "|".join(list(filter(None, text.split("\n"))))
 
     data = {
         "audio_filepath": audio_path,
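This hunk replaces the old jieba-based Chinese text handling (the removed lines above are partly reconstructed; the extraction truncated them) with a language-independent option: when the new checkbox is ticked, every non-empty line of the reference text becomes one segment, using the same '|' separator that NFA already treats as a segment boundary. The transform in isolation:

    text = "first line\n\nsecond line\nthird line"
    text = "|".join(list(filter(None, text.split("\n"))))
    # text == "first line|second line|third line" -- empty lines are dropped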
@@ -213,7 +189,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
         additional_segment_grouping_separator="|",
         # transcribe_device='cpu',
         # viterbi_device='cpu',
-        save_output_file_formats=["ass"],
+        save_output_file_formats=["ass", "ctm"],
         ass_file_config=ASSFileConfig(
             fontsize=45,
             resegment_text_to_fill_space=resegment_text_to_fill_space,
@@ -231,12 +207,11 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
     progress(0.95, desc="Saving generated alignments")
 
 
-
-
-
-
-
-    ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
+    # make video file from the word-level ASS file
+    ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
+
+    with open(ass_file_for_video, "r") as ass_file:
+        ass_text = ass_file.read()
 
     ffmpeg_command = (
         f"ffmpeg -y -i {audio_path} "
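The word-level ASS file now does double duty: it is read into ass_text so it can be written out for download after the ffmpeg call, and it is still burned into the output video. Only the first fragment of the unchanged ffmpeg_command is visible in this hunk; a plausible shape for such a command is sketched below (the solid-color background and resolution are assumptions, not values taken from this diff):

    # hypothetical reconstruction, for orientation only
    ffmpeg_command = (
        f"ffmpeg -y -i {audio_path} "
        f"-f lavfi -i color=c=white:s=1280x720 "
        f"-vf ass={ass_file_for_video} -shortest {output_video_filepath}"
    )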
@@ -248,7 +223,28 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
 
     os.system(ffmpeg_command)
 
-    return output_video_filepath, gr.update(value=output_info, visible=True if output_info else False), output_video_filepath
+    # save ASS file
+    ass_path = "word_level.ass"
+    with open(ass_path, "w", encoding="utf-8") as f:
+        f.write(ass_text)
+
+    # save word-level CTM file
+    with open(f"{tmpdir}/nfa_output/ctm/words/{utt_id}.ctm", "r") as word_ctm_file:
+        word_ctm_text = word_ctm_file.read()
+
+    word_ctm_path = "word_level.ctm"
+    with open(word_ctm_path, "w", encoding="utf-8") as f:
+        f.write(word_ctm_text)
+
+    # save segment-level CTM file
+    with open(f"{tmpdir}/nfa_output/ctm/segments/{utt_id}.ctm", "r") as segment_ctm_file:
+        segment_ctm_text = segment_ctm_file.read()
+
+    segment_ctm_path = "segment_level.ctm"
+    with open(segment_ctm_path, "w", encoding="utf-8") as f:
+        f.write(segment_ctm_text)
+
+    return output_video_filepath, gr.update(value=output_info, visible=True if output_info else False), output_video_filepath, gr.update(value=ass_path, visible=True), gr.update(value=word_ctm_path, visible=True), gr.update(value=segment_ctm_path, visible=True)
 
 
 def delete_non_tmp_video(video_path):
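CTM is a plain one-token-per-line format: each line NFA writes holds the utterance id, a channel field, the start time and duration in seconds, and the aligned token. A small sketch of reading one word-level line (the values are illustrative, not from a real run):

    line = "utt0 1 0.08 0.32 ⵎⵍ"  # illustrative CTM line
    utt_id, channel, start, duration, word = line.split(" ", 4)
    print(word, float(start), float(start) + float(duration))  # word, start, end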
@@ -273,14 +269,16 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 
         with gr.Column(scale=1):
             gr.Markdown("## Input")
-            lang_drop = gr.Dropdown(choices=["de", "en", "es", "fr", "zh"], value="en", label="Audio language",)
-
             mic_in = gr.Audio(sources=["microphone"], type='filepath', label="Microphone input (max 4 mins)")
             audio_file_in = gr.Audio(sources=["upload"], type='filepath', label="File upload (max 4 mins)")
             ref_text = gr.Textbox(
                 label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
                 "Leave this field blank to use an ASR model's transcription as the reference text instead."
             )
+            split_on_newline = gr.Checkbox(
+                True,
+                label="Separate text on new lines",
+            )
 
             gr.Markdown("[Optional] For fun - adjust the colors of the text in the output video")
             with gr.Row():
@@ -292,8 +290,11 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 
         with gr.Column(scale=1):
            gr.Markdown("## Output")
-            video_out = gr.Video(label="
-            text_out = gr.Textbox(label="
+            video_out = gr.Video(label="Output Video")
+            text_out = gr.Textbox(label="Output Info", visible=False)
+            ass_file = gr.File(label="ASS File", visible=False)
+            word_ctm_file = gr.File(label="Word-level CTM File", visible=False)
+            segment_ctm_file = gr.File(label="Segment-level CTM File", visible=False)
 
     with gr.Row():
         gr.HTML(
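The two removed lines above are truncated in the diff source; their original label strings are not recoverable. The three new gr.File components start hidden, and align reveals them by returning gr.update(value=..., visible=True) for each, the usual Gradio pattern for setting a component's value and visibility in one return. A minimal self-contained sketch of the same pattern (all names are illustrative):

    import gradio as gr

    def generate():
        path = "result.txt"  # hypothetical artifact produced by the handler
        with open(path, "w", encoding="utf-8") as f:
            f.write("hello")
        # a single gr.update sets the file value and unhides the component
        return gr.update(value=path, visible=True)

    with gr.Blocks() as demo:
        btn = gr.Button("Generate")
        out_file = gr.File(label="Result", visible=False)
        btn.click(fn=generate, inputs=None, outputs=out_file)

    demo.launch()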
@@ -306,12 +307,26 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
 
     submit_button.click(
         fn=align,
-        inputs=[lang_drop, mic_in, audio_file_in, ref_text, col1, col2, col3],
-        outputs=[video_out, text_out, non_tmp_output_video_filepath],
+        inputs=[mic_in, audio_file_in, ref_text, col1, col2, col3, split_on_newline],
+        outputs=[video_out, text_out, non_tmp_output_video_filepath, ass_file, word_ctm_file, segment_ctm_file],
     ).then(
         fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
     )
+    example_2 = """ⵜⴰⴽⵟⵟⵓⵎⵜ ⵏ ⵜⵙⴰⴷⵓⴼⵜ.
+ⵙ ⵉⵙⵎ ⵏ ⵕⴱⴱⵉ ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ.
+ⴰⵎⵓⵢ ⵉ ⵕⴱⴱⵉ ⵍⵍⵉ ⵎⵓ ⵜⴳⴰ ⵜⵓⵍⵖⵉⵜ ⵜⵉⵏⵏⵙ, ⵕⴱⴱⵉ ⵏ ⵉⵖⵥⵡⴰⵕⵏ, ⴽⵔⴰ ⴳⴰⵏ.
+ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ, ⵖ ⵜⵎⵣⵡⴰⵔⵓⵜ ⵓⵍⴰ ⵖ ⵜⵎⴳⴳⴰⵔⵓⵜ.
+ⴰⴳⵍⵍⵉⴷ ⵏ ⵡⴰⵙⵙ ⵏ ⵓⴼⵔⴰ, ⴰⵙⵙ ⵏ ⵓⵙⵙⵃⵙⵓ, ⴽⵔⴰⵉⴳⴰⵜ ⵢⴰⵏ ⴷ ⵎⴰⴷ ⵉⵙⴽⵔ.
+ⵀⴰ ⵏⵏ ⴽⵢⵢⵉ ⴽⴰ ⵙ ⵏⵙⵙⵓⵎⴷ, ⴷ ⴽⵢⵢⵉ ⴽⴰ ⴰⴷ ⵏⵎⵎⵜⵔ.
+ⵙⵎⵓⵏ ⴰⵖ, ⵜⵎⵍⵜ ⴰⵖ, ⴰⵖⴰⵔⴰⵙ ⵢⵓⵖⴷⵏ.
+ⴰⵖⴰⵔⴰⵙ ⵏ ⵖⵡⵉⵍⵍⵉ ⵜⵙⵏⵏⵓⴼⴰⵜ, ⵓⵔ ⴷ ⴰⵢⵜ ⵜⵉⵢⵓⵔⵉ, ⵓⵍⴰ ⵉⵎⵓⴹⴹⴰⵕ."""
+    examples = gr.Examples(
+        examples=[
+            ["common_voice_zgh_37837257.mp3", "ⵎⵍ ⵉⵢⵉ ⵎⴰⴷ ⴷ ⵜⴻⵜⵜⵎⵓⵏⴷ ⴰⴷ ⴰⴽ ⵎⵍⵖ ⵎⴰⴷ ⵜⴳⵉⴷ"],
+            ["Voice1410.wav", example_2]
+        ],
+        inputs=[audio_file_in, ref_text]
+    )
 
 demo.queue()
 demo.launch()
-
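Clicking either example row prefills audio_file_in and ref_text. The multi-line example_2 string exercises the new newline-splitting path, one Tifinagh line per subtitle segment; for the rows to load, common_voice_zgh_37837257.mp3 and Voice1410.wav must be committed to the Space repository alongside app.py. (The removed inputs line above is reconstructed from the old align signature; the diff source truncated it.)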