Add option to split on \n. Add .ass output
#1
by
DEVMAXXING
- opened
app.py
CHANGED
@@ -104,7 +104,7 @@ def delete_mp4s_except_given_filepath(filepath):
|
|
104 |
|
105 |
|
106 |
|
107 |
-
def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Progress()):
|
108 |
# Create utt_id, specify output_video_filepath and delete any MP4s
|
109 |
# that are not that filepath. These stray MP4s can be created
|
110 |
# if a user refreshes or exits the page while this 'align' function is executing.
|
@@ -115,6 +115,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
|
|
115 |
delete_mp4s_except_given_filepath(output_video_filepath)
|
116 |
|
117 |
output_info = ""
|
|
|
118 |
|
119 |
progress(0, desc="Validating input")
|
120 |
|
@@ -197,6 +198,10 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
|
|
197 |
with open(manifest_path, 'w') as fout:
|
198 |
fout.write(f"{json.dumps(data)}\n")
|
199 |
|
|
|
|
|
|
|
|
|
200 |
# run alignment
|
201 |
if "|" in text:
|
202 |
resegment_text_to_fill_space = False
|
@@ -238,6 +243,9 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
|
|
238 |
# make video file from the word-level ASS file
|
239 |
ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
|
240 |
|
|
|
|
|
|
|
241 |
ffmpeg_command = (
|
242 |
f"ffmpeg -y -i {audio_path} "
|
243 |
"-f lavfi -i color=c=white:s=1280x720:r=50 "
|
@@ -248,7 +256,7 @@ def align(lang, Microphone, File_Upload, text, col1, col2, col3, progress=gr.Pro
|
|
248 |
|
249 |
os.system(ffmpeg_command)
|
250 |
|
251 |
-
return output_video_filepath, gr.update(value=output_info, visible=True), output_video_filepath
|
252 |
|
253 |
|
254 |
def delete_non_tmp_video(video_path):
|
@@ -281,6 +289,9 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
|
|
281 |
label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
|
282 |
"Leave this field blank to use an ASR model's transcription as the reference text instead."
|
283 |
)
|
|
|
|
|
|
|
284 |
|
285 |
gr.Markdown("[Optional] For fun - adjust the colors of the text in the output video")
|
286 |
with gr.Row():
|
@@ -294,6 +305,7 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
|
|
294 |
gr.Markdown("## Output")
|
295 |
video_out = gr.Video(label="output video")
|
296 |
text_out = gr.Textbox(label="output info", visible=False)
|
|
|
297 |
|
298 |
with gr.Row():
|
299 |
gr.HTML(
|
@@ -306,8 +318,8 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
|
|
306 |
|
307 |
submit_button.click(
|
308 |
fn=align,
|
309 |
-
inputs=[lang_drop, mic_in, audio_file_in, ref_text, col1, col2, col3,],
|
310 |
-
outputs=[video_out, text_out, non_tmp_output_video_filepath],
|
311 |
).then(
|
312 |
fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
|
313 |
)
|
|
|
104 |
|
105 |
|
106 |
|
107 |
+
def align(lang, Microphone, File_Upload, text, col1, col2, col3, split_on_newline, progress=gr.Progress()):
|
108 |
# Create utt_id, specify output_video_filepath and delete any MP4s
|
109 |
# that are not that filepath. These stray MP4s can be created
|
110 |
# if a user refreshes or exits the page while this 'align' function is executing.
|
|
|
115 |
delete_mp4s_except_given_filepath(output_video_filepath)
|
116 |
|
117 |
output_info = ""
|
118 |
+
ass_text=""
|
119 |
|
120 |
progress(0, desc="Validating input")
|
121 |
|
|
|
198 |
with open(manifest_path, 'w') as fout:
|
199 |
fout.write(f"{json.dumps(data)}\n")
|
200 |
|
201 |
+
# split text on new lines if requested
|
202 |
+
if split_on_newline:
|
203 |
+
text = "|".join(list(filter(None, text.split("\n"))))
|
204 |
+
|
205 |
# run alignment
|
206 |
if "|" in text:
|
207 |
resegment_text_to_fill_space = False
|
|
|
243 |
# make video file from the word-level ASS file
|
244 |
ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
|
245 |
|
246 |
+
with open(ass_file_for_video, "r") as ass_file:
|
247 |
+
ass_text = ass_file.read()
|
248 |
+
|
249 |
ffmpeg_command = (
|
250 |
f"ffmpeg -y -i {audio_path} "
|
251 |
"-f lavfi -i color=c=white:s=1280x720:r=50 "
|
|
|
256 |
|
257 |
os.system(ffmpeg_command)
|
258 |
|
259 |
+
return output_video_filepath, gr.update(value=output_info, visible=True), output_video_filepath, ass_text
|
260 |
|
261 |
|
262 |
def delete_non_tmp_video(video_path):
|
|
|
289 |
label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
|
290 |
"Leave this field blank to use an ASR model's transcription as the reference text instead."
|
291 |
)
|
292 |
+
split_on_newline = gr.Checkbox(
|
293 |
+
label="Separate text on new lines", default=False
|
294 |
+
)
|
295 |
|
296 |
gr.Markdown("[Optional] For fun - adjust the colors of the text in the output video")
|
297 |
with gr.Row():
|
|
|
305 |
gr.Markdown("## Output")
|
306 |
video_out = gr.Video(label="output video")
|
307 |
text_out = gr.Textbox(label="output info", visible=False)
|
308 |
+
ass_out = gr.Textbox(label="output .ass")
|
309 |
|
310 |
with gr.Row():
|
311 |
gr.HTML(
|
|
|
318 |
|
319 |
submit_button.click(
|
320 |
fn=align,
|
321 |
+
inputs=[lang_drop, mic_in, audio_file_in, ref_text, col1, col2, col3,split_on_newline,],
|
322 |
+
outputs=[video_out, text_out, non_tmp_output_video_filepath, ass_out],
|
323 |
).then(
|
324 |
fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
|
325 |
)
|