erastorgueva-nv committed
Commit • abb41a8
1 Parent(s): 700a61a
get latest NFA which should ensure subtitles show until end of video
utils/make_ass_files.py +68 -8
utils/make_ass_files.py
CHANGED
@@ -23,7 +23,9 @@ For the word-level ASS files, the text will be highlighted word-by-word, with th
 by the NFA alignemtns.
 """
 
+import math
 import os
+import soundfile as sf
 
 from utils.constants import BLANK_TOKEN, SPACE_TOKEN
 from utils.data_prep import Segment, Token, Word
@@ -74,8 +76,13 @@ def make_ass_files(
     if ass_file_config.resegment_text_to_fill_space:
         utt_obj = resegment_utt_obj(utt_obj, ass_file_config)
 
-    utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config,)
-    utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config,)
+    # get duration of the utterance, so we know the final timestamp of the final set of subtitles,
+    # which we will keep showing until the end
+    with sf.SoundFile(utt_obj.audio_filepath) as f:
+        audio_dur = f.frames / f.samplerate
+
+    utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)
+    utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)
 
     return utt_obj
 
@@ -166,9 +173,7 @@ def resegment_utt_obj(utt_obj, ass_file_config):
     return utt_obj
 
 
-def make_word_level_ass_file(
-    utt_obj, output_dir_root, ass_file_config,
-):
+def make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):
 
     default_style_dict = {
         "Name": "Default",
@@ -298,14 +303,33 @@ def make_word_level_ass_file(
             )
             f.write(subtitle_text + '\n')
 
+        # write final set of subtitles for text after speech has been spoken
+        words_in_final_segment = []
+        for segment_or_token in utt_obj.segments_and_tokens[::-1]:
+            if type(segment_or_token) is Segment:
+                final_segment = segment_or_token
+
+                for word_or_token in final_segment.words_and_tokens:
+                    if type(word_or_token) is Word:
+                        words_in_final_segment.append(word_or_token)
+                break
+
+        text_after_speech = already_spoken_color_code + " ".join([x.text for x in words_in_final_segment]) + r"{\r}"
+        # note: for now doing some extra padding with math.ceil(audio_dur)+1) to account for the fact that the video with subtitles can become
+        # longer than the original audio during the MP4 creation stage.
+        subtitle_text = (
+            f"Dialogue: 0,{seconds_to_ass_format(words_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
+            + text_after_speech.rstrip()
+        )
+
+        f.write(subtitle_text + '\n')
+
     utt_obj.saved_output_files[f"words_level_ass_filepath"] = output_file
 
     return utt_obj
 
 
-def make_token_level_ass_file(
-    utt_obj, output_dir_root, ass_file_config,
-):
+def make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):
 
     default_style_dict = {
         "Name": "Default",
@@ -457,6 +481,42 @@ def make_token_level_ass_file(
             )
             f.write(subtitle_text + '\n')
 
+        # Write final set of subtitles for text after speech has been spoken.
+        # To do this, we need to collect 'tokens_in_final_segment' so that we know what the final line is.
+        tokens_in_final_segment = []
+        for segment_or_token in utt_obj.segments_and_tokens[::-1]:
+            # Collect tokens from final segment - will 'break' so we only look at the final one.
+            if type(segment_or_token) is Segment:
+                # 'segment_or_token' is known to be Segment, which has attribute 'words_and_tokens'
+                for word_or_token in segment_or_token.words_and_tokens:
+                    if type(word_or_token) is Token:
+                        if word_or_token.text != BLANK_TOKEN:
+                            tokens_in_final_segment.append(word_or_token)
+                    else:
+                        # 'word_or_token' is known to be a Word, which has attribute 'tokens'
+                        for token in word_or_token.tokens:
+                            if token.text != BLANK_TOKEN:
+                                tokens_in_final_segment.append(token)
+                break
+
+        for token in tokens_in_final_segment:
+            token.text_cased = token.text_cased.replace(
+                "▁", " "
+            )  # replace underscores used in subword tokens with spaces
+            token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space
+
+        text_after_speech = (
+            already_spoken_color_code + "".join([x.text_cased for x in tokens_in_final_segment]) + r"{\r}"
+        )
+        # note: for now doing some extra padding with math.ceil(audio_dur)+1) to account for the fact that the video with subtitles can become
+        # longer than the original audio during the MP4 creation stage.
+        subtitle_text = (
+            f"Dialogue: 0,{seconds_to_ass_format(tokens_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
+            + text_after_speech.rstrip()
+        )
+
+        f.write(subtitle_text + '\n')
+
     utt_obj.saved_output_files[f"tokens_level_ass_filepath"] = output_file
 
     return utt_obj
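
The change amounts to two steps: measure the audio duration once in make_ass_files, then have each writer append one extra Dialogue event that starts when the last aligned word (or token) ends and finishes slightly after the audio does, so the final subtitle stays on screen until the end of the video. Below is a minimal standalone sketch of that idea, assuming a simplified seconds_to_ass_format stand-in (the repo's real helper is not shown in this diff); the audio path and the last-word end time are placeholders.

import math

import soundfile as sf


def seconds_to_ass_format(seconds_float):
    # Simplified stand-in for the repo's seconds_to_ass_format helper (assumption):
    # ASS Dialogue timestamps use the form H:MM:SS.cc (centisecond precision).
    seconds_float = max(seconds_float, 0.0)
    hours = int(seconds_float // 3600)
    minutes = int((seconds_float % 3600) // 60)
    seconds = seconds_float % 60
    return f"{hours}:{minutes:02d}:{seconds:05.2f}"


# Duration of the source audio, computed the same way as in the commit.
with sf.SoundFile("example.wav") as f:  # "example.wav" is a placeholder path
    audio_dur = f.frames / f.samplerate

# One extra Dialogue event: it starts when the last aligned word ends and its end time is
# padded past the audio duration, so the last line keeps showing until the video ends.
last_word_end = 12.34  # placeholder for words_in_final_segment[-1].t_end
final_event = (
    f"Dialogue: 0,{seconds_to_ass_format(last_word_end)},"
    f"{seconds_to_ass_format(math.ceil(audio_dur) + 1)},Default,,0,0,0,,last line of text"
)
print(final_event)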
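
Two design points in the diff are worth calling out. The end timestamp is padded to math.ceil(audio_dur) + 1 because, as the in-code note says, the video produced at the MP4 creation stage can come out slightly longer than the original audio, and an unpadded end time would let the last subtitle vanish before the video finishes. The held text is wrapped as already_spoken_color_code + text + {\r}: in ASS, the \r override tag resets formatting back to the event's own style (Default here), so the final line renders entirely in the "already spoken" colour without affecting later events. The same logic is added to both writers; the token-level version additionally drops BLANK_TOKEN entries and converts the subword marker "▁" and SPACE_TOKEN back to plain spaces before joining the tokens into text.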