# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file contains functions for making ASS-format subtitle files based on the generated alignment.
ASS files can be generated highlighting token-level alignments or word-level alignments.
In both cases, 'segment' boundaries will be used to determine which parts of the text will appear
at the same time.
For the token-level ASS files, the text will be highlighted token-by-token, with the timings determined
by the NFA alignments.
For the word-level ASS files, the text will be highlighted word-by-word, with the timings determined
by the NFA alignments.
"""
import math
import os

import soundfile as sf

from utils.constants import BLANK_TOKEN, SPACE_TOKEN
from utils.data_prep import Segment, Token, Word
# ASS script playback resolution (written as PlayResX/PlayResY in the header)
# and the style margins used in the [V4+ Styles] section. These values also
# drive the how-many-characters-fit-on-screen heuristic in resegment_utt_obj.
PLAYERRESX = 384
PLAYERRESY = 288
MARGINL = 10
MARGINR = 10
MARGINV = 20
def seconds_to_ass_format(seconds_float):
    """Convert a duration in seconds to the ASS timestamp format ``HH:MM:SS.ss``.

    Args:
        seconds_float: duration in seconds (anything convertible to ``float``).

    Returns:
        Timestamp string such as ``"01:02:03.45"`` — hours and minutes
        zero-padded to two digits, seconds with exactly two decimal places.
    """
    seconds_float = float(seconds_float)
    mm, ss = divmod(seconds_float, 60)
    hh, mm = divmod(mm, 60)
    # hh and mm are whole-number floats after divmod, so int() is exact;
    # format specs replace the original manual '0'-prefix padding.
    return f"{int(hh):02d}:{int(mm):02d}:{ss:05.2f}"
def rgb_list_to_hex_bgr(rgb_list):
    """Convert an ``[r, g, b]`` list (components 0-255) to a BGR hex string
    as used in ASS color override tags (e.g. ``{\\c&H<bgr>&}``).

    Bug fix: each component is now zero-padded to two hex digits; the previous
    ``{b:x}`` formatting produced a single digit for values < 16, yielding a
    malformed (shorter than six-character) ASS color code.
    """
    r, g, b = rgb_list
    return f"{b:02x}{g:02x}{r:02x}"
def make_ass_files(
    utt_obj, output_dir_root, ass_file_config,
):
    """Generate word-level and token-level ASS subtitle files for an utterance.

    Optionally resegments the text to fill the on-screen space first, then
    delegates to the word-level and token-level writers. Returns ``utt_obj``
    (with saved output filepaths recorded by the writers).
    """
    # Nothing to align — e.g. the ground-truth text was empty, or there were
    # too many tokens for the audio duration — so there is nothing to write.
    if not utt_obj.segments_and_tokens:
        return utt_obj

    if ass_file_config.resegment_text_to_fill_space:
        utt_obj = resegment_utt_obj(utt_obj, ass_file_config)

    # The audio duration determines the end timestamp of the final subtitle
    # event, which stays on screen until the end of the clip.
    with sf.SoundFile(utt_obj.audio_filepath) as audio_file:
        audio_dur = audio_file.frames / audio_file.samplerate

    utt_obj = make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)
    utt_obj = make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur)

    return utt_obj
def _get_word_n_chars(word):
    """Return the number of characters in ``word``, ignoring blank tokens."""
    return sum(len(token.text) for token in word.tokens if token.text != BLANK_TOKEN)
def _get_segment_n_chars(segment):
    """Return the number of displayed characters in ``segment``.

    A space token counts as one character, blank tokens contribute nothing,
    and every other word/token contributes the length of its text.
    """
    total = 0
    for item in segment.words_and_tokens:
        if item.text == SPACE_TOKEN:
            total += 1
        elif item.text != BLANK_TOKEN:
            total += len(item.text)
    return total
def resegment_utt_obj(utt_obj, ass_file_config):
    """Re-split ``utt_obj.segments_and_tokens`` into new segments sized so each
    fills (but does not overflow) one subtitle 'slide'.

    The per-segment character budget is estimated from the player resolution,
    margins and font size, capped by ``ass_file_config.max_lines_per_segment``.
    Mutates and returns ``utt_obj``.
    """
    # get list of just all words and tokens
    all_words_and_tokens = []
    for segment_or_token in utt_obj.segments_and_tokens:
        if type(segment_or_token) is Segment:
            all_words_and_tokens.extend(segment_or_token.words_and_tokens)
        else:
            all_words_and_tokens.append(segment_or_token)

    # figure out how many chars will fit into one 'slide' and thus should be the max
    # size of a segment
    approx_chars_per_line = (PLAYERRESX - MARGINL - MARGINR) / (
        ass_file_config.fontsize * 0.6
    )  # assume chars 0.6 as wide as they are tall
    approx_lines_per_segment = (PLAYERRESY - MARGINV) / (
        ass_file_config.fontsize * 1.15
    )  # assume line spacing is 1.15
    if approx_lines_per_segment > ass_file_config.max_lines_per_segment:
        approx_lines_per_segment = ass_file_config.max_lines_per_segment

    max_chars_per_segment = int(approx_chars_per_line * approx_lines_per_segment)

    new_segments_and_tokens = []
    all_words_and_tokens_pointer = 0
    # leading tokens (before the first Word) stay outside any segment
    for word_or_token in all_words_and_tokens:
        if type(word_or_token) is Token:
            new_segments_and_tokens.append(word_or_token)
            all_words_and_tokens_pointer += 1
        else:
            break

    new_segments_and_tokens.append(Segment())

    while all_words_and_tokens_pointer < len(all_words_and_tokens):
        word_or_token = all_words_and_tokens[all_words_and_tokens_pointer]
        if type(word_or_token) is Word:
            # if this is going to be the first word in the segment, we definitely want
            # to add it to the segment
            if not new_segments_and_tokens[-1].words_and_tokens:
                new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)
            else:
                # if not the first word, check what the new length of the segment will be
                # if short enough - add this word to this segment;
                # if too long - add to a new segment
                this_word_n_chars = _get_word_n_chars(word_or_token)
                segment_so_far_n_chars = _get_segment_n_chars(new_segments_and_tokens[-1])
                if this_word_n_chars + segment_so_far_n_chars < max_chars_per_segment:
                    new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)
                else:
                    new_segments_and_tokens.append(Segment())
                    new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)
        else:  # i.e. word_or_token is a token
            # currently this breaks the convention of tokens at the end/beginning
            # of segments being listed as separate tokens in segment.word_and_tokens
            # TODO: change code so we follow this convention
            new_segments_and_tokens[-1].words_and_tokens.append(word_or_token)

        all_words_and_tokens_pointer += 1

    utt_obj.segments_and_tokens = new_segments_and_tokens

    return utt_obj
def make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):
    """Write an ASS subtitle file that highlights the text word-by-word.

    The file is saved to ``<output_dir_root>/ass/words/<utt_id>.ass`` and its
    path is recorded in ``utt_obj.saved_output_files``.

    Args:
        utt_obj: utterance object whose ``segments_and_tokens`` hold the
            word/segment timings produced by the aligner.
        output_dir_root: root directory under which the ASS file is written.
        ass_file_config: config providing fontsize, vertical alignment and the
            three highlight colors.
        audio_dur: audio duration in seconds; sets the end time of the final
            subtitle event.

    Returns:
        The same ``utt_obj`` with the saved filepath added.
    """
    # Values for the single "Style:" line in [V4+ Styles]; the matching
    # "Format:" line is built from this dict's keys, so insertion order matters.
    default_style_dict = {
        "Name": "Default",
        "Fontname": "Arial",
        "Fontsize": str(ass_file_config.fontsize),
        "PrimaryColour": "&Hffffff",
        "SecondaryColour": "&Hffffff",
        "OutlineColour": "&H0",
        "BackColour": "&H0",
        "Bold": "0",
        "Italic": "0",
        "Underline": "0",
        "StrikeOut": "0",
        "ScaleX": "100",
        "ScaleY": "100",
        "Spacing": "0",
        "Angle": "0",
        "BorderStyle": "1",
        "Outline": "1",
        "Shadow": "0",
        "Alignment": None,  # will specify below
        "MarginL": str(MARGINL),
        "MarginR": str(MARGINR),
        "MarginV": str(MARGINV),
        "Encoding": "0",
    }

    if ass_file_config.vertical_alignment == "top":
        default_style_dict["Alignment"] = "8"  # text will be 'center-justified' and in the top of the screen
    elif ass_file_config.vertical_alignment == "center":
        default_style_dict["Alignment"] = "5"  # text will be 'center-justified' and in the middle of the screen
    elif ass_file_config.vertical_alignment == "bottom":
        default_style_dict["Alignment"] = "2"  # text will be 'center-justified' and in the bottom of the screen
    else:
        raise ValueError(f"got an unexpected value for ass_file_config.vertical_alignment")

    output_dir = os.path.join(output_dir_root, "ass", "words")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")

    # Inline ASS override tags that switch the text color; the {\r} written
    # after each colored span resets back to the default style.
    already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
    being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
    not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"

    with open(output_file, 'w') as f:
        default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
        default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())

        # NOTE(review): unlike make_token_level_ass_file, this header has no
        # "ScaledBorderAndShadow: yes" line — confirm whether that is intentional.
        f.write(
            (
                "[Script Info]\n"
                "ScriptType: v4.00+\n"
                f"PlayResX: {PLAYERRESX}\n"
                f"PlayResY: {PLAYERRESY}\n"
                "\n"
                "[V4+ Styles]\n"
                f"{default_style_top_line}\n"
                f"{default_style_bottom_line}\n"
                "\n"
                "[Events]\n"
                "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n"
            )
        )

        # write first set of subtitles for text before speech starts to be spoken
        words_in_first_segment = []
        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                first_segment = segment_or_token

                for word_or_token in first_segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_first_segment.append(word_or_token)
                break  # only the first segment is needed here

        text_before_speech = not_yet_spoken_color_code + " ".join([x.text for x in words_in_first_segment]) + r"{\r}"
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(words_in_first_segment[0].t_start)},Default,,0,0,0,,"
            + text_before_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

        # one Dialogue event per word, splitting the segment text into
        # already-spoken / currently-spoken / not-yet-spoken colored spans
        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                segment = segment_or_token

                words_in_segment = []
                for word_or_token in segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_segment.append(word_or_token)

                for word_i, word in enumerate(words_in_segment):

                    text_before = " ".join([x.text for x in words_in_segment[:word_i]])
                    if text_before != "":
                        text_before += " "
                    text_before = already_spoken_color_code + text_before + r"{\r}"

                    if word_i < len(words_in_segment) - 1:
                        text_after = " " + " ".join([x.text for x in words_in_segment[word_i + 1 :]])
                    else:
                        text_after = ""
                    text_after = not_yet_spoken_color_code + text_after + r"{\r}"

                    aligned_text = being_spoken_color_code + word.text + r"{\r}"
                    aligned_text_off = already_spoken_color_code + word.text + r"{\r}"

                    subtitle_text = (
                        f"Dialogue: 0,{seconds_to_ass_format(word.t_start)},{seconds_to_ass_format(word.t_end)},Default,,0,0,0,,"
                        + text_before
                        + aligned_text
                        + text_after.rstrip()
                    )

                    f.write(subtitle_text + '\n')

                    # add subtitles without word-highlighting for when words are not being spoken
                    if word_i < len(words_in_segment) - 1:
                        last_word_end = float(words_in_segment[word_i].t_end)
                        next_word_start = float(words_in_segment[word_i + 1].t_start)
                        # only emit a gap event if the gap is longer than 1 ms
                        if next_word_start - last_word_end > 0.001:
                            subtitle_text = (
                                f"Dialogue: 0,{seconds_to_ass_format(last_word_end)},{seconds_to_ass_format(next_word_start)},Default,,0,0,0,,"
                                + text_before
                                + aligned_text_off
                                + text_after.rstrip()
                            )
                            f.write(subtitle_text + '\n')

        # write final set of subtitles for text after speech has been spoken
        words_in_final_segment = []
        for segment_or_token in utt_obj.segments_and_tokens[::-1]:
            if type(segment_or_token) is Segment:
                final_segment = segment_or_token

                for word_or_token in final_segment.words_and_tokens:
                    if type(word_or_token) is Word:
                        words_in_final_segment.append(word_or_token)
                break  # reversed iteration, so the first Segment hit is the final one

        text_after_speech = already_spoken_color_code + " ".join([x.text for x in words_in_final_segment]) + r"{\r}"
        # note: for now doing some extra padding with math.ceil(audio_dur)+1) to account for the fact that the video with subtitles can become
        # longer than the original audio during the MP4 creation stage.
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(words_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
            + text_after_speech.rstrip()
        )
        f.write(subtitle_text + '\n')

    utt_obj.saved_output_files[f"words_level_ass_filepath"] = output_file

    return utt_obj
def make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):
    """Write an ASS subtitle file that highlights the text token-by-token.

    The file is saved to ``<output_dir_root>/ass/tokens/<utt_id>.ass`` and its
    path is recorded in ``utt_obj.saved_output_files``. Blank tokens are
    skipped; subword markers ('▁') and space tokens are rendered as spaces.

    Args:
        utt_obj: utterance object whose ``segments_and_tokens`` hold the
            token/segment timings produced by the aligner.
        output_dir_root: root directory under which the ASS file is written.
        ass_file_config: config providing fontsize, vertical alignment and the
            three highlight colors.
        audio_dur: audio duration in seconds; sets the end time of the final
            subtitle event.

    Returns:
        The same ``utt_obj`` with the saved filepath added.
    """
    # Values for the single "Style:" line in [V4+ Styles]; the matching
    # "Format:" line is built from this dict's keys, so insertion order matters.
    default_style_dict = {
        "Name": "Default",
        "Fontname": "Arial",
        "Fontsize": str(ass_file_config.fontsize),
        "PrimaryColour": "&Hffffff",
        "SecondaryColour": "&Hffffff",
        "OutlineColour": "&H0",
        "BackColour": "&H0",
        "Bold": "0",
        "Italic": "0",
        "Underline": "0",
        "StrikeOut": "0",
        "ScaleX": "100",
        "ScaleY": "100",
        "Spacing": "0",
        "Angle": "0",
        "BorderStyle": "1",
        "Outline": "1",
        "Shadow": "0",
        "Alignment": None,  # will specify below
        "MarginL": str(MARGINL),
        "MarginR": str(MARGINR),
        "MarginV": str(MARGINV),
        "Encoding": "0",
    }

    if ass_file_config.vertical_alignment == "top":
        default_style_dict["Alignment"] = "8"  # text will be 'center-justified' and in the top of the screen
    elif ass_file_config.vertical_alignment == "center":
        default_style_dict["Alignment"] = "5"  # text will be 'center-justified' and in the middle of the screen
    elif ass_file_config.vertical_alignment == "bottom":
        default_style_dict["Alignment"] = "2"  # text will be 'center-justified' and in the bottom of the screen
    else:
        raise ValueError(f"got an unexpected value for ass_file_config.vertical_alignment")

    output_dir = os.path.join(output_dir_root, "ass", "tokens")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")

    # Inline ASS override tags that switch the text color; the {\r} written
    # after each colored span resets back to the default style.
    already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
    being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
    not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"

    with open(output_file, 'w') as f:
        default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
        default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())

        f.write(
            (
                "[Script Info]\n"
                "ScriptType: v4.00+\n"
                f"PlayResX: {PLAYERRESX}\n"
                f"PlayResY: {PLAYERRESY}\n"
                "ScaledBorderAndShadow: yes\n"
                "\n"
                "[V4+ Styles]\n"
                f"{default_style_top_line}\n"
                f"{default_style_bottom_line}\n"
                "\n"
                "[Events]\n"
                "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n"
            )
        )

        # write first set of subtitles for text before speech starts to be spoken
        tokens_in_first_segment = []
        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                # collect non-blank tokens, whether listed directly in the
                # segment or nested inside its Words
                for word_or_token in segment_or_token.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_first_segment.append(word_or_token)
                    else:
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_first_segment.append(token)

                break  # only the first segment is needed here

        for token in tokens_in_first_segment:
            token.text_cased = token.text_cased.replace(
                "▁", " "
            )  # replace underscores used in subword tokens with spaces
            token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space

        text_before_speech = (
            not_yet_spoken_color_code + "".join([x.text_cased for x in tokens_in_first_segment]) + r"{\r}"
        )
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(0)},{seconds_to_ass_format(tokens_in_first_segment[0].t_start)},Default,,0,0,0,,"
            + text_before_speech.rstrip()
        )

        f.write(subtitle_text + '\n')

        # one Dialogue event per token, splitting the segment text into
        # already-spoken / currently-spoken / not-yet-spoken colored spans
        for segment_or_token in utt_obj.segments_and_tokens:
            if type(segment_or_token) is Segment:
                segment = segment_or_token

                tokens_in_segment = []  # make list of (non-blank) tokens
                for word_or_token in segment.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_segment.append(word_or_token)
                    else:
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_segment.append(token)

                for token in tokens_in_segment:
                    token.text_cased = token.text_cased.replace(
                        "▁", " "
                    )  # replace underscores used in subword tokens with spaces
                    token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space

                for token_i, token in enumerate(tokens_in_segment):

                    text_before = "".join([x.text_cased for x in tokens_in_segment[:token_i]])
                    text_before = already_spoken_color_code + text_before + r"{\r}"

                    if token_i < len(tokens_in_segment) - 1:
                        text_after = "".join([x.text_cased for x in tokens_in_segment[token_i + 1 :]])
                    else:
                        text_after = ""
                    text_after = not_yet_spoken_color_code + text_after + r"{\r}"

                    aligned_text = being_spoken_color_code + token.text_cased + r"{\r}"
                    aligned_text_off = already_spoken_color_code + token.text_cased + r"{\r}"

                    subtitle_text = (
                        f"Dialogue: 0,{seconds_to_ass_format(token.t_start)},{seconds_to_ass_format(token.t_end)},Default,,0,0,0,,"
                        + text_before
                        + aligned_text
                        + text_after.rstrip()
                    )

                    f.write(subtitle_text + '\n')

                    # add subtitles without word-highlighting for when words are not being spoken
                    if token_i < len(tokens_in_segment) - 1:
                        last_token_end = float(tokens_in_segment[token_i].t_end)
                        next_token_start = float(tokens_in_segment[token_i + 1].t_start)
                        # only emit a gap event if the gap is longer than 1 ms
                        if next_token_start - last_token_end > 0.001:
                            subtitle_text = (
                                f"Dialogue: 0,{seconds_to_ass_format(last_token_end)},{seconds_to_ass_format(next_token_start)},Default,,0,0,0,,"
                                + text_before
                                + aligned_text_off
                                + text_after.rstrip()
                            )
                            f.write(subtitle_text + '\n')

        # Write final set of subtitles for text after speech has been spoken.
        # To do this, we need to collect 'tokens_in_final_segment' so that we know what the final line is.
        tokens_in_final_segment = []
        for segment_or_token in utt_obj.segments_and_tokens[::-1]:
            # Collect tokens from final segment - will 'break' so we only look at the final one.
            if type(segment_or_token) is Segment:
                # 'segment_or_token' is known to be Segment, which has attribute 'words_and_tokens'
                for word_or_token in segment_or_token.words_and_tokens:
                    if type(word_or_token) is Token:
                        if word_or_token.text != BLANK_TOKEN:
                            tokens_in_final_segment.append(word_or_token)
                    else:
                        # 'word_or_token' is known to be a Word, which has attribute 'tokens'
                        for token in word_or_token.tokens:
                            if token.text != BLANK_TOKEN:
                                tokens_in_final_segment.append(token)
                break

        for token in tokens_in_final_segment:
            token.text_cased = token.text_cased.replace(
                "▁", " "
            )  # replace underscores used in subword tokens with spaces
            token.text_cased = token.text_cased.replace(SPACE_TOKEN, " ")  # space token with actual space

        text_after_speech = (
            already_spoken_color_code + "".join([x.text_cased for x in tokens_in_final_segment]) + r"{\r}"
        )
        # note: for now doing some extra padding with math.ceil(audio_dur)+1) to account for the fact that the video with subtitles can become
        # longer than the original audio during the MP4 creation stage.
        subtitle_text = (
            f"Dialogue: 0,{seconds_to_ass_format(tokens_in_final_segment[-1].t_end)},{seconds_to_ass_format(math.ceil(audio_dur)+1)},Default,,0,0,0,,"
            + text_after_speech.rstrip()
        )
        f.write(subtitle_text + '\n')

    utt_obj.saved_output_files[f"tokens_level_ass_filepath"] = output_file

    return utt_obj