|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
This file contains functions for making ASS-format subtitle files based on the generated alignment.
|
ASS files can be generated highlighting token-level alignments or word-level alignments. |
|
In both cases, 'segment' boundaries will be used to determine which parts of the text will appear |
|
at the same time. |
|
For the token-level ASS files, the text will be highlighted token-by-token, with the timings determined |
|
by the NFA alignments. |
|
For the word-level ASS files, the text will be highlighted word-by-word, with the timings determined |
|
by the NFA alignments.
|
""" |
|
|
|
import math |
|
import os |
|
import soundfile as sf |
|
|
|
from utils.constants import BLANK_TOKEN, SPACE_TOKEN |
|
from utils.data_prep import Segment, Token, Word |
|
|
|
# Size of the virtual canvas written to the ASS header as "PlayResX"/"PlayResY";
# also used when estimating how much text fits in one segment.
PLAYERRESX = 384
PLAYERRESY = 288
# Left, right and vertical margins written into the "Default" ASS style and
# subtracted from the canvas size in the text-fitting estimate.
MARGINL = 10
MARGINR = 10
MARGINV = 20
|
|
|
|
|
def seconds_to_ass_format(seconds_float):
    """Format a time in seconds as the ``HH:MM:SS.cc`` timestamp written to ASS files.

    Args:
        seconds_float: time in seconds; any value accepted by ``float()``.

    Returns:
        A string with hours and minutes zero-padded to at least two digits and
        seconds rendered with exactly two decimal places, e.g. ``"00:01:15.50"``.
    """
    seconds_float = float(seconds_float)
    whole_minutes, seconds_part = divmod(seconds_float, 60)

    seconds_text = f"{seconds_part:05.2f}"
    if seconds_text == "60.00":
        # Rounding to two decimals pushed the seconds field up to a full minute
        # (e.g. 59.999 s). Carry it so an invalid "...:60.00" is never emitted.
        whole_minutes += 1
        seconds_text = "00.00"

    hh, mm = divmod(round(whole_minutes), 60)
    return f"{hh:02d}:{mm:02d}:{seconds_text}"
|
|
|
|
|
def rgb_list_to_hex_bgr(rgb_list):
    """Convert an ``[r, g, b]`` triple of ints into a lowercase ``bbggrr`` hex string.

    The result is what this module places after ``\\c&H`` in colour override tags.
    Each component is zero-padded to two hex digits so that small values (below
    16) keep their position in the six-digit code.

    Args:
        rgb_list: iterable of exactly three integers (red, green, blue).

    Returns:
        Six-character hex string in blue-green-red order for components in 0-255.
    """
    r, g, b = rgb_list
    return f"{b:02x}{g:02x}{r:02x}"
|
|
|
|
|
def make_ass_files(
    utt_obj, output_dir_root, ass_file_config,
):
    """Write the word-level and token-level ASS files for one utterance.

    Nothing is written when ``utt_obj.segments_and_tokens`` is empty. Otherwise the
    text is optionally re-segmented (``ass_file_config.resegment_text_to_fill_space``),
    the audio duration is read from ``utt_obj.audio_filepath`` and both ASS writers
    are run in turn.

    Returns:
        ``utt_obj`` as returned by the last step that ran.
    """
    if utt_obj.segments_and_tokens:
        if ass_file_config.resegment_text_to_fill_space:
            utt_obj = resegment_utt_obj(utt_obj, ass_file_config)

        # The duration is needed so the closing subtitle can run to the end of the audio.
        with sf.SoundFile(utt_obj.audio_filepath) as audio_file:
            audio_dur = audio_file.frames / audio_file.samplerate

        for write_ass in (make_word_level_ass_file, make_token_level_ass_file):
            utt_obj = write_ass(utt_obj, output_dir_root, ass_file_config, audio_dur)

    return utt_obj
|
|
|
|
|
def _get_word_n_chars(word):
    """Return the total length of the text of ``word``'s tokens, skipping blank tokens."""
    return sum(len(tok.text) for tok in word.tokens if tok.text != BLANK_TOKEN)
|
|
|
|
|
def _get_segment_n_chars(segment):
    """Return the character count of ``segment.words_and_tokens``.

    A space token counts as one character, a blank token counts as zero and
    every other entry counts as the length of its ``text``.
    """

    def entry_width(text):
        if text == SPACE_TOKEN:
            return 1
        return 0 if text == BLANK_TOKEN else len(text)

    return sum(entry_width(entry.text) for entry in segment.words_and_tokens)
|
|
|
|
|
def resegment_utt_obj(utt_obj, ass_file_config):
    """Regroup the utterance's words and tokens into segments sized to fill the screen.

    The existing segments are flattened into one list of words and tokens. A
    per-segment character budget is estimated from the canvas size, margins and
    ``ass_file_config.fontsize`` (0.6 * fontsize per character, 1.15 * fontsize per
    line, capped at ``ass_file_config.max_lines_per_segment`` lines). Words are then
    packed into new ``Segment`` objects, starting a new one whenever adding the next
    word would reach the budget.

    Returns:
        ``utt_obj`` with ``segments_and_tokens`` replaced by the new grouping.
    """
    # Flatten: segments contribute their contents, anything else is kept as-is.
    flat_items = []
    for entry in utt_obj.segments_and_tokens:
        if type(entry) is Segment:
            flat_items += entry.words_and_tokens
        else:
            flat_items.append(entry)

    fontsize = ass_file_config.fontsize
    chars_per_line = (PLAYERRESX - MARGINL - MARGINR) / (fontsize * 0.6)
    lines_per_segment = (PLAYERRESY - MARGINV) / (fontsize * 1.15)
    line_cap = ass_file_config.max_lines_per_segment
    lines_per_segment = line_cap if lines_per_segment > line_cap else lines_per_segment
    max_chars_per_segment = int(chars_per_line * lines_per_segment)

    # Tokens that precede the first non-token entry stay outside of any segment.
    n_leading = 0
    while n_leading < len(flat_items) and type(flat_items[n_leading]) is Token:
        n_leading += 1
    regrouped = flat_items[:n_leading]

    current = Segment()
    regrouped.append(current)

    for item in flat_items[n_leading:]:
        # Only a word can open a new segment, and never when the current one is empty.
        if type(item) is Word and current.words_and_tokens:
            projected = _get_word_n_chars(item) + _get_segment_n_chars(current)
            if projected >= max_chars_per_segment:
                current = Segment()
                regrouped.append(current)
        current.words_and_tokens.append(item)

    utt_obj.segments_and_tokens = regrouped
    return utt_obj
|
|
|
|
|
def make_word_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):
    """Write an ASS subtitle file that highlights the transcript one word at a time.

    The file is saved as ``<output_dir_root>/ass/words/<utt_id>.ass`` (UTF-8) and its
    path is stored in ``utt_obj.saved_output_files["words_level_ass_filepath"]``.

    For each ``Segment`` in ``utt_obj.segments_and_tokens`` the segment's ``Word``
    entries are shown together; one Dialogue line is written per word for the
    interval ``[t_start, t_end]``, with earlier words in the "already spoken"
    colour, the current word in the "being spoken" colour and later words in the
    "not yet spoken" colour. Gaps of more than 1 ms between consecutive words get
    an extra line so the text stays on screen. The first segment is also shown
    from time 0 until its first word, and the last segment from its last word
    until ``ceil(audio_dur) + 1`` seconds.

    Args:
        utt_obj: utterance providing ``segments_and_tokens``, ``utt_id`` and
            ``saved_output_files``.
        output_dir_root: directory under which ``ass/words`` is created.
        ass_file_config: provides ``fontsize``, ``vertical_alignment`` and the
            ``text_already_spoken_rgb`` / ``text_being_spoken_rgb`` /
            ``text_not_yet_spoken_rgb`` colour triples.
        audio_dur: duration of the audio in seconds.

    Returns:
        The same ``utt_obj``.

    Raises:
        ValueError: if ``vertical_alignment`` is not "top", "center" or "bottom".
    """
    default_style_dict = {
        "Name": "Default",
        "Fontname": "Arial",
        "Fontsize": str(ass_file_config.fontsize),
        "PrimaryColour": "&Hffffff",
        "SecondaryColour": "&Hffffff",
        "OutlineColour": "&H0",
        "BackColour": "&H0",
        "Bold": "0",
        "Italic": "0",
        "Underline": "0",
        "StrikeOut": "0",
        "ScaleX": "100",
        "ScaleY": "100",
        "Spacing": "0",
        "Angle": "0",
        "BorderStyle": "1",
        "Outline": "1",
        "Shadow": "0",
        "Alignment": None,  # filled in below from ass_file_config.vertical_alignment
        "MarginL": str(MARGINL),
        "MarginR": str(MARGINR),
        "MarginV": str(MARGINV),
        "Encoding": "0",
    }

    if ass_file_config.vertical_alignment == "top":
        default_style_dict["Alignment"] = "8"
    elif ass_file_config.vertical_alignment == "center":
        default_style_dict["Alignment"] = "5"
    elif ass_file_config.vertical_alignment == "bottom":
        default_style_dict["Alignment"] = "2"
    else:
        raise ValueError("got an unexpected value for ass_file_config.vertical_alignment")

    output_dir = os.path.join(output_dir_root, "ass", "words")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")

    already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
    being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
    not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"

    def words_of(segment):
        """Return the Word entries of a segment, in order."""
        return [x for x in segment.words_and_tokens if type(x) is Word]

    def dialogue_line(start, end, text):
        """Build one complete Dialogue event line, newline included."""
        return (
            f"Dialogue: 0,{seconds_to_ass_format(start)},{seconds_to_ass_format(end)},Default,,0,0,0,,"
            + text
            + '\n'
        )

    segments = [x for x in utt_obj.segments_and_tokens if type(x) is Segment]

    # Explicit UTF-8 so non-ASCII transcripts do not depend on the platform's locale encoding.
    with open(output_file, 'w', encoding="utf-8") as f:
        default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
        default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())

        f.write(
            (
                "[Script Info]\n"
                "ScriptType: v4.00+\n"
                f"PlayResX: {PLAYERRESX}\n"
                f"PlayResY: {PLAYERRESY}\n"
                "\n"
                "[V4+ Styles]\n"
                f"{default_style_top_line}\n"
                f"{default_style_bottom_line}\n"
                "\n"
                "[Events]\n"
                "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n"
            )
        )

        # Show the first segment, un-highlighted, from time 0 until its first word starts.
        words_in_first_segment = words_of(segments[0]) if segments else []
        if words_in_first_segment:  # nothing to show if the first segment has no words
            text_before_speech = (
                not_yet_spoken_color_code + " ".join([x.text for x in words_in_first_segment]) + r"{\r}"
            )
            f.write(dialogue_line(0, words_in_first_segment[0].t_start, text_before_speech.rstrip()))

        for segment in segments:
            words_in_segment = words_of(segment)

            for word_i, word in enumerate(words_in_segment):
                text_before = " ".join([x.text for x in words_in_segment[:word_i]])
                if text_before != "":
                    text_before += " "
                text_before = already_spoken_color_code + text_before + r"{\r}"

                if word_i < len(words_in_segment) - 1:
                    text_after = " " + " ".join([x.text for x in words_in_segment[word_i + 1 :]])
                else:
                    text_after = ""
                text_after = not_yet_spoken_color_code + text_after + r"{\r}"

                aligned_text = being_spoken_color_code + word.text + r"{\r}"
                aligned_text_off = already_spoken_color_code + word.text + r"{\r}"

                f.write(dialogue_line(word.t_start, word.t_end, text_before + aligned_text + text_after.rstrip()))

                # Bridge any silence before the next word so the text does not flicker off.
                if word_i < len(words_in_segment) - 1:
                    last_word_end = float(words_in_segment[word_i].t_end)
                    next_word_start = float(words_in_segment[word_i + 1].t_start)
                    if next_word_start - last_word_end > 0.001:
                        f.write(
                            dialogue_line(
                                last_word_end,
                                next_word_start,
                                text_before + aligned_text_off + text_after.rstrip(),
                            )
                        )

        # Keep the last segment on screen, fully "spoken", until after the audio ends.
        words_in_final_segment = words_of(segments[-1]) if segments else []
        if words_in_final_segment:
            text_after_speech = (
                already_spoken_color_code + " ".join([x.text for x in words_in_final_segment]) + r"{\r}"
            )
            f.write(
                dialogue_line(
                    words_in_final_segment[-1].t_end, math.ceil(audio_dur) + 1, text_after_speech.rstrip()
                )
            )

    utt_obj.saved_output_files["words_level_ass_filepath"] = output_file

    return utt_obj
|
|
|
|
|
def make_token_level_ass_file(utt_obj, output_dir_root, ass_file_config, audio_dur):
    """Write an ASS subtitle file that highlights the transcript one token at a time.

    The file is saved as ``<output_dir_root>/ass/tokens/<utt_id>.ass`` (UTF-8) and its
    path is stored in ``utt_obj.saved_output_files["tokens_level_ass_filepath"]``.

    For each ``Segment`` in ``utt_obj.segments_and_tokens`` the non-blank tokens
    (those directly in the segment and those inside its words) are shown together
    using their ``text_cased``; one Dialogue line is written per token for the
    interval ``[t_start, t_end]`` with already-spoken / being-spoken /
    not-yet-spoken colouring. Gaps of more than 1 ms between consecutive tokens
    get an extra line. The first segment is also shown from time 0 until its
    first token, and the last segment from its last token until
    ``ceil(audio_dur) + 1`` seconds.

    Side effect: ``text_cased`` of every rendered token is rewritten in place with
    "▁" and ``SPACE_TOKEN`` replaced by a plain space.

    Args:
        utt_obj: utterance providing ``segments_and_tokens``, ``utt_id`` and
            ``saved_output_files``.
        output_dir_root: directory under which ``ass/tokens`` is created.
        ass_file_config: provides ``fontsize``, ``vertical_alignment`` and the
            ``text_already_spoken_rgb`` / ``text_being_spoken_rgb`` /
            ``text_not_yet_spoken_rgb`` colour triples.
        audio_dur: duration of the audio in seconds.

    Returns:
        The same ``utt_obj``.

    Raises:
        ValueError: if ``vertical_alignment`` is not "top", "center" or "bottom".
    """
    default_style_dict = {
        "Name": "Default",
        "Fontname": "Arial",
        "Fontsize": str(ass_file_config.fontsize),
        "PrimaryColour": "&Hffffff",
        "SecondaryColour": "&Hffffff",
        "OutlineColour": "&H0",
        "BackColour": "&H0",
        "Bold": "0",
        "Italic": "0",
        "Underline": "0",
        "StrikeOut": "0",
        "ScaleX": "100",
        "ScaleY": "100",
        "Spacing": "0",
        "Angle": "0",
        "BorderStyle": "1",
        "Outline": "1",
        "Shadow": "0",
        "Alignment": None,  # filled in below from ass_file_config.vertical_alignment
        "MarginL": str(MARGINL),
        "MarginR": str(MARGINR),
        "MarginV": str(MARGINV),
        "Encoding": "0",
    }

    if ass_file_config.vertical_alignment == "top":
        default_style_dict["Alignment"] = "8"
    elif ass_file_config.vertical_alignment == "center":
        default_style_dict["Alignment"] = "5"
    elif ass_file_config.vertical_alignment == "bottom":
        default_style_dict["Alignment"] = "2"
    else:
        raise ValueError("got an unexpected value for ass_file_config.vertical_alignment")

    output_dir = os.path.join(output_dir_root, "ass", "tokens")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{utt_obj.utt_id}.ass")

    already_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_already_spoken_rgb) + r"&}"
    being_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_being_spoken_rgb) + r"&}"
    not_yet_spoken_color_code = r"{\c&H" + rgb_list_to_hex_bgr(ass_file_config.text_not_yet_spoken_rgb) + r"&}"

    def visible_tokens(segment):
        """Collect the segment's non-blank tokens and normalise their display text in place."""
        collected = []
        for word_or_token in segment.words_and_tokens:
            if type(word_or_token) is Token:
                candidates = [word_or_token]
            else:
                candidates = word_or_token.tokens
            collected.extend(t for t in candidates if t.text != BLANK_TOKEN)

        for token in collected:
            token.text_cased = token.text_cased.replace("▁", " ").replace(SPACE_TOKEN, " ")
        return collected

    def dialogue_line(start, end, text):
        """Build one complete Dialogue event line, newline included."""
        return (
            f"Dialogue: 0,{seconds_to_ass_format(start)},{seconds_to_ass_format(end)},Default,,0,0,0,,"
            + text
            + '\n'
        )

    segments = [x for x in utt_obj.segments_and_tokens if type(x) is Segment]

    # Explicit UTF-8 so non-ASCII transcripts do not depend on the platform's locale encoding.
    with open(output_file, 'w', encoding="utf-8") as f:
        default_style_top_line = "Format: " + ", ".join(default_style_dict.keys())
        default_style_bottom_line = "Style: " + ",".join(default_style_dict.values())

        f.write(
            (
                "[Script Info]\n"
                "ScriptType: v4.00+\n"
                f"PlayResX: {PLAYERRESX}\n"
                f"PlayResY: {PLAYERRESY}\n"
                "ScaledBorderAndShadow: yes\n"
                "\n"
                "[V4+ Styles]\n"
                f"{default_style_top_line}\n"
                f"{default_style_bottom_line}\n"
                "\n"
                "[Events]\n"
                "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n"
            )
        )

        # Show the first segment, un-highlighted, from time 0 until its first token starts.
        tokens_in_first_segment = visible_tokens(segments[0]) if segments else []
        if tokens_in_first_segment:  # nothing to show if the first segment has no tokens
            text_before_speech = (
                not_yet_spoken_color_code + "".join([x.text_cased for x in tokens_in_first_segment]) + r"{\r}"
            )
            f.write(dialogue_line(0, tokens_in_first_segment[0].t_start, text_before_speech.rstrip()))

        for segment in segments:
            tokens_in_segment = visible_tokens(segment)

            for token_i, token in enumerate(tokens_in_segment):
                text_before = "".join([x.text_cased for x in tokens_in_segment[:token_i]])
                text_before = already_spoken_color_code + text_before + r"{\r}"

                # An empty slice joins to "", so the last token needs no special case.
                text_after = "".join([x.text_cased for x in tokens_in_segment[token_i + 1 :]])
                text_after = not_yet_spoken_color_code + text_after + r"{\r}"

                aligned_text = being_spoken_color_code + token.text_cased + r"{\r}"
                aligned_text_off = already_spoken_color_code + token.text_cased + r"{\r}"

                f.write(
                    dialogue_line(token.t_start, token.t_end, text_before + aligned_text + text_after.rstrip())
                )

                # Bridge any silence before the next token so the text does not flicker off.
                if token_i < len(tokens_in_segment) - 1:
                    last_token_end = float(tokens_in_segment[token_i].t_end)
                    next_token_start = float(tokens_in_segment[token_i + 1].t_start)
                    if next_token_start - last_token_end > 0.001:
                        f.write(
                            dialogue_line(
                                last_token_end,
                                next_token_start,
                                text_before + aligned_text_off + text_after.rstrip(),
                            )
                        )

        # Keep the last segment on screen, fully "spoken", until after the audio ends.
        tokens_in_final_segment = visible_tokens(segments[-1]) if segments else []
        if tokens_in_final_segment:
            text_after_speech = (
                already_spoken_color_code + "".join([x.text_cased for x in tokens_in_final_segment]) + r"{\r}"
            )
            f.write(
                dialogue_line(
                    tokens_in_final_segment[-1].t_end, math.ceil(audio_dur) + 1, text_after_speech.rstrip()
                )
            )

    utt_obj.saved_output_files["tokens_level_ass_filepath"] = output_file

    return utt_obj
|
|