|
import os |
|
import options |
|
import pronouncing |
|
|
|
from tqdm.auto import tqdm |
|
from typing import List |
|
from dataset import GridDataset |
|
|
|
# Resolve all data locations relative to the repository root (one level up).
base = os.path.abspath('..')

# Derived directories for alignments, phoneme annotations, and frame images,
# all rooted at `base` using paths configured in the project's options module.
anno_dir, phonemes_dir, images_dir = (
    os.path.join(base, subdir)
    for subdir in (options.alignments_dir, options.phonemes_dir, options.images_dir)
)

# Split files (one video path per line) that this script will filter/rewrite.
datasets_filenames = ['overlap_train.txt', 'overlap_val.txt']

# Running maxima over all processed samples: video frame count, character
# count of the transcript, and phoneme count. Updated inside the main loop.
max_vid_len = max_text_len = max_phonemes_len = 0
|
|
|
for datasets_filename in datasets_filenames:
    # Input split file and the filtered output file it will be rewritten to.
    datasets_filepath = os.path.join(base, 'data', datasets_filename)
    new_datasets_filepath = os.path.join(
        base, 'data', 'phonemes_' + datasets_filename
    )

    # Use a context manager so the handle is closed even if processing fails
    # (the original leaked the file object).
    with open(datasets_filepath, 'r') as f:
        video_filepaths = f.readlines()

    valid_filepaths = []

    for video_filepath in tqdm(video_filepaths):
        video_filepath = video_filepath.strip()
        basename = os.path.basename(video_filepath)
        # Paths are written with '/' separators in the split files; the first
        # component is the speaker directory (e.g. 's1/video_name').
        speaker_dirname = video_filepath.split('/')[0]

        align_file = os.path.join(
            anno_dir, speaker_dirname, f'{basename}.align'
        )
        vid_images_dir = os.path.join(
            images_dir, speaker_dirname, basename
        )
        new_video_filepath = os.path.join(
            options.video_dir, speaker_dirname, f'{basename}.mpg'
        )

        # Skip samples whose extracted-frames directory is missing, mirroring
        # the existing skip-on-missing-align-file behaviour below (the
        # original crashed the whole run here instead).
        try:
            image_filenames = [
                filename for filename in os.listdir(vid_images_dir)
                if filename.endswith('.jpg')
            ]
        except FileNotFoundError:
            continue

        vid_len = len(image_filenames)

        # Samples without an alignment file are silently dropped.
        try:
            sentence: List[str] = GridDataset.load_sentence(
                align_file, char_map=options.text_char_map
            )
        except FileNotFoundError:
            continue

        text_len = len(sentence)
        sentence_str = ''.join(sentence)
        phonemes_sentence = GridDataset.text_to_phonemes(
            sentence_str, as_str=False
        )

        phonemes_len = len(phonemes_sentence)

        max_vid_len = max(vid_len, max_vid_len)
        max_text_len = max(text_len, max_text_len)
        max_phonemes_len = max(phonemes_len, max_phonemes_len)

        # Sanity check: CTC-style decoding needs the input (frames) to be
        # longer than twice the target length. Raise explicitly instead of
        # using `assert`, which is stripped under `python -O`.
        if not (
            max_vid_len > 2 * max_text_len
            and max_vid_len > 2 * max_phonemes_len
        ):
            raise ValueError(
                f'video length {max_vid_len} not greater than twice the '
                f'text length {max_text_len} / phoneme length '
                f'{max_phonemes_len} for {video_filepath}'
            )

        valid_filepaths.append(new_video_filepath)

    # Close the output handle deterministically (the original leaked it).
    with open(new_datasets_filepath, 'w') as f:
        f.write('\n'.join(valid_filepaths))
    print('new valid filepaths written to:', new_datasets_filepath)


print('MAX_VID_LEN', max_vid_len)
print('MAX_TEXT_LEN', max_text_len)
print('MAX_PHONEMES_LEN', max_phonemes_len)