|
import sys |
|
|
|
sys.path.append('..') |
|
|
|
import options |
|
import os.path |
|
import pronouncing |
|
import options as opt |
|
|
|
from Loader import GridLoader |
|
from tqdm.auto import tqdm |
|
from dataset import GridDataset |
|
from typing import List |
|
|
|
# Annotation file extensions accepted as alignment files.
VALID_FILE_EXT = ('.txt', '.align')
# Pronunciation tokens that disqualify a sentence outright.
EXCLUDED_PHONEMES = ('foreign', 'french')
# Hard cap on video length (frames) for a pair to remain usable.
MAX_VID_LEN = 100
# CTC needs at least this many frames per phoneme token.
CTC_SCALE = 2

# Data roots live one level above this script, addressed via the shared
# options module: alignments in, phoneme cache out, extracted frames in.
base = os.path.abspath(os.pardir)
anno_dir = os.path.join(base, options.alignments_dir)
phonemes_dir = os.path.join(base, options.phonemes_dir)
images_dir = os.path.join(base, options.images_dir)

# Sorted so every run walks speakers in the same order.
speaker_dirnames = sorted(os.listdir(anno_dir))
|
|
|
# Accumulators: every discovered (speaker, filename) pair, and the subset
# that survives the validity filtering pass below.
valid_sentence_pairs = []
sentence_pairs = []

# First pass: enumerate one (speaker_dirname, filename) pair per alignment
# file found under the annotations root.
for speaker_dirname in tqdm(speaker_dirnames):
    speaker_dir = os.path.join(anno_dir, speaker_dirname)

    for filename in os.listdir(speaker_dir):
        _, ext = os.path.splitext(filename)
        # Skip anything that is not a .txt / .align annotation file.
        if ext not in VALID_FILE_EXT:
            continue

        # NOTE: the original also built the full align path here, but never
        # used it — it is recomputed in the main loop; dropped as dead code.
        sentence_pairs.append((speaker_dirname, filename))

# Deterministic processing order across runs.
sentence_pairs = sorted(sentence_pairs)
|
# Progress bar doubles as a status line (its .desc is updated on failures).
pbar = tqdm(sentence_pairs)

# Pairs rejected because a word had no (or an excluded) pronunciation.
pairs_without_phonemes = 0
# Maxima observed over the pairs that pass all validity checks.
max_valid_vid_len = 0
max_valid_phonemes_len = 0

# Vocabulary statistics gathered during the main pass. The "valid_" sets
# only include items from pairs that pass the video-length checks.
unique_phonemes = set()
valid_unique_phonemes = set()
unique_text_chars = set()
unique_words = set()
valid_unique_words = set()
# NOTE: the original initialised max_length = 0 here; it was never read or
# updated anywhere in the file, so it has been removed as dead state.
|
|
|
# Main pass: for every pair, look up phonemes for each word, reject pairs
# with unknown/excluded pronunciations, apply video-length constraints, and
# cache the phoneme transcription to disk.
for sentence_pair in pbar:
    speaker_dirname, filename = sentence_pair
    basename, _ = os.path.splitext(filename)
    align_file = os.path.join(anno_dir, speaker_dirname, filename)

    # Video length = number of extracted .jpg frames for this utterance.
    vid_images_dir = os.path.join(images_dir, speaker_dirname, basename)
    image_filenames = [
        image_filename for image_filename in os.listdir(vid_images_dir)
        if image_filename.endswith('.jpg')
    ]
    vid_len = len(image_filenames)

    # Ensure the per-speaker phoneme cache directory exists.
    phonemes_speaker_dir = os.path.join(phonemes_dir, speaker_dirname)
    if not os.path.exists(phonemes_speaker_dir):
        os.mkdir(phonemes_speaker_dir)

    phonemes_file = os.path.join(phonemes_dir, speaker_dirname, filename)
    sentence: List[str] = GridDataset.load_sentence(
        align_file, char_map=opt.text_char_map
    )

    sentence_str = ''.join(sentence)
    sentence_words = sentence_str.split(' ')
    sentence_phonemes = []        # one phoneme list per word
    flat_sentence_phonemes = []   # all phonemes, ' ' between words
    has_valid_phonemes = True

    unique_text_chars.update(sentence_str)

    for word in sentence_words:
        # Single dictionary lookup (the original called phones_for_word
        # twice per word — once for the check, once for the value).
        word_pronunciations = pronouncing.phones_for_word(word)
        if not word_pronunciations:
            # Word missing from the pronunciation dict: drop the pair.
            pbar.desc = f'NO-PHONEMES: {word} [{pairs_without_phonemes}]'
            has_valid_phonemes = False
            pairs_without_phonemes += 1
            break

        # Take only the first listed pronunciation for each word.
        phonemes = word_pronunciations[0].split(' ')
        assert len(phonemes) > 0

        for phoneme in phonemes:
            if phoneme in EXCLUDED_PHONEMES:
                has_valid_phonemes = False
                pairs_without_phonemes += 1
                break

            unique_phonemes.add(phoneme)

        if not has_valid_phonemes:
            break

        sentence_phonemes.append(phonemes)
        flat_sentence_phonemes.extend(phonemes)
        flat_sentence_phonemes.append(' ')
        unique_words.add(word)

    if not has_valid_phonemes:
        continue

    # Strip the trailing word separator (guarded against an empty list,
    # which the original would crash on with an IndexError).
    if flat_sentence_phonemes and flat_sentence_phonemes[-1] == ' ':
        flat_sentence_phonemes = flat_sentence_phonemes[:-1]

    # A pair is usable when its video has frames, is under the length cap,
    # and is long enough to give CTC at least CTC_SCALE frames per token.
    is_valid_video = (
        (vid_len > 0) and
        (vid_len < MAX_VID_LEN) and
        (vid_len > CTC_SCALE * len(flat_sentence_phonemes)) and
        has_valid_phonemes
    )

    if is_valid_video:
        valid_sentence_pairs.append(sentence_pair)
        num_flat_phonemes = len(flat_sentence_phonemes)

        if vid_len > max_valid_vid_len:
            max_valid_vid_len = vid_len
        if num_flat_phonemes > max_valid_phonemes_len:
            max_valid_phonemes_len = num_flat_phonemes

        valid_unique_words.update(sentence_words)

        for phonemes in sentence_phonemes:
            valid_unique_phonemes.update(phonemes)

    # Cache this sentence's phonemes (one word per line) if not yet cached.
    raw_phonemes = '\n'.join([
        ' '.join(phonemes) for phonemes in sentence_phonemes
    ])

    if not os.path.exists(phonemes_file):
        # Context manager so the handle is flushed and closed deterministically
        # (the original used a bare open(...).write(...)).
        with open(phonemes_file, 'w') as phonemes_f:
            phonemes_f.write(raw_phonemes)
|
|
|
|
|
|
|
# Serialize the surviving pairs as 'speaker/basename' lines for the loader.
valid_pair_dirs = []
for sentence_pair in valid_sentence_pairs:
    speaker_dirname, filename = sentence_pair
    basename, _ = os.path.splitext(filename)
    pair_str = f'{speaker_dirname}/{basename}'
    valid_pair_dirs.append(pair_str)

valid_pairs_path = f'../data/{opt.dataset}-CTC{CTC_SCALE}-valid-pairs.txt'
# Context manager guarantees the file is flushed and closed (the original
# used a bare open(...).write(...) that leaked the handle).
with open(valid_pairs_path, 'w') as valid_pairs_file:
    valid_pairs_file.write('\n'.join(valid_pair_dirs))
|
|
|
# Summary statistics for the filtering pass. Note: sorted() accepts any
# iterable, so the original's redundant list(...) wrappers are dropped —
# output is byte-identical.
print('VALID PAIRS', len(valid_pair_dirs))
print('VALID UNIQUE WORDS', valid_unique_words)
print('PAIRS W/O PHONEMES', pairs_without_phonemes)
print('UNIQUE PHONEMES', sorted(unique_phonemes))
print('VALID UNIQUE PHONEMES', sorted(valid_unique_phonemes))
print('UNIQUE CHARS', sorted(unique_text_chars))
print('MAX VALID PHONEMES LEN', max_valid_phonemes_len)
print('MAX VALID VID LEN', max_valid_vid_len)
print('>>>')
|
|