# File size: 5,377 Bytes
# df07554
import sys
sys.path.append('..')
import options
import os.path
import pronouncing
import options as opt
from Loader import GridLoader
from tqdm.auto import tqdm
from dataset import GridDataset
from typing import List
# Alignment-file extensions accepted as sentence annotations.
VALID_FILE_EXT = ('.txt', '.align')
# Phoneme tags that disqualify a word (non-English pronunciations).
EXCLUDED_PHONEMES = ('foreign', 'french')
# Upper bound (exclusive) on frame count for a video to be CTC-valid.
MAX_VID_LEN = 100
# CTC requires the input sequence to be longer than the target; this is
# the required ratio of video frames to phoneme tokens.
CTC_SCALE = 2
# All dataset directories are resolved relative to the repo root
# (one level above this script); see sys.path.append('..') at the top.
base = os.path.abspath('..')
anno_dir = os.path.join(base, options.alignments_dir)
phonemes_dir = os.path.join(base, options.phonemes_dir)
images_dir = os.path.join(base, options.images_dir)
# One sub-directory per speaker; sorted for deterministic ordering.
speaker_dirnames = sorted(os.listdir(anno_dir))
# (speaker_dirname, filename) pairs that pass all validity checks.
valid_sentence_pairs = []
# All discovered (speaker_dirname, filename) pairs, valid or not.
sentence_pairs = []
# Walk every speaker directory and collect (speaker, filename) pairs for
# each alignment file; sort at the end for a deterministic processing order.
for speaker_dirname in tqdm(speaker_dirnames):
    speaker_dir = os.path.join(anno_dir, speaker_dirname)
    for filename in os.listdir(speaker_dir):
        # Skip anything that is not an alignment/transcript file.
        _, ext = os.path.splitext(filename)
        if ext not in VALID_FILE_EXT:
            continue
        # BUGFIX: dropped the unused `align_file` local that was computed
        # here but never read (the main loop rebuilds the path itself).
        sentence_pairs.append((speaker_dirname, filename))
sentence_pairs = sorted(sentence_pairs)
# Progress bar doubles as the iterator; its .desc is updated with the
# last word that had no phoneme entry.
pbar = tqdm(sentence_pairs)
# Count of pairs rejected for missing or excluded phonemes.
pairs_without_phonemes = 0
# Maxima observed over *valid* pairs only.
max_valid_vid_len = 0
max_valid_phonemes_len = 0
# Vocabulary accumulators: all pairs vs. valid pairs only.
unique_phonemes = set()
valid_unique_phonemes = set()
unique_text_chars = set()
unique_words = set()
valid_unique_words = set()
# NOTE(review): never updated or read below — appears to be dead code.
max_length = 0
# Main pass: for each (speaker, sentence) pair, load the text, look up
# phonemes for every word, write a phoneme file, and record the pair as
# valid when the video is long enough for CTC training.
for sentence_pair in pbar:
    speaker_dirname, filename = sentence_pair
    basename, _ = os.path.splitext(filename)
    align_file = os.path.join(anno_dir, speaker_dirname, filename)
    # BUGFIX: removed the unused `pair_str` local (assigned, never read).

    # Video length = number of extracted .jpg frames for this sentence.
    vid_images_dir = os.path.join(images_dir, speaker_dirname, basename)
    image_filenames = [
        name for name in os.listdir(vid_images_dir)
        if name.endswith('.jpg')
    ]
    vid_len = len(image_filenames)

    # Ensure the per-speaker phoneme output directory exists.
    phonemes_speaker_dir = os.path.join(phonemes_dir, speaker_dirname)
    if not os.path.exists(phonemes_speaker_dir):
        os.mkdir(phonemes_speaker_dir)
    phonemes_file = os.path.join(phonemes_dir, speaker_dirname, filename)

    sentence: List[str] = GridDataset.load_sentence(
        align_file, char_map=opt.text_char_map
    )
    sentence_str = ''.join(sentence)
    sentence_words = sentence_str.split(' ')

    sentence_phonemes = []       # one phoneme list per word
    flat_sentence_phonemes = []  # phonemes with ' ' word separators
    has_valid_phonemes = True

    unique_text_chars.update(sentence_str)

    for word in sentence_words:
        # BUGFIX: call phones_for_word once and reuse the result
        # (the original called it twice per word).
        phoneme_candidates = pronouncing.phones_for_word(word)
        if len(phoneme_candidates) == 0:
            pbar.desc = f'NO-PHONEMES: {word} [{pairs_without_phonemes}]'
            has_valid_phonemes = False
            pairs_without_phonemes += 1
            break
        # Take the first pronunciation variant.
        phonemes = phoneme_candidates[0].split(' ')
        assert len(phonemes) > 0
        for phoneme in phonemes:
            if phoneme in EXCLUDED_PHONEMES:
                has_valid_phonemes = False
                pairs_without_phonemes += 1
                break
            unique_phonemes.add(phoneme)
        if not has_valid_phonemes:
            break
        # BUGFIX: removed the dead `length` local (reset each word,
        # accumulated, never read).
        sentence_phonemes.append(phonemes)
        flat_sentence_phonemes.extend(phonemes)
        flat_sentence_phonemes.append(' ')
        unique_words.add(word)

    if not has_valid_phonemes:
        continue

    # Drop the trailing word-separator space.
    # BUGFIX: guard against an empty list before indexing [-1].
    if flat_sentence_phonemes and flat_sentence_phonemes[-1] == ' ':
        flat_sentence_phonemes = flat_sentence_phonemes[:-1]

    num_flat_phonemes = len(flat_sentence_phonemes)
    # CTC validity: some frames exist, video is short enough, and the
    # frame count exceeds CTC_SCALE x the phoneme-token count.
    is_valid_video = (
        (vid_len > 0) and
        (vid_len < MAX_VID_LEN) and
        # (vid_len > 2 * len(sentence_str)) and
        (vid_len > CTC_SCALE * num_flat_phonemes) and
        has_valid_phonemes
    )
    if is_valid_video:
        valid_sentence_pairs.append(sentence_pair)
        if vid_len > max_valid_vid_len:
            max_valid_vid_len = vid_len
        if num_flat_phonemes > max_valid_phonemes_len:
            max_valid_phonemes_len = num_flat_phonemes
        valid_unique_words.update(sentence_words)
        for phonemes in sentence_phonemes:
            valid_unique_phonemes.update(phonemes)

    # One line of space-separated phonemes per word.
    raw_phonemes = '\n'.join(
        ' '.join(phonemes) for phonemes in sentence_phonemes
    )
    if not os.path.exists(phonemes_file):
        # BUGFIX: use a context manager so the handle is closed
        # (was a bare open().write()).
        with open(phonemes_file, 'w') as f:
            f.write(raw_phonemes)
# Persist the 'speaker/sentence' ids of all valid pairs, one per line.
valid_pair_dirs = []
for sentence_pair in valid_sentence_pairs:
    speaker_dirname, filename = sentence_pair
    basename, _ = os.path.splitext(filename)
    valid_pair_dirs.append(f'{speaker_dirname}/{basename}')
# BUGFIX: use a context manager so the file is flushed and closed
# (was a bare open().write()).
with open(f'../data/{opt.dataset}-CTC{CTC_SCALE}-valid-pairs.txt', 'w') as f:
    f.write('\n'.join(valid_pair_dirs))
# Summary of the scan: counts, vocabularies, and maxima over valid pairs.
print('VALID PAIRS', len(valid_pair_dirs))
print('VALID UNIQUE WORDS', valid_unique_words)
print('PAIRS W/O PHONEMES', pairs_without_phonemes)
print('UNIQUE PHONEMES', sorted(unique_phonemes))
print('VALID UNIQUE PHONEMES', sorted(valid_unique_phonemes))
print('UNIQUE CHARS', sorted(unique_text_chars))
print('MAX VALID PHONEMES LEN', max_valid_phonemes_len)
print('MAX VALID VID LEN', max_valid_vid_len)
print('>>>')
# print(sentence_pairs[:10])