# torchnet/scripts/phenome_extractor.py
# (source: milselarch, commit df07554 "push to main")
import sys
sys.path.append('..')
import options
import os.path
import pronouncing
import options as opt
from Loader import GridLoader
from tqdm.auto import tqdm
from dataset import GridDataset
from typing import List
# Alignment annotation files must have one of these extensions.
VALID_FILE_EXT = ('.txt', '.align')
# CMUdict lookup results containing these tokens are rejected
# (presumably non-English filler entries -- TODO confirm against dataset).
EXCLUDED_PHONEMES = ('foreign', 'french')
# Videos with this many frames or more are rejected as too long.
MAX_VID_LEN = 100
# CTC needs the input sequence to be longer than the label sequence:
# a pair is only valid when vid_len > CTC_SCALE * phoneme count.
CTC_SCALE = 2
# Dataset directories are resolved relative to the repository root ('..').
base = os.path.abspath('..')
anno_dir = os.path.join(base, options.alignments_dir)
phonemes_dir = os.path.join(base, options.phonemes_dir)
images_dir = os.path.join(base, options.images_dir)
# One sub-directory per speaker; sorted for deterministic ordering.
speaker_dirnames = sorted(os.listdir(anno_dir))
# (speaker_dirname, filename) pairs that pass all validity checks below.
valid_sentence_pairs = []
# every discovered (speaker_dirname, filename) annotation pair
sentence_pairs = []
# Collect one (speaker_dirname, filename) pair per alignment annotation
# file found under each speaker's directory.
for speaker_dirname in tqdm(speaker_dirnames):
    speaker_dir = os.path.join(anno_dir, speaker_dirname)
    for filename in os.listdir(speaker_dir):
        # only .txt / .align files are alignment annotations
        _, ext = os.path.splitext(filename)
        if ext not in VALID_FILE_EXT:
            continue
        # (removed dead `align_file` assignment -- the path is rebuilt
        # in the main loop below where it is actually used)
        sentence_pairs.append((speaker_dirname, filename))

# sort so processing order is deterministic across runs
sentence_pairs = sorted(sentence_pairs)
# progress bar doubles as a status display (desc shows lookup failures)
pbar = tqdm(sentence_pairs)
pairs_without_phonemes = 0  # pairs skipped for missing/excluded phonemes
max_valid_vid_len = 0  # longest frame count among valid pairs
max_valid_phonemes_len = 0  # longest flat phoneme sequence among valid pairs
unique_phonemes = set()  # every phoneme seen across all pairs
valid_unique_phonemes = set()  # phonemes seen in valid pairs only
unique_text_chars = set()  # characters seen in decoded sentences
unique_words = set()  # every word seen across all pairs
valid_unique_words = set()  # words seen in valid pairs only
# NOTE(review): max_length is never updated or read below -- dead variable?
max_length = 0
# For every (speaker, utterance) pair: look up CMUdict phonemes for each
# word, decide whether the pair is usable for CTC training, and write the
# phoneme transcript to disk (once).
for sentence_pair in pbar:
    speaker_dirname, filename = sentence_pair
    basename, _ = os.path.splitext(filename)
    align_file = os.path.join(anno_dir, speaker_dirname, filename)
    vid_images_dir = os.path.join(images_dir, speaker_dirname, basename)

    # video length = number of extracted .jpg frames for this utterance
    # (renamed comprehension variable so it no longer shadows `filename`)
    image_filenames = [
        image_filename for image_filename in os.listdir(vid_images_dir)
        if image_filename.endswith('.jpg')
    ]
    vid_len = len(image_filenames)

    phonemes_speaker_dir = os.path.join(phonemes_dir, speaker_dirname)
    if not os.path.exists(phonemes_speaker_dir):
        os.mkdir(phonemes_speaker_dir)

    # output transcript keeps the annotation filename (extension included)
    phonemes_file = os.path.join(phonemes_dir, speaker_dirname, filename)
    sentence: List[str] = GridDataset.load_sentence(
        align_file, char_map=opt.text_char_map
    )
    sentence_str = ''.join(sentence)
    sentence_words = sentence_str.split(' ')
    unique_text_chars.update(sentence_str)

    sentence_phonemes = []  # one phoneme list per word
    flat_sentence_phonemes = []  # all phonemes, words separated by ' '
    has_valid_phonemes = True

    for word in sentence_words:
        # CMUdict lookup; a word may have several pronunciations --
        # the first one is used (same choice as before).
        phoneme_set = pronouncing.phones_for_word(word)
        if len(phoneme_set) == 0:
            # word missing from the pronouncing dictionary: skip pair
            pbar.desc = f'NO-PHONEMES: {word} [{pairs_without_phonemes}]'
            has_valid_phonemes = False
            pairs_without_phonemes += 1
            break

        # BUGFIX: reuse the lookup result instead of calling
        # phones_for_word(word) a second time
        phonemes = phoneme_set[0].split(' ')
        assert len(phonemes) > 0

        for phoneme in phonemes:
            if phoneme in EXCLUDED_PHONEMES:
                has_valid_phonemes = False
                pairs_without_phonemes += 1
                break

            unique_phonemes.add(phoneme)

        if not has_valid_phonemes:
            break

        sentence_phonemes.append(phonemes)
        flat_sentence_phonemes.extend(phonemes)
        flat_sentence_phonemes.append(' ')
        unique_words.add(word)

    if not has_valid_phonemes:
        continue

    # drop the trailing word separator
    if flat_sentence_phonemes[-1] == ' ':
        flat_sentence_phonemes = flat_sentence_phonemes[:-1]

    is_valid_video = (
        (vid_len > 0) and
        (vid_len < MAX_VID_LEN) and
        # video must be long enough relative to the CTC label sequence
        (vid_len > CTC_SCALE * len(flat_sentence_phonemes)) and
        has_valid_phonemes
    )

    if is_valid_video:
        valid_sentence_pairs.append(sentence_pair)
        num_flat_phonemes = len(flat_sentence_phonemes)
        max_valid_vid_len = max(max_valid_vid_len, vid_len)
        max_valid_phonemes_len = max(max_valid_phonemes_len, num_flat_phonemes)

        for word in sentence_words:
            valid_unique_words.add(word)
        for phonemes in sentence_phonemes:
            for phoneme in phonemes:
                valid_unique_phonemes.add(phoneme)

    # one line of space-separated phonemes per word
    raw_phonemes = '\n'.join([
        ' '.join(phonemes) for phonemes in sentence_phonemes
    ])

    # written even when the video itself is invalid (matches original);
    # never overwrites an existing transcript
    if not os.path.exists(phonemes_file):
        # BUGFIX: context manager so the handle is closed promptly
        # instead of leaking until garbage collection
        with open(phonemes_file, 'w') as out_file:
            out_file.write(raw_phonemes)
# Persist the usable "<speaker>/<utterance>" identifiers for training.
valid_pair_dirs = []
for sentence_pair in valid_sentence_pairs:
    speaker_dirname, filename = sentence_pair
    basename, _ = os.path.splitext(filename)
    valid_pair_dirs.append(f'{speaker_dirname}/{basename}')

# BUGFIX: context manager so the output file is flushed and closed
# instead of relying on garbage collection
with open(
    f'../data/{opt.dataset}-CTC{CTC_SCALE}-valid-pairs.txt', 'w'
) as pairs_file:
    pairs_file.write('\n'.join(valid_pair_dirs))
# Summary statistics for the extraction run.
print('VALID PAIRS', len(valid_pair_dirs))
print('VALID UNIQUE WORDS', valid_unique_words)
print('PAIRS W/O PHONEMES', pairs_without_phonemes)
# sorted() accepts any iterable -- the intermediate list() calls were
# redundant; printed output is unchanged
print('UNIQUE PHONEMES', sorted(unique_phonemes))
print('VALID UNIQUE PHONEMES', sorted(valid_unique_phonemes))
print('UNIQUE CHARS', sorted(unique_text_chars))
print('MAX VALID PHONEMES LEN', max_valid_phonemes_len)
print('MAX VALID VID LEN', max_valid_vid_len)
print('>>>')
# print(sentence_pairs[:10])