File size: 5,377 Bytes
df07554
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import sys

sys.path.append('..')

import options
import os.path
import pronouncing
import options as opt

from Loader import GridLoader
from tqdm.auto import tqdm
from dataset import GridDataset
from typing import List

# File extensions accepted as alignment/annotation files.
VALID_FILE_EXT = ('.txt', '.align')
# Pronunciations containing these tokens cause the whole pair to be skipped.
EXCLUDED_PHONEMES = ('foreign', 'french')
# Videos with this many frames or more are rejected as too long.
MAX_VID_LEN = 100
# CTC needs the input sequence longer than the target; a video is kept
# only if vid_len > CTC_SCALE * (flattened phoneme length).
CTC_SCALE = 2

# Dataset directories are resolved relative to the repository root ('..').
base = os.path.abspath('..')
anno_dir = os.path.join(base, options.alignments_dir)
phonemes_dir = os.path.join(base, options.phonemes_dir)
images_dir = os.path.join(base, options.images_dir)
# One sub-directory per speaker; sorted for deterministic iteration order.
speaker_dirnames = sorted(os.listdir(anno_dir))

# (speaker_dirname, filename) tuples; `valid_sentence_pairs` is the subset
# that passes the phoneme and video-length checks in the main loop below.
valid_sentence_pairs = []
sentence_pairs = []

# Enumerate every alignment file on disk as a (speaker, filename) pair.
for speaker_dirname in tqdm(speaker_dirnames):
    speaker_dir = os.path.join(anno_dir, speaker_dirname)

    for filename in os.listdir(speaker_dir):
        # Keep only alignment/annotation files.
        _, ext = os.path.splitext(filename)
        if ext not in VALID_FILE_EXT:
            continue

        sentence_pairs.append((speaker_dirname, filename))

# Sort for a deterministic processing order across runs.
sentence_pairs = sorted(sentence_pairs)
# The progress bar doubles as a status line (its `desc` is updated below
# whenever a word has no pronunciation).
pbar = tqdm(sentence_pairs)
# Pairs rejected because a word had no (usable) CMU-dict pronunciation.
pairs_without_phonemes = 0
# Longest video (in frames) among the accepted pairs.
max_valid_vid_len = 0
# Longest flattened phoneme sequence among the accepted pairs.
max_valid_phonemes_len = 0

# Vocabulary statistics over all pairs vs. accepted pairs only.
unique_phonemes = set()
valid_unique_phonemes = set()
unique_text_chars = set()
unique_words = set()
valid_unique_words = set()
max_length = 0  # NOTE(review): never read anywhere below — candidate for removal

# For each (speaker, utterance) pair: load its sentence, convert words to
# CMU phonemes, filter out pairs without usable pronunciations, validate
# the video length against the CTC constraint, gather vocabulary stats,
# and cache the phoneme transcription to disk.
for sentence_pair in pbar:
    speaker_dirname, filename = sentence_pair
    basename, _ = os.path.splitext(filename)
    align_file = os.path.join(anno_dir, speaker_dirname, filename)

    # Count the extracted .jpg frames for this utterance's video.
    vid_images_dir = os.path.join(images_dir, speaker_dirname, basename)
    image_filenames = [
        image_filename for image_filename in os.listdir(vid_images_dir)
        if image_filename.endswith('.jpg')
    ]
    vid_len = len(image_filenames)

    # Ensure the per-speaker phoneme cache directory exists.
    phonemes_speaker_dir = os.path.join(phonemes_dir, speaker_dirname)
    if not os.path.exists(phonemes_speaker_dir):
        os.mkdir(phonemes_speaker_dir)

    phonemes_file = os.path.join(phonemes_dir, speaker_dirname, filename)
    sentence: List[str] = GridDataset.load_sentence(
        align_file, char_map=opt.text_char_map
    )

    sentence_str = ''.join(sentence)
    sentence_words = sentence_str.split(' ')
    sentence_phonemes = []        # per-word phoneme lists
    flat_sentence_phonemes = []   # phonemes flattened, words separated by ' '
    has_valid_phonemes = True

    unique_text_chars.update(sentence_str)

    for word in sentence_words:
        phoneme_set = pronouncing.phones_for_word(word)
        if len(phoneme_set) == 0:
            # Word missing from the CMU dict: reject the whole pair.
            pbar.desc = f'NO-PHONEMES: {word} [{pairs_without_phonemes}]'
            has_valid_phonemes = False
            pairs_without_phonemes += 1
            break

        # Use the first listed pronunciation (was looked up twice before;
        # reuse the result of the single lookup above).
        phonemes = phoneme_set[0].split(' ')
        assert len(phonemes) > 0

        for phoneme in phonemes:
            if phoneme in EXCLUDED_PHONEMES:
                # Excluded token: reject the pair; note the excluded
                # phoneme itself is NOT added to unique_phonemes.
                has_valid_phonemes = False
                pairs_without_phonemes += 1
                break

            unique_phonemes.add(phoneme)

        if not has_valid_phonemes:
            break

        sentence_phonemes.append(phonemes)
        flat_sentence_phonemes.extend(phonemes)
        flat_sentence_phonemes.append(' ')

        unique_words.add(word)

    if not has_valid_phonemes:
        continue

    # Drop the trailing word separator.
    if flat_sentence_phonemes[-1] == ' ':
        flat_sentence_phonemes = flat_sentence_phonemes[:-1]

    # CTC requires strictly more input frames than target symbols
    # (scaled by CTC_SCALE for head-room).
    is_valid_video = (
        (vid_len > 0) and
        (vid_len < MAX_VID_LEN) and
        (vid_len > CTC_SCALE * len(flat_sentence_phonemes)) and
        has_valid_phonemes
    )

    if is_valid_video:
        valid_sentence_pairs.append(sentence_pair)
        num_flat_phonemes = len(flat_sentence_phonemes)

        max_valid_vid_len = max(max_valid_vid_len, vid_len)
        max_valid_phonemes_len = max(max_valid_phonemes_len, num_flat_phonemes)

        valid_unique_words.update(sentence_words)
        for phonemes in sentence_phonemes:
            valid_unique_phonemes.update(phonemes)

    # Cache the per-word phoneme transcription, one word per line.
    raw_phonemes = '\n'.join(
        ' '.join(phonemes) for phonemes in sentence_phonemes
    )

    if not os.path.exists(phonemes_file):
        # Context manager closes the handle deterministically
        # (the original left it to the garbage collector).
        with open(phonemes_file, 'w') as phonemes_fp:
            phonemes_fp.write(raw_phonemes)

# Persist the accepted pairs as "speaker/utterance" lines and report stats.
valid_pair_dirs = []
for sentence_pair in valid_sentence_pairs:
    speaker_dirname, filename = sentence_pair
    basename, _ = os.path.splitext(filename)
    valid_pair_dirs.append(f'{speaker_dirname}/{basename}')

valid_pairs_path = f'../data/{opt.dataset}-CTC{CTC_SCALE}-valid-pairs.txt'
# Context manager closes the output file deterministically
# (the original relied on garbage collection to close it).
with open(valid_pairs_path, 'w') as valid_pairs_fp:
    valid_pairs_fp.write('\n'.join(valid_pair_dirs))

print('VALID PAIRS', len(valid_pair_dirs))
print('VALID UNIQUE WORDS', valid_unique_words)
print('PAIRS W/O PHONEMES', pairs_without_phonemes)
print('UNIQUE PHONEMES', sorted(unique_phonemes))
print('VALID UNIQUE PHONEMES', sorted(valid_unique_phonemes))
print('UNIQUE CHARS', sorted(unique_text_chars))
print('MAX VALID PHONEMES LEN', max_valid_phonemes_len)
print('MAX VALID VID LEN', max_valid_vid_len)
print('>>>')