File size: 3,740 Bytes
31ad50e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import argparse
import os
from typing import Tuple

from scipy.io import wavfile
from torchaudio.datasets import LIBRITTS
from tqdm import tqdm


def load_libritts_item(
    fileid: str,
    path: str,
    ext_audio: str,
    ext_original_txt: str,
    ext_normalized_txt: str,
) -> Tuple[int, int, None, str, int, int, str]:
    """Read the metadata of one utterance without returning its waveform.

    ``fileid`` must consist of exactly four ``_``-separated fields, the first
    two being the speaker id and the chapter id. The utterance files are
    looked up under ``<path>/<speaker_id>/<chapter_id>/`` and are named
    ``fileid`` plus the respective suffix.

    Args:
        fileid: Utterance file id; also used as the file stem.
        path: Directory that contains the per-speaker sub-directories.
        ext_audio: Suffix appended to ``fileid`` to name the audio file.
        ext_original_txt: Suffix of the original-text file. Accepted only to
            keep the signature stable; the original text is not loaded.
        ext_normalized_txt: Suffix appended to ``fileid`` to name the
            normalized-text file.

    Returns:
        ``(n_frames, sample_rate, None, normalized_text, speaker_id,
        chapter_id, utterance_id)``. ``n_frames`` is the length of the first
        axis of the decoded audio, the third element (original text) is
        always ``None``, ``normalized_text`` is the first line of the
        normalized-text file without its trailing newline, and
        ``utterance_id`` is ``fileid`` itself.

    Raises:
        ValueError: If ``fileid`` does not split into exactly four fields, or
            the speaker/chapter fields are not integers.
    """
    # Unpacking into four names keeps the strict "exactly four fields" check
    # even though only the first two fields are used.
    speaker_id, chapter_id, _segment, _index = fileid.split("_")
    utterance_id = fileid

    utterance_dir = os.path.join(path, speaker_id, chapter_id)
    audio_path = os.path.join(utterance_dir, utterance_id + ext_audio)
    normalized_path = os.path.join(utterance_dir, utterance_id + ext_normalized_txt)

    # Only the length of the audio is needed, not the samples themselves.
    sample_rate, wav = wavfile.read(audio_path)
    n_frames = wav.shape[0]

    # Strip the line terminator so callers that print() the text do not emit
    # an extra blank line per utterance.
    with open(normalized_path, "r", encoding="utf-8") as ft:
        normalized_text = ft.readline().rstrip("\n")

    return (
        n_frames,
        sample_rate,
        None,  # original text is intentionally not loaded
        normalized_text,
        int(speaker_id),
        int(chapter_id),
        utterance_id,
    )


class LIBRITTS_16K(LIBRITTS):
    """LIBRITTS variant whose items carry the waveform length, not the waveform."""

    def __getitem__(self, n: int) -> Tuple[int, int, None, str, int, int, str]:
        """Load the metadata of the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            (int, int, None, str, int, int, str):
            ``(waveform_length, sample_rate, original_text, normalized_text, speaker_id, chapter_id, utterance_id)``
            where ``original_text`` is always ``None`` because
            ``load_libritts_item`` does not read the original-text file.
        """
        fileid = self._walker[n]
        return load_libritts_item(
            fileid,
            self._path,
            self._ext_audio,
            self._ext_original_txt,
            self._ext_normalized_txt,
        )


def get_parser():
    """Build the command-line parser for the manifest script.

    Returns:
        An ``argparse.ArgumentParser`` with the positional ``root`` argument
        and the ``--dest``, ``--split``, ``--wav-root`` and
        ``--spkemb-npy-dir`` options.
    """
    # (flags, keyword settings) for every argument, in registration order.
    option_specs = (
        (
            ("root",),
            dict(metavar="DIR", help="root directory containing wav files to index"),
        ),
        (
            ("--dest",),
            dict(default=".", type=str, metavar="DIR", help="output directory"),
        ),
        (
            ("--split",),
            dict(required=True, type=str, help="dataset splits"),
        ),
        (
            ("--wav-root",),
            dict(
                default=None,
                type=str,
                metavar="DIR",
                help="saved waveform root directory for tsv",
            ),
        ),
        (
            ("--spkemb-npy-dir",),
            dict(required=True, type=str, help="speaker embedding directory"),
        ),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli

def main(args):
    """Write the ``<split>.tsv`` manifest and ``<split>.txt`` transcripts.

    The first line of the TSV is ``args.wav_root``; every following line is
    ``<utt_file>\\t<n_frames>\\t<spk_file>`` with both paths relative to
    ``args.wav_root``. The TXT file receives one normalized transcript per
    utterance, in the same order as the TSV entries.

    Args:
        args: Parsed command-line namespace providing ``root``, ``dest``,
            ``split``, ``wav_root`` and ``spkemb_npy_dir``.

    Raises:
        ValueError: If ``args.wav_root`` is missing or an utterance is not
            sampled at 16000 Hz.
        FileNotFoundError: If an utterance waveform or its speaker embedding
            does not exist under ``args.wav_root``.
    """
    dest_dir = args.dest
    wav_root = args.wav_root
    # Fail before touching the filesystem: without a waveform root the
    # manifest header would be the literal "None" and every path check below
    # would crash inside os.path.join.
    if wav_root is None:
        raise ValueError("--wav-root is required to build the tsv manifest")
    os.makedirs(dest_dir, exist_ok=True)

    dataset = LIBRITTS_16K(
        os.path.dirname(args.root),
        url=args.split,
        folder_in_archive=os.path.basename(args.root),
    )
    tsv_path = os.path.join(dest_dir, f"{args.split}.tsv")
    txt_path = os.path.join(dest_dir, f"{args.split}.txt")

    # The context manager closes both files even when a check below raises.
    with open(tsv_path, "w", encoding="utf-8") as tsv_f, open(
        txt_path, "w", encoding="utf-8"
    ) as txt_f:
        print(wav_root, file=tsv_f)

        for n_frames, sr, _ori_text, norm_text, spk_id, chap_id, utt_id in tqdm(
            dataset, desc="tsv/txt/wav"
        ):
            # Explicit raises instead of assert: these are data checks and
            # must survive running under ``python -O``.
            if sr != 16000:
                raise ValueError(f"sampling rate {sr} != 16000")
            utt_file = os.path.join(args.split, f"{spk_id}", f"{chap_id}", f"{utt_id}.wav")
            spk_file = os.path.join(args.spkemb_npy_dir, f"{spk_id}-{chap_id}-{utt_id}.npy")
            for rel_path in (utt_file, spk_file):
                full_path = os.path.join(wav_root, rel_path)
                if not os.path.exists(full_path):
                    raise FileNotFoundError(full_path)

            print(f"{utt_file}\t{n_frames}\t{spk_file}", file=tsv_f)
            # Drop a trailing newline so the TXT file keeps exactly one line
            # per TSV entry.
            print(norm_text.rstrip("\n"), file=txt_f)


if __name__ == "__main__":
    # Script entry point: parse the command line and build the manifests.
    main(get_parser().parse_args())