File size: 4,034 Bytes
82334b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Copyright 2024 LY Corporation

# LY Corporation licenses this file to you under the Apache License,
# version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at:

#   https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import argparse
import sys
from pathlib import Path
from shutil import copy2

import numpy as np
import soundfile as sf
from common import getLogger, load_libritts_spk_metadata
from tqdm.auto import tqdm

format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"


def get_parser():
    parser = argparse.ArgumentParser(
        description="Finalize MFA and LibriTTS-R data",
    )
    parser.add_argument(
        "in_dir", type=str, help="LibriTTS per-speaker restructured data root"
    )
    parser.add_argument("mfa_dir", type=str, help="MFA output directory")
    parser.add_argument("out_dir", type=str, help="Output directory")
    parser.add_argument("--debug", action="store_true", help="Debug")

    return parser


def read_and_save(in_file, out_file):
    # let's make sure to have int16 dtype for saved files
    x, sr = sf.read(in_file)
    assert sr == 24000
    if x.dtype == np.float32 or x.dtype == np.float64:
        assert np.abs(x).max() <= 1.0
        x = (x * 32767).astype(np.int16)
    assert x.dtype == np.int16
    sf.write(out_file, x, sr)


if __name__ == "__main__":
    args = get_parser().parse_args(sys.argv[1:])

    spk2meta = load_libritts_spk_metadata(debug=args.debug)
    in_dir = Path(args.in_dir)
    mfa_dir = Path(args.mfa_dir)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(exist_ok=True, parents=True)

    logger = getLogger(
        verbose=100, filename="log/finalize_mfa.log", name="finalize_mfa"
    )

    total_duration = 0
    missing_utt_ids = []
    for spk, _ in tqdm(spk2meta.items()):
        spk_in_dir = in_dir / spk
        spk_mfa_dir = mfa_dir / spk

        if not spk_in_dir.exists():
            logger.warning(f"No input dir for {spk}")
            continue

        out_tgr_dir = out_dir / spk / "textgrid"
        out_wav_dir = out_dir / spk / "wav24k"
        out_txt_dir = out_dir / spk
        for d in [out_tgr_dir, out_wav_dir, out_txt_dir]:
            d.mkdir(exist_ok=True, parents=True)

        org_wav_files = sorted(list(spk_in_dir.glob("*.wav")))
        org_utt_ids = [f.stem for f in org_wav_files]

        textgrid_files = sorted(list(spk_mfa_dir.glob("*.TextGrid")))
        # valid utt_ids
        utt_ids = [f.stem for f in textgrid_files]
        wav_files = [spk_in_dir / f"{utt_id}.wav" for utt_id in utt_ids]

        if len(org_utt_ids) != len(utt_ids):
            spk_missing_utt_ids = list(set(org_utt_ids) - set(utt_ids))
            logger.warning(f"Missing {len(spk_missing_utt_ids)} utt_ids for {spk}")
            missing_utt_ids.extend(spk_missing_utt_ids)

        phones = {}
        for utt_id in utt_ids:
            # wav
            in_wav_file = spk_in_dir / f"{utt_id}.wav"
            assert in_wav_file.exists()
            out_wav_file = out_wav_dir / f"{utt_id}.wav"
            read_and_save(in_wav_file, out_wav_file)

            # textgrid
            in_textgrid_file = spk_mfa_dir / f"{utt_id}.TextGrid"
            assert in_textgrid_file.exists()
            out_textgrid_file = out_tgr_dir / f"{utt_id}.TextGrid"
            copy2(in_textgrid_file, out_textgrid_file)

    logger.info(f"Total duration: {total_duration/3600:.2f} hours")
    logger.info(f"Numbere of missing utterance IDs: {len(missing_utt_ids)}")

    # Write missing_utt_ids.txt
    with open(out_dir / "missing_utt_ids.txt", "w") as f:
        for utt_id in missing_utt_ids:
            f.write(f"{utt_id}\n")