# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import multiprocessing as mp
from itertools import repeat
from pathlib import Path

import librosa
from tqdm import tqdm

from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
from nemo.collections.asr.parts.utils.vad_utils import get_frame_labels, load_speech_segments_from_rttm

"""
This script generates a manifest file for synthetic data generated using the NeMo multispeaker speech data simulator.
The audio created by the simulator can be used to train a VAD model together with the generated manifest file.
The manifest file contains the following fields:
    
    audio_filepath (str): Path to audio file.
    offset (float): Offset in seconds for the start of the audio file.
    duration (float): Duration in seconds for the audio file.
    text (str): Transcription of the audio file.
    label (list): List of frame labels for the audio file.
    orig_sample_rate (int): Original sample rate of the audio file.
    vad_frame_unit_secs (float): Duration in seconds for each frame label.

Usage:
    python build_synthetic_vad_manifest.py \
        --input_dir /path/to/synthetic/data \
        --frame_length 0.04 \
        --output_file /path/to/output/manifest.json
"""


def generate_manifest_entry(inputs):
    """
    Generates a manifest entry for a single audio file. 
    This function is parallelized using multiprocessing.Pool.

    Args:
        inputs (tuple): Tuple containing audio file path and frame length in seconds.
            inputs[0]: 
                audio_filepath (str): Path to audio file.
            inputs[1]: 
                vad_frame_unit_secs (float): Duration in seconds for each frame label.

    Returns:
        entry (dict): Dictionary containing manifest entry.
    """
    audio_filepath, vad_frame_unit_secs = inputs
    audio_filepath = Path(audio_filepath)
    y, sr = librosa.load(str(audio_filepath), sr=None)  # sr=None keeps the file's native sample rate
    dur = librosa.get_duration(y=y, sr=sr)

    manifest_path = audio_filepath.parent / Path(f"{audio_filepath.stem}.json")
    audio_manifest = read_manifest(manifest_path)
    text = " ".join([x["text"] for x in audio_manifest])

    rttm_path = audio_filepath.parent / Path(f"{audio_filepath.stem}.rttm")
    segments = load_speech_segments_from_rttm(rttm_path)
    labels = get_frame_labels(segments, vad_frame_unit_secs, 0.0, dur)

    entry = {
        "audio_filepath": str(audio_filepath.absolute()),
        "offset": 0.0,
        "duration": dur,
        "text": text,
        "label": labels,
        "orig_sample_rate": sr,
        "vad_frame_unit_secs": vad_frame_unit_secs,
    }
    return entry
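
# A minimal single-file usage sketch (hypothetical path), e.g. for checking one entry
# before running the full multiprocessing pipeline in main():
#
#     entry = generate_manifest_entry(("/path/to/synthetic/data/sample_0.wav", 0.04))
#     print(entry["duration"], entry["orig_sample_rate"])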


def main(args):
    wav_list = list(Path(args.input_dir).glob("*.wav"))
    print(f"Found {len(wav_list)} in directory: {args.input_dir}")

    inputs = zip(wav_list, repeat(args.frame_length))
    with mp.Pool(processes=mp.cpu_count()) as pool:
        manifest_data = list(tqdm(pool.imap(generate_manifest_entry, inputs), total=len(wav_list)))

    write_manifest(args.output_file, manifest_data)
    print(f"Manifest saved to: {args.output_file}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input_dir", default=None, help="Path to directory containing synthetic data")
    parser.add_argument(
        "-l", "--frame_length", default=0.04, type=float, help="Duration in seconds for each frame label"
    )
    parser.add_argument("-o", "--output_file", default=None, help="Path to output manifest file")

    args = parser.parse_args()
    main(args)