File size: 4,970 Bytes
67c46fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
# Licensed under the MIT license.
#
# This library provides utilities for Kaldi-style data directories.


from __future__ import print_function
import os
import sys
import numpy as np
import subprocess
import librosa as sf
import io
from functools import lru_cache


def load_segments(segments_file):
    """Read a segments file into a structured NumPy array.

    Args:
        segments_file: path to the segments file.

    Returns:
        None when the path does not exist; otherwise a 1-D structured
        array (at least one dimension even for a single row) with the
        fields "utt" and "rec" (object) and "st" and "et" (float32).
    """
    if os.path.exists(segments_file):
        record_dtype = [
            ("utt", "object"),
            ("rec", "object"),
            ("st", "f"),
            ("et", "f"),
        ]
        return np.loadtxt(segments_file, dtype=record_dtype, ndmin=1)
    return None


def load_segments_hash(segments_file):
    """Load a segments file as a dictionary keyed by utterance id.

    Args:
        segments_file: path to a file whose lines are "utt rec st et".

    Returns:
        None if the file does not exist; otherwise a dictionary
        { uttid: (recid, start, end) } with start/end converted to float.

    Raises:
        ValueError: if a non-blank line does not have exactly four
            whitespace-separated fields, or start/end is not a number.
    """
    if not os.path.exists(segments_file):
        return None
    ret = {}
    # "with" closes the file deterministically instead of leaking the handle
    with open(segments_file) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a stray trailing newline)
                continue
            utt, rec, st, et = fields
            ret[utt] = (rec, float(st), float(et))
    return ret


def load_segments_rechash(segments_file):
    """Load a segments file as a dictionary keyed by recording id.

    Args:
        segments_file: path to a file whose lines are "utt rec st et".

    Returns:
        None if the file does not exist; otherwise a dictionary
        { recid: [ {"utt": uttid, "st": start, "et": end}, ... ] } where
        each list keeps the order of the lines in the file and start/end
        are floats.

    Raises:
        ValueError: if a non-blank line does not have exactly four
            whitespace-separated fields, or start/end is not a number.
    """
    if not os.path.exists(segments_file):
        return None
    ret = {}
    # "with" closes the file deterministically instead of leaking the handle
    with open(segments_file) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a stray trailing newline)
                continue
            utt, rec, st, et = fields
            ret.setdefault(rec, []).append(
                {"utt": utt, "st": float(st), "et": float(et)}
            )
    return ret


def load_wav_scp(wav_scp_file):
    """return dictionary { rec: wav_rxfilename }

    Each line is split on the first run of whitespace only, so the
    wav_rxfilename part may itself contain spaces (e.g. a piped command).
    Blank lines are ignored.

    Raises:
        IndexError: if a non-blank line has a rec id but no rxfilename.
    """
    # "with" closes the file deterministically instead of leaking the handle
    with open(wav_scp_file) as f:
        entries = [line.strip().split(None, 1) for line in f]
    return {entry[0]: entry[1] for entry in entries if entry}


@lru_cache(maxsize=1)
def load_wav(wav_rxfilename, start=0, end=None):
    """This function reads audio file and return data in numpy.float32 array.
    "lru_cache" holds recently loaded audio so that can be called
    many times on the same audio file.
    OPTIMIZE: controls lru_cache size for random access,
    considering memory size

    Args:
        wav_rxfilename: one of
            - a command ending with "|" whose stdout is the audio stream,
            - "-" to read the audio stream from stdin,
            - a path to an audio file.
        start: index of the first sample to return.
        end: index one past the last sample to return (None = to the end).

    Returns:
        (data, samplerate) as returned by the audio loader, with data
        sliced to [start:end].

    Notes:
        - The audio module bound to ``sf`` in this file is librosa, whose
          ``load`` has no ``start``/``stop`` keywords; every branch therefore
          loads the whole signal and slices it afterwards.
        - NOTE(review): ``load`` is called with its default target sample
          rate, so start/end index the signal as returned by the loader.
          Pass ``sr=None`` if the file's native rate is intended - confirm.
        - The cached tuple is returned as-is on a cache hit, so callers
          should not modify the array in place.
    """
    if wav_rxfilename.endswith("|"):
        # input piped command
        # NOTE: the command string is executed by the shell as-is; it must
        # come from a trusted wav.scp.
        p = subprocess.Popen(wav_rxfilename[:-1], shell=True, stdout=subprocess.PIPE)
        # communicate() drains stdout AND waits for the child, so no
        # zombie process / open pipe is left behind
        audio_bytes, _ = p.communicate()
        source = io.BytesIO(audio_bytes)
    elif wav_rxfilename == "-":
        # stdin: read raw bytes (sys.stdin itself is a text stream) into a
        # seekable in-memory buffer
        source = io.BytesIO(sys.stdin.buffer.read())
    else:
        # normal wav file
        source = wav_rxfilename
    data, samplerate = sf.load(source, dtype="float32")
    # the loader offers no sample-indexed seek, so slice after loading
    return data[start:end], samplerate


def load_utt2spk(utt2spk_file):
    """returns dictionary { uttid: spkid }

    Each line is split on the first run of whitespace; blank lines are
    ignored.

    Raises:
        IndexError: if a non-blank line has an uttid but no spkid.
    """
    # "with" closes the file deterministically instead of leaking the handle
    with open(utt2spk_file) as f:
        entries = [line.strip().split(None, 1) for line in f]
    return {entry[0]: entry[1] for entry in entries if entry}


def load_spk2utt(spk2utt_file):
    """returns dictionary { spkid: list of uttids }

    Returns None if the file does not exist. Blank lines are ignored; a
    line holding only a spkid maps to an empty list.
    """
    if not os.path.exists(spk2utt_file):
        return None
    # "with" closes the file deterministically instead of leaking the handle
    with open(spk2utt_file) as f:
        entries = [line.split() for line in f]
    return {entry[0]: entry[1:] for entry in entries if entry}


def load_reco2dur(reco2dur_file):
    """returns dictionary { recid: duration }

    Returns None if the file does not exist. Durations are converted to
    float; blank lines are ignored.

    Raises:
        IndexError: if a non-blank line has a recid but no duration.
        ValueError: if a duration field is not a number.
    """
    if not os.path.exists(reco2dur_file):
        return None
    # "with" closes the file deterministically instead of leaking the handle
    with open(reco2dur_file) as f:
        entries = [line.strip().split(None, 1) for line in f]
    return {entry[0]: float(entry[1]) for entry in entries if entry}


def process_wav(wav_rxfilename, process):
    """Build a piped rxfilename that runs `process` on the given input.

    Args:
        wav_rxfilename: input; a piped command ending with "|", the
                stdin marker "-", or a plain file name
        process: command that reads stdin and writes stdout, so it can
                be chained through a pipe
    Returns:
        wav_rxfilename: piped command string ending with "|"
    """
    if not wav_rxfilename.endswith("|"):
        # plain file or stdin "-": feed it to the process through cat
        template = "cat {} | {} |"
        return template.format(wav_rxfilename, process)
    # already a piped command: chain the process onto the open pipe
    chained = wav_rxfilename + process
    return chained + "|"


def extract_segments(wavs, segments=None):
    """Generate (utterance id, audio array) pairs of segmented audio.

    Args:
        wavs: mapping { rec: wav_rxfilename }
        segments: iterable of records with "utt", "rec", "st" and "et"
            entries (st/et in seconds), or None
    Yields:
        (utt, data) where data is the slice of the recording between
        the rounded start and end sample; without segments, (rec, data)
        for every entry of wavs with the audio unsliced.

    TODO?: sampling rate is not converted.
    """
    if segments is None:
        # no segments available:
        # every wav.scp entry is treated as one already-segmented audio
        for rec in wavs:
            audio, _rate = load_wav(wavs[rec])
            yield rec, audio
        return

    # segments should be sorted by rec-id
    for seg in segments:
        rxfilename = wavs[seg["rec"]]
        audio, rate = load_wav(rxfilename)
        # seconds -> nearest sample index
        first = np.rint(seg["st"] * rate).astype(int)
        last = np.rint(seg["et"] * rate).astype(int)
        yield seg["utt"], audio[first:last]


class KaldiData:
    """Metadata of one data directory, loaded eagerly at construction.

    Attributes (each filled by the module-level loader of the same file):
        data_dir: the directory path given to the constructor
        segments: result of load_segments_rechash on "segments"
        utt2spk: result of load_utt2spk on "utt2spk"
        wavs: result of load_wav_scp on "wav.scp"
        reco2dur: result of load_reco2dur on "reco2dur"
        spk2utt: result of load_spk2utt on "spk2utt"
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir

        def in_dir(name):
            # path of a metadata file inside the data directory
            return os.path.join(self.data_dir, name)

        self.segments = load_segments_rechash(in_dir("segments"))
        self.utt2spk = load_utt2spk(in_dir("utt2spk"))
        self.wavs = load_wav_scp(in_dir("wav.scp"))
        self.reco2dur = load_reco2dur(in_dir("reco2dur"))
        self.spk2utt = load_spk2utt(in_dir("spk2utt"))

    def load_wav(self, recid, start=0, end=None):
        """Load audio of recording `recid` via the module-level load_wav.

        Returns:
            (data, rate) for the rxfilename registered under `recid`.
        """
        rxfilename = self.wavs[recid]
        samples, sample_rate = load_wav(rxfilename, start, end)
        return samples, sample_rate