|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
This script converts a filelist file where each line contains |
|
<absolute path of wav file> to a manifest json file. |
|
Optionally post processes the manifest file to create dev and train split for speaker embedding |
|
training, also optionally segment an audio file in to segments of random DURATIONS and create those |
|
wav files in CWD. |
|
|
|
Args: |
|
--filelist: path to file containing list of audio files |
|
--manifest(optional): if you already have manifest file, but would like to process it for creating |
|
segments and splitting then use manifest ignoring filelist |
|
--id: index of speaker label in filename present in filelist file that is separated by '/' |
|
--out: output manifest file name |
|
--split: if you would want to split the manifest file for training purposes |
|
you may not need this for test set. output file names is <out>_<train/dev>.json, defaults to False |
|
--create_segments: if you would want to segment each manifest line to segments of [1,2,3,4] sec or less |
|
you may not need this for test set, defaults to False |
|
--min_spkrs_count: min number of samples per speaker to consider and ignore otherwise, defaults to 0 (all speakers) |
|
""" |
|
|
|
import argparse |
|
import json |
|
import os |
|
import random |
|
|
|
import librosa as l |
|
import numpy as np |
|
import soundfile as sf |
|
import sox |
|
from sklearn.model_selection import StratifiedShuffleSplit |
|
from tqdm.contrib.concurrent import process_map |
|
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest |
|
|
|
random.seed(42) |
|
|
|
DURATIONS = sorted([3], reverse=True) |
|
MIN_ENERGY = 0.01 |
|
CWD = os.getcwd() |
|
|
|
|
|
def filter_manifest_line(manifest_line): |
|
split_manifest = [] |
|
audio_path = manifest_line['audio_filepath'] |
|
start = manifest_line.get('offset', 0) |
|
dur = manifest_line['duration'] |
|
label = manifest_line['label'] |
|
endname = os.path.splitext(audio_path.split(label, 1)[-1])[0] |
|
to_path = os.path.join(CWD, 'segments', label) |
|
to_path = os.path.join(to_path, endname[1:]) |
|
os.makedirs(os.path.dirname(to_path), exist_ok=True) |
|
|
|
if dur >= min(DURATIONS): |
|
signal, sr = sf.read(audio_path) |
|
remaining_dur = dur - start |
|
|
|
segments = DURATIONS.copy() |
|
mode = int(remaining_dur // sum(DURATIONS)) |
|
rem = remaining_dur % sum(DURATIONS) |
|
segments = mode * segments |
|
|
|
for val in DURATIONS: |
|
if rem >= val: |
|
segments.append(val) |
|
rem = rem - val |
|
|
|
for temp_dur in segments: |
|
segment_audio = signal[int(start * sr) : int(start * sr + temp_dur * sr)] |
|
if l.feature.rms(y=segment_audio).mean() > MIN_ENERGY: |
|
final_string = '_' + str(start) + '_' + str(temp_dur) |
|
final_string = final_string.replace('.', '-') |
|
to_file = to_path + final_string + '.wav' |
|
|
|
c_start = int(float(start * sr)) |
|
c_end = c_start + int(float(temp_dur * sr)) |
|
segment = signal[c_start:c_end] |
|
sf.write(to_file, segment, sr) |
|
|
|
meta = manifest_line.copy() |
|
meta['audio_filepath'] = to_file |
|
meta['offset'] = 0 |
|
meta['duration'] = temp_dur |
|
split_manifest.append(meta) |
|
|
|
start = start + temp_dur |
|
|
|
return split_manifest |
|
|
|
|
|
def count_and_consider_only(speakers, lines, min_count=10): |
|
""" |
|
consider speakers only if samples per speaker is at least min_count |
|
""" |
|
uniq_speakers, indices, counts = np.unique(speakers, return_index=True, return_counts=True) |
|
print("speaker count before filtering minimum number of speaker counts: ", len(uniq_speakers)) |
|
required_speakers = {} |
|
for idx, count in enumerate(counts): |
|
if count >= min_count: |
|
required_speakers[uniq_speakers[idx]] = count |
|
|
|
print("speaker count after filtering minimum number of speaker counts: ", len(required_speakers)) |
|
required_lines = [] |
|
speakers_only = [] |
|
for idx, speaker in enumerate(speakers): |
|
if speaker in required_speakers: |
|
required_lines.append(lines[idx]) |
|
speakers_only.append(speaker) |
|
|
|
return speakers_only, required_lines |
|
|
|
|
|
def write_file(name, lines, idx): |
|
with open(name, 'w', encoding='utf-8') as fout: |
|
for i in idx: |
|
dic = lines[i] |
|
json.dump(dic, fout) |
|
fout.write('\n') |
|
print("wrote", name) |
|
|
|
|
|
def read_file(filelist, id=-1): |
|
json_lines = [] |
|
with open(filelist, 'r') as fo: |
|
lines = fo.readlines() |
|
lines = sorted(lines) |
|
for line in lines: |
|
line = line.strip() |
|
speaker = line.split('/')[id] |
|
speaker = list(speaker) |
|
speaker = ''.join(speaker) |
|
meta = {"audio_filepath": line, "offset": 0, "duration": None, "label": speaker} |
|
json_lines.append(meta) |
|
return json_lines |
|
|
|
|
|
def get_duration(json_line): |
|
dur = json_line['duration'] |
|
if dur is None: |
|
wav_path = json_line['audio_filepath'] |
|
json_line['duration'] = sox.file_info.duration(wav_path) |
|
return json_line |
|
|
|
|
|
def get_labels(lines): |
|
labels = [] |
|
for line in lines: |
|
label = line['label'] |
|
labels.append(label) |
|
return labels |
|
|
|
|
|
def main(filelist, manifest, id, out, split=False, create_segments=False, min_count=10): |
|
if os.path.exists(out): |
|
os.remove(out) |
|
if filelist: |
|
lines = read_file(filelist=filelist, id=id) |
|
lines = process_map(get_duration, lines, chunksize=100) |
|
out_file = os.path.splitext(filelist)[0] + '_manifest.json' |
|
write_file(out_file, lines, range(len(lines))) |
|
else: |
|
lines = read_manifest(manifest) |
|
|
|
lines = process_map(get_duration, lines, chunksize=100) |
|
|
|
if create_segments: |
|
print(f"creating and writing segments to {CWD}") |
|
lines = process_map(filter_manifest_line, lines, chunksize=100) |
|
temp = [] |
|
for line in lines: |
|
temp.extend(line) |
|
del lines |
|
lines = temp |
|
|
|
speakers = [x['label'] for x in lines] |
|
|
|
if min_count: |
|
speakers, lines = count_and_consider_only(speakers, lines, abs(min_count)) |
|
|
|
write_file(out, lines, range(len(lines))) |
|
path = os.path.dirname(out) |
|
if split: |
|
speakers = [x['label'] for x in lines] |
|
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42) |
|
for train_idx, test_idx in sss.split(speakers, speakers): |
|
print("number of train samples after split: ", len(train_idx)) |
|
|
|
out = os.path.join(path, 'train.json') |
|
write_file(out, lines, train_idx) |
|
out = os.path.join(path, 'dev.json') |
|
write_file(out, lines, test_idx) |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument("--filelist", help="path to filelist file", type=str, required=False, default=None) |
|
parser.add_argument("--manifest", help="manifest file name", type=str, required=False, default=None) |
|
parser.add_argument( |
|
"--id", |
|
help="field num seperated by '/' to be considered as speaker label from filelist file, can be ignored if manifest file is already provided with labels", |
|
type=int, |
|
required=False, |
|
default=None, |
|
) |
|
parser.add_argument("--out", help="manifest_file name", type=str, required=True) |
|
parser.add_argument( |
|
"--split", |
|
help="bool if you would want to split the manifest file for training purposes", |
|
required=False, |
|
action='store_true', |
|
) |
|
parser.add_argument( |
|
"--create_segments", |
|
help="bool if you would want to segment each manifest line to segments of 4 sec or less", |
|
required=False, |
|
action='store_true', |
|
) |
|
parser.add_argument( |
|
"--min_spkrs_count", |
|
default=0, |
|
type=int, |
|
help="min number of samples per speaker to consider and ignore otherwise", |
|
) |
|
|
|
args = parser.parse_args() |
|
|
|
main( |
|
args.filelist, args.manifest, args.id, args.out, args.split, args.create_segments, args.min_spkrs_count, |
|
) |
|
|