|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
import json |
|
import os |
|
from glob import glob |
|
|
|
import numpy as np |
|
from scipy.io import wavfile |
|
from tqdm import tqdm |
|
|
|
# Command-line interface of the script. Every option is read from the parsed
# namespace that is passed around as ``args``.
parser = argparse.ArgumentParser(description="Cut audio on the segments based on segments")
parser.add_argument("--output_dir", type=str, help="Path to output directory", required=True)
parser.add_argument(
    "--alignment",
    type=str,
    required=True,
    help="Path to a data directory with alignments or a single .txt file with timestamps - result of the ctc-segmentation",
)
parser.add_argument("--threshold", type=float, default=-5, help="Minimum score value accepted")
# The offset is divided by 1000 before being added to the segment timestamps
# (which are in seconds), so the value is expressed in milliseconds.
parser.add_argument("--offset", type=int, default=0, help="Offset, ms")
# NOTE(review): --batch_size is not referenced by the code visible in this file;
# it is kept so existing command lines keep working.
parser.add_argument("--batch_size", type=int, default=64, help="Batch size for inference")
parser.add_argument(
    "--edge_duration",
    type=float,
    help="Duration of audio for mean absolute value calculation at the edges, s",
    default=0.05,
)
parser.add_argument("--sample_rate", type=int, help="Sample rate, Hz", default=16000)
parser.add_argument(
    "--max_duration",
    type=int,
    help="Maximum audio duration (seconds). Samples that are longer will be dropped",
    default=60,
)
|
|
|
|
|
def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
    """Cut the original audio file into audio segments based on alignment_file.

    The alignment file holds the path to the original audio file on a line
    without ``|`` separators, followed by one line per segment of the form
    ``<start> <end> <score> | <processed text> | <text without preprocessing>
    | <normalized text>``. Start and end are in seconds.

    Segments whose score is at least ``args.threshold`` are written to
    ``clips_dir`` as ``<base_name>_<index>.wav`` and appended to ``manifest``
    as one JSON object per line. Lower-scored segments only contribute to the
    low-score duration. Segments longer than ``args.max_duration`` and empty
    segments are skipped.

    Args:
        alignment_file: path to the file with segmented text and corresponding time stamps.
            The first line of the file contains the path to the original audio file
        manifest: path to .json manifest to save segments metadata (opened in append mode)
        clips_dir: path to a directory to save audio clips
        args: main script args; ``offset`` (ms), ``edge_duration``, ``sample_rate``,
            ``max_duration``, ``threshold`` and ``output_dir`` are read

    Returns:
        Tuple ``(args.output_dir, base_name, original duration, high score
        duration, low score duration, deleted duration)`` with all durations
        rounded to whole seconds.

    Raises:
        ValueError: if ``alignment_file`` does not exist or does not contain
            the path to the original audio file.
        AssertionError: if a segment is about to be saved and the audio is
            not single-channel or its sampling rate differs from ``args.sample_rate``.
    """
    if not os.path.exists(alignment_file):
        raise ValueError(f"{alignment_file} not found")

    base_name = os.path.basename(alignment_file).replace("_segments.txt", "")
    # The offset is given in milliseconds while the timestamps are in seconds.
    offset_sec = args.offset / 1000

    audio_file = None
    segments = []
    ref_text_processed = []
    ref_text_no_preprocessing = []
    ref_text_normalized = []
    # Read as utf8 to match the encoding used when writing the manifest.
    with open(alignment_file, "r", encoding="utf8") as f:
        for line in f:
            # A blank line would otherwise be mistaken for the audio path line
            # and silently replace the real path with an empty string.
            if not line.strip():
                continue
            fields = line.split("|")

            # The only line without "|" separators is the path to the audio.
            if len(fields) == 1:
                audio_file = fields[0].strip()
                continue
            ref_text_processed.append(fields[1].strip())
            ref_text_no_preprocessing.append(fields[2].strip())
            ref_text_normalized.append(fields[3].strip())
            timing = fields[0].split()
            segments.append((float(timing[0]) + offset_sec, float(timing[1]) + offset_sec, float(timing[2])))

    if audio_file is None:
        raise ValueError(f"{alignment_file} does not contain the path to the original audio file")

    sampling_rate, signal = wavfile.read(audio_file)
    original_duration = len(signal) / sampling_rate

    # Number of samples at each edge used for the mean absolute value stats.
    num_samples = int(args.edge_duration * args.sample_rate)
    low_score_dur = 0
    high_score_dur = 0
    with open(manifest, "a", encoding="utf8") as f:
        for i, (st, end, score) in enumerate(segments):
            segment = signal[round(st * sampling_rate) : round(end * sampling_rate)]
            duration = len(segment) / sampling_rate
            if duration > args.max_duration:
                continue
            if duration > 0:
                text_processed = ref_text_processed[i].strip()
                text_no_preprocessing = ref_text_no_preprocessing[i].strip()
                text_normalized = ref_text_normalized[i].strip()
                if score >= args.threshold:
                    # Validate before writing so a failed check does not
                    # leave a stray clip on disk.
                    assert len(signal.shape) == 1 and sampling_rate == args.sample_rate, "check sampling rate"

                    high_score_dur += duration
                    audio_filepath = os.path.join(clips_dir, f"{base_name}_{i:04}.wav")
                    wavfile.write(audio_filepath, sampling_rate, segment)

                    info = {
                        "audio_filepath": audio_filepath,
                        "duration": duration,
                        "text": text_processed,
                        "text_no_preprocessing": text_no_preprocessing,
                        "text_normalized": text_normalized,
                        "score": round(score, 2),
                        "start_abs": float(np.mean(np.abs(segment[:num_samples]))),
                        "end_abs": float(np.mean(np.abs(segment[-num_samples:]))),
                    }
                    json.dump(info, f, ensure_ascii=False)
                    f.write("\n")
                else:
                    low_score_dur += duration

    # Keep track of the audio that is not covered by any segment: the gaps
    # between consecutive segments and the tail after the last one.
    del_duration = 0
    begin = 0

    for st, end, _ in segments:
        # Gaps of 10 ms or less are ignored.
        if st - begin > 0.01:
            gap = signal[int(begin * sampling_rate) : int(st * sampling_rate)]
            del_duration += len(gap) / sampling_rate
        begin = end

    tail = signal[int(begin * sampling_rate) :]
    del_duration += len(tail) / sampling_rate

    stats = (
        args.output_dir,
        base_name,
        round(original_duration),
        round(high_score_dur),
        round(low_score_dur),
        round(del_duration),
    )
    return stats
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: cut every alignment file into clips, write a shared
    # manifest and a per-file stats table, then print a summary.
    args = parser.parse_args()
    print("Splitting audio files into segments...")

    # --alignment is either a directory of *_segments.txt files or one file.
    if os.path.isdir(args.alignment):
        alignment_files = glob(f"{args.alignment}/*_segments.txt")
    else:
        alignment_files = [args.alignment]

    # Create the output folders.
    args.output_dir = os.path.abspath(args.output_dir)
    clips_dir = os.path.join(args.output_dir, "clips")
    manifest_dir = os.path.join(args.output_dir, "manifests")
    os.makedirs(clips_dir, exist_ok=True)
    os.makedirs(manifest_dir, exist_ok=True)

    # process_alignment appends to the manifest, so start from a clean file.
    manifest = os.path.join(manifest_dir, "manifest.json")
    if os.path.exists(manifest):
        os.remove(manifest)

    stats_file = os.path.join(args.output_dir, "stats.tsv")
    with open(stats_file, "w", encoding="utf8") as f:
        f.write("Folder\tSegment\tOriginal dur (s)\tHigh quality dur (s)\tLow quality dur (s)\tDeleted dur (s)\n")

        high_score_dur = 0
        low_score_dur = 0
        del_duration = 0
        original_dur = 0

        for alignment_file in tqdm(alignment_files):
            stats = process_alignment(alignment_file, manifest, clips_dir, args)
            # The last four entries are the original, high score, low score
            # and deleted durations, in that order.
            original_dur += stats[-4]
            high_score_dur += stats[-3]
            low_score_dur += stats[-2]
            del_duration += stats[-1]
            stats = "\t".join([str(t) for t in stats]) + "\n"
            f.write(stats)

        # Emit one value per header column so the totals line up with the
        # per-file rows (the "Segment" column is left empty).
        f.write(
            f"Total\t\t{round(original_dur)}\t{round(high_score_dur)}"
            f"\t{round(low_score_dur)}\t{round(del_duration)}"
        )

    def _percent(part):
        # Share of the original duration; 0 when no audio was processed, to
        # avoid dividing by zero.
        return round(part / original_dur * 100) if original_dur else 0

    print(f"Original duration  : {round(original_dur / 60)}min")
    print(f"High score segments: {round(high_score_dur / 60)}min ({_percent(high_score_dur)}%)")
    print(f"Low score segments : {round(low_score_dur / 60)}min ({_percent(low_score_dur)}%)")
    print(f"Deleted segments   : {round(del_duration / 60)}min ({_percent(del_duration)}%)")
    print(f"Stats saved at {stats_file}")
    print(f"Manifest saved at {manifest}")
|
|