"""Denoise a recording, diarize it, and write one audio file per speaker."""
import argparse
import os

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
from tqdm import tqdm

from utils.denoise_pipeline import denoise
from utils.diarization_pipeline import diarization


def filter_small_speech(segments, min_share=0.015):
    """Drop the segments of speakers whose share of total speech is negligible.

    Args:
        segments: DataFrame with ``start``, ``end`` and ``label`` columns.
        min_share: A label is kept only when its summed duration divided by
            the summed duration of all labels is strictly greater than this.

    Returns:
        A new DataFrame holding the rows of the retained labels, with an
        added ``duration`` column (``end - start``). The input frame is left
        unmodified.
    """
    # ``assign`` works on a copy, so the caller's frame is not mutated.
    segments = segments.assign(duration=segments['end'] - segments['start'])
    # Aggregate only ``duration``: summing every column breaks (or silently
    # concatenates strings) when the frame carries non-numeric columns.
    per_label = segments.groupby('label')['duration'].sum()
    keep = per_label.index[per_label / per_label.sum() > min_share]
    return segments[segments['label'].isin(keep)]


def save_speaker_audios(segments, denoised_audio_path, out_folder='out', out_f=48000):
    """Write one WAV file per speaker label, built from that label's segments.

    Args:
        segments: Anything ``pd.DataFrame`` accepts that yields ``start``,
            ``end`` and ``label`` columns; ``start``/``end`` are multiplied by
            ``out_f`` to obtain sample offsets.
        denoised_audio_path: Path of the audio file to cut the segments from.
        out_folder: Directory for the per-speaker files (created if missing).
        out_f: Sample rate the audio is loaded at and written with.

    Returns:
        Paths of the written files in order of each label's first appearance,
        truncated to the first 10 entries. Empty when there are no segments.
    """
    os.makedirs(out_folder, exist_ok=True)

    segments = pd.DataFrame(segments)
    if segments.empty:
        # Nothing was diarized: skip decoding the audio and the column
        # lookups that would fail on a frame without columns.
        return []

    signal, _ = librosa.load(denoised_audio_path, sr=out_f, mono=True)
    segments = filter_small_speech(segments)

    out_wav_paths = []
    # ``sort=False`` keeps labels in order of first appearance, so the result
    # (and what survives the truncation below) is the same on every run.
    for label, speaker_segments in segments.groupby('label', sort=False):
        chunks = [
            signal[int(start * out_f):int(end * out_f)]
            for start, end in zip(speaker_segments['start'], speaker_segments['end'])
        ]
        out_wav_path = f'{out_folder}/{label}.wav'
        sf.write(out_wav_path, np.concatenate(chunks), out_f, subtype='PCM_24')
        out_wav_paths.append(out_wav_path)

    # NOTE(review): every speaker file is written, but only the first 10
    # paths are returned -- presumably a cap expected by a caller; confirm.
    return out_wav_paths[:10]


def main_pipeline(audio_path, out_folder='out'):
    """Run denoising, diarization and per-speaker export for one audio file.

    Args:
        audio_path: Path of the input audio file.
        out_folder: Directory that receives the per-speaker files.

    Returns:
        A tuple ``(denoised_audio_path, speaker_audio_paths)``: the value
        returned by ``denoise`` and the list from ``save_speaker_audios``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    denoised_audio_path = denoise(audio_path, device)
    segments = diarization(denoised_audio_path)
    denoised_audio_paths = save_speaker_audios(segments, denoised_audio_path, out_folder)
    return denoised_audio_path, denoised_audio_paths


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--audio-path', default='dialog.mp3', help='Path to audio')
    parser.add_argument('--out-folder-path', default='out', help='Path to result folder')
    opt = parser.parse_args()

    # NOTE(review): the pipeline is run 10 times on the same input, with each
    # run overwriting the previous output -- looks like a benchmarking
    # leftover; confirm before reducing it to a single call.
    for _ in tqdm(range(10)):
        main_pipeline(audio_path=opt.audio_path, out_folder=opt.out_folder_path)