# denoise_and_diarization/main_pipeline.py
import argparse
import os

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
from tqdm import tqdm

from utils.denoise_pipeline import denoise
from utils.diarization_pipeline import diarization

def filter_small_speech(segments):
    """Drop speakers whose total speech is less than 1.5% of all detected speech."""
    segments['duration'] = segments.end - segments.start
    durs = segments.groupby('label').sum()
    # Keep only labels whose share of the total spoken duration exceeds 1.5%.
    labels = durs[durs['duration'] / durs.sum()['duration'] > 0.015].index
    return segments[segments.label.isin(labels)]
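
# A minimal, hypothetical example of the expected input: a DataFrame of diarization
# segments with 'start'/'end' times in seconds and a speaker 'label'. The values and
# label names below are made up for illustration; SPEAKER_01 holds roughly 0.8% of the
# total speech, so it falls under the 1.5% threshold and is filtered out.
#
#   segments = pd.DataFrame([
#       {'start': 0.0, 'end': 100.0, 'label': 'SPEAKER_00'},
#       {'start': 100.0, 'end': 100.8, 'label': 'SPEAKER_01'},
#   ])
#   filter_small_speech(segments)  # keeps only the SPEAKER_00 rows
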
def save_speaker_audios(segments, denoised_audio_path, out_folder='out', out_f=48000):
    """Cut the denoised audio into one wav file per detected speaker."""
    signal, _ = librosa.load(denoised_audio_path, sr=out_f, mono=True)
    os.makedirs(out_folder, exist_ok=True)
    out_wav_paths = []
    segments = pd.DataFrame(segments)
    segments = filter_small_speech(segments)
    for label in set(segments.label):
        temp_df = segments[segments.label == label]
        output_signal = []
        # Concatenate every segment attributed to this speaker.
        for _, r in temp_df.iterrows():
            start = int(r["start"] * out_f)
            end = int(r["end"] * out_f)
            output_signal.append(signal[start:end])
        out_wav_path = f'{out_folder}/{label}.wav'
        sf.write(out_wav_path, np.concatenate(output_signal), out_f, 'PCM_24')
        out_wav_paths.append(out_wav_path)
    # Return at most ten speaker files.
    return out_wav_paths[:10]

def main_pipeline(audio_path, out_folder='out'):
    """Denoise the input audio, run diarization, and export per-speaker wav files."""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    denoised_audio_path = denoise(audio_path, device)
    segments = diarization(denoised_audio_path)
    denoised_audio_paths = save_speaker_audios(segments, denoised_audio_path, out_folder)
    return denoised_audio_path, denoised_audio_paths
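
# Hypothetical usage sketch: the paths below are placeholders, and the actual name of
# the denoised file depends on utils.denoise_pipeline.denoise.
#
#   denoised_path, speaker_paths = main_pipeline('dialog.mp3', out_folder='out')
#   # speaker_paths -> up to ten files of the form 'out/<label>.wav', one per kept speaker
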
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--audio-path', default='dialog.mp3', help='Path to audio')
    parser.add_argument('--out-folder-path', default='out', help='Path to result folder')
    opt = parser.parse_args()
    # Run the full pipeline ten times on the same input, with a progress bar.
    for _ in tqdm(range(10)):
        main_pipeline(audio_path=opt.audio_path, out_folder=opt.out_folder_path)
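
# Example invocation from the command line (the audio file path is a placeholder):
#   python main_pipeline.py --audio-path dialog.mp3 --out-folder-path out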