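"""Speaker separation pipeline.

Denoises an input recording, runs speaker diarization on the denoised audio,
and writes one WAV file per detected speaker.
"""
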
import argparse
import os

import librosa
import numpy as np
import pandas as pd
import soundfile as sf
import torch
from tqdm import tqdm

from utils.denoise_pipeline import denoise
from utils.diarization_pipeline import diarization


def filter_small_speech(segments):
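    """Keep only speakers whose total speech time exceeds 1.5% of the overall speech duration."""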
    segments['duration'] = segments.end - segments.start
    durs = segments.groupby('label')['duration'].sum()
    labels = durs[durs / durs.sum() > 0.015].index
    return segments[segments.label.isin(labels)]


def save_speaker_audios(segments, denoised_audio_path, out_folder='out', out_f=48000):
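    """Concatenate each speaker's segments and write them as separate WAV files.

    The denoised audio is resampled to ``out_f`` Hz before slicing; at most
    the first 10 speaker file paths are returned.
    """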
    signal, sr = librosa.load(denoised_audio_path, sr=out_f, mono=True)
    os.makedirs(out_folder, exist_ok=True)
    out_wav_paths = []

    segments = pd.DataFrame(segments)
    segments = filter_small_speech(segments)

    for label in set(segments.label):
        temp_df = segments[segments.label == label]
        output_signal = []
        for _, r in temp_df.iterrows():
            start = int(r["start"] * out_f)
            end = int(r["end"] * out_f)
            output_signal.append(signal[start:end])

        out_wav_path = f'{out_folder}/{label}.wav'
        sf.write(out_wav_path, np.concatenate(output_signal), out_f, 'PCM_24')
        out_wav_paths.append(out_wav_path)

    return out_wav_paths[:10]


def main_pipeline(audio_path, out_folder='out'):
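    """Denoise the audio at ``audio_path``, run speaker diarization, and export per-speaker WAVs.

    Returns the denoised audio path and the list of per-speaker WAV paths.
    """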
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    denoised_audio_path = denoise(audio_path, device)
    segments = diarization(denoised_audio_path)
    denoised_audio_paths = save_speaker_audios(segments, denoised_audio_path, out_folder)
    return denoised_audio_path, denoised_audio_paths


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--audio-path', default='dialog.mp3', help='Path to audio')
    parser.add_argument('--out-folder-path', default='out', help='Path to result folder')
    opt = parser.parse_args()

    for _ in tqdm(range(10)):
        main_pipeline(audio_path=opt.audio_path, out_folder=opt.out_folder_path)