Maximofn committed on
Commit
befe577
·
1 Parent(s): 2a1684e

Remove separe_vocals.py script for audio voice separation

Browse files

- Delete standalone script for voice separation using SpeechBrain and ModelScope
- Remove functions for audio processing, sample rate conversion, and voice extraction
- Likely consolidating or refactoring voice separation functionality elsewhere

Files changed (1) hide show
  1. separe_vocals.py +0 -164
separe_vocals.py DELETED
@@ -1,164 +0,0 @@
import argparse
import os
import subprocess

import numpy as np
import soundfile as sf
import speechbrain as sb
import torch
import torchaudio
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from speechbrain.dataio.dataio import read_audio
from speechbrain.pretrained import SepformerSeparation as separator
-
13
# Sample rate (Hz) expected by the separation models.
SAMPLE_RATE = 8000

# Identifiers for the two supported separation back-ends; toggle MODEL to switch.
MODEL_SPEECHBRAIN = "SPEECHBRAIN"
MODEL_MODELSCOPE = "MODELSCOPE"
# MODEL = MODEL_SPEECHBRAIN
MODEL = MODEL_MODELSCOPE
18
-
19
def get_sample_rate(audio_file_path):
    """
    Get the sample rate of an audio file.

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        int: Sample rate of the audio file
    """
    # sf.info only parses the file header, avoiding the previous full
    # decode (sf.read) of potentially large audio just to read one field.
    return sf.info(audio_file_path).samplerate
30
-
31
def change_sample_rate(input_audio_file_path, output_audio_file_path, sample_rate):
    """
    Change the sample rate of an audio file using ffmpeg.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file
        sample_rate (int): Target sample rate in Hz

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Argument list + shell=False is safe for paths containing spaces or
    # shell metacharacters, unlike the previous os.system() string; check=True
    # surfaces ffmpeg failures instead of silently ignoring them.
    subprocess.run(
        ['ffmpeg', '-i', input_audio_file_path,
         '-ar', str(sample_rate),
         '-loglevel', 'error',
         output_audio_file_path],
        check=True,
    )
40
-
41
def audio_is_stereo(audio_file_path):
    """
    Check if an audio file is stereo (exactly two channels).

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        bool: True if the audio file is stereo, False otherwise
    """
    # Header-only inspection via sf.info replaces decoding the whole file
    # (sf.read) just to count channels.
    return sf.info(audio_file_path).channels == 2
52
-
53
def set_mono(input_audio_file_path, output_audio_file_path):
    """
    Downmix an audio file to mono using ffmpeg.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Same hardening as change_sample_rate: no shell interpolation, and
    # failures raise instead of being swallowed by os.system().
    subprocess.run(
        ['ffmpeg', '-i', input_audio_file_path,
         '-ac', '1',
         '-loglevel', 'error',
         output_audio_file_path],
        check=True,
    )
61
-
62
def write_number_speakers_txt(output_folder, num_speakers):
    """
    Record how many speakers were detected.

    Writes the count as plain text into ``speakers.txt`` inside the given
    folder, overwriting any previous value.

    Args:
        output_folder (str): Path to the output folder
        num_speakers (int): Number of speakers
    """
    speakers_file = f"{output_folder}/speakers.txt"
    with open(speakers_file, 'w') as out:
        out.write(str(num_speakers))
71
-
72
def separate_vocals_speechbrain(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a SpeechBrain Sepformer model.

    Saves one wav per separated source into ``output_folder`` and records
    the speaker count via write_number_speakers_txt.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Folder that receives the per-speaker wav files
        model: A SpeechBrain SepformerSeparation instance
    """
    # os.path-based stem extraction handles nested directories and filenames
    # containing extra dots; the old chained split(".")/split("/") raised
    # ValueError on anything but exactly "dir/name.ext".
    file = os.path.splitext(os.path.basename(input_audio_file_path))[0]

    est_sources = model.separate_file(path=input_audio_file_path)
    # Third tensor dimension indexes the separated sources.
    num_vocals = est_sources.shape[2]
    speakers = 0
    for i in range(num_vocals):
        save_file = f'{output_folder}/{file}_speaker{i:003d}.wav'
        torchaudio.save(save_file, est_sources[:, :, i].detach().cpu(), SAMPLE_RATE)
        speakers += 1

    # Write number of speakers in a txt file
    write_number_speakers_txt(output_folder, speakers)
86
-
87
def separate_vocals_modelscope(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a ModelScope separation pipeline.

    Normalizes the input to 8 kHz mono (via temporary files that are removed
    afterwards), runs the pipeline, writes one 16-bit PCM wav per speaker into
    ``output_folder``, and records the speaker count.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Folder that receives the per-speaker wav files
        model: A ModelScope speech-separation pipeline callable
    """
    # os.path-based parsing handles nested directories and extra dots; the old
    # chained split(".")/split("/") unpacking raised ValueError on such paths.
    input_folder = os.path.dirname(input_audio_file_path)
    input_name = os.path.splitext(os.path.basename(input_audio_file_path))[0]

    # Candidate temporary files at 8k sample rate and mono
    input_8k = f"{input_folder}/{input_name}_8k.wav"
    input_8k_mono = f"{input_folder}/{input_name}_8k_mono.wav"

    # Resample to 8 kHz if needed.
    # BUG FIX: the original called change_sample_rate(input, ...) and assigned
    # input_8k = input — referencing the builtin `input` (or a global leaked
    # from the __main__ loop) instead of this function's parameter.
    sr = get_sample_rate(input_audio_file_path)
    if sr != SAMPLE_RATE:
        change_sample_rate(input_audio_file_path, input_8k, SAMPLE_RATE)
        remove_8k = True
    else:
        input_8k = input_audio_file_path
        remove_8k = False

    # Downmix to mono if the (possibly resampled) file is stereo
    if audio_is_stereo(input_8k):
        set_mono(input_8k, input_8k_mono)
        remove_mono = True
    else:
        input_8k_mono = input_8k
        remove_mono = False

    # Separate audio voices
    result = model(input_8k_mono)

    # Save separated audio voices (raw PCM buffers -> int16 wav)
    speakers = 0
    for i, signal in enumerate(result['output_pcm_list']):
        save_file = f'{output_folder}/{input_name}_speaker{i:003d}.wav'
        sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
        speakers += 1

    # Write number of speakers in a txt file
    write_number_speakers_txt(output_folder, speakers)

    # Remove temporary files
    if remove_8k:
        os.remove(input_8k)
    if remove_mono:
        os.remove(input_8k_mono)
131
-
132
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
    parser.add_argument('inputs_file', type=str, help='File with the list of inputs')
    parser.add_argument('device', type=str, help='Device to use for separation')
    args = parser.parse_args()

    # Build the selected separation model once; it is reused for every input.
    device = args.device
    if MODEL == MODEL_SPEECHBRAIN:
        # Translate the CLI device string into SpeechBrain run options.
        if device == 'cpu':
            run_opts = None
        elif 'cuda' in device:
            run_opts = {"device": f"{device}"}
        elif device == 'gpu':
            run_opts = {"device": "cuda"}
        else:
            raise ValueError(f"Device {device} is not valid")
        if run_opts is None:
            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr')
        else:
            model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr', run_opts=run_opts)
    elif MODEL == MODEL_MODELSCOPE:
        separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
    else:
        raise ValueError(f"Model {MODEL} is not valid")

    # Read the list of audio files to process, one path per line.
    with open(args.inputs_file, 'r') as f:
        inputs = f.read().splitlines()

    output_folder = "vocals"
    # NOTE(review): the loop variable `input` shadows the builtin, and the
    # original separate_vocals_modelscope accidentally relies on this global;
    # rename it only together with a fix inside that function.
    for input in inputs:
        if MODEL == MODEL_SPEECHBRAIN:
            separate_vocals_speechbrain(input, output_folder, model)
        elif MODEL == MODEL_MODELSCOPE:
            separate_vocals_modelscope(input, output_folder, separation)
        else:
            raise ValueError(f"Model {MODEL} is not valid")