Remove separe_vocals.py script for audio voice separation
Browse files
- Delete standalone script for voice separation using SpeechBrain and ModelScope
- Remove functions for audio processing, sample rate conversion, and voice extraction
- Likely consolidating or refactoring voice separation functionality elsewhere
- separe_vocals.py +0 -164
separe_vocals.py
DELETED
@@ -1,164 +0,0 @@
|
|
1 |
-
from modelscope.pipelines import pipeline
|
2 |
-
from modelscope.utils.constant import Tasks
|
3 |
-
import soundfile as sf
|
4 |
-
import numpy as np
|
5 |
-
import os
|
6 |
-
import torch
|
7 |
-
import argparse
|
8 |
-
import speechbrain as sb
|
9 |
-
from speechbrain.dataio.dataio import read_audio
|
10 |
-
from speechbrain.pretrained import SepformerSeparation as separator
|
11 |
-
import torchaudio
|
12 |
-
|
13 |
-
# Sample rate (in Hz) passed to the resampling step and used when saving separated tracks.
SAMPLE_RATE = 8000

# Identifiers of the two supported separation backends.
MODEL_SPEECHBRAIN, MODEL_MODELSCOPE = "SPEECHBRAIN", "MODELSCOPE"

# Backend used by the script; assign MODEL_SPEECHBRAIN here to switch backends.
MODEL = MODEL_MODELSCOPE
|
18 |
-
|
19 |
-
def get_sample_rate(audio_file_path):
    """
    Get the sample rate of an audio file

    The value is taken from the file information reported by soundfile,
    so the audio samples themselves are not loaded into memory.

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        int: Sample rate of the audio file
    """
    return sf.info(audio_file_path).samplerate
|
30 |
-
|
31 |
-
def change_sample_rate(input_audio_file_path, output_audio_file_path, sample_rate):
    """
    Change the sample rate of an audio file

    Runs ffmpeg to write a resampled copy of the input file.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file
        sample_rate (int): Sample rate to change to

    Raises:
        FileNotFoundError: If the ffmpeg executable cannot be found
    """
    # Imported locally so this function does not depend on the module's import block.
    import subprocess

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and prevents them from being run as commands.
    command = [
        "ffmpeg",
        "-i", str(input_audio_file_path),
        "-ar", str(sample_rate),
        "-loglevel", "error",
        str(output_audio_file_path),
    ]
    # check=False: as with the previous os.system call, a non-zero ffmpeg
    # exit status is not turned into an exception here.
    subprocess.run(command, check=False)
|
40 |
-
|
41 |
-
def audio_is_stereo(audio_file_path):
    """
    Check if an audio file is stereo

    The channel count is taken from the file information reported by
    soundfile, so the audio samples themselves are not loaded into memory.

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        bool: True if the audio file has exactly two channels, False otherwise
    """
    return sf.info(audio_file_path).channels == 2
|
52 |
-
|
53 |
-
def set_mono(input_audio_file_path, output_audio_file_path):
    """
    Set an audio file to mono

    Runs ffmpeg to write a single-channel copy of the input file.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_audio_file_path (str): Path to the output audio file

    Raises:
        FileNotFoundError: If the ffmpeg executable cannot be found
    """
    # Imported locally so this function does not depend on the module's import block.
    import subprocess

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and prevents them from being run as commands.
    command = [
        "ffmpeg",
        "-i", str(input_audio_file_path),
        "-ac", "1",
        "-loglevel", "error",
        str(output_audio_file_path),
    ]
    # check=False: as with the previous os.system call, a non-zero ffmpeg
    # exit status is not turned into an exception here.
    subprocess.run(command, check=False)
|
61 |
-
|
62 |
-
def write_number_speakers_txt(output_folder, num_speakers):
    """
    Write the number of speakers in a txt file

    The count is written to `speakers.txt` inside `output_folder`; the
    folder is created first if it does not exist yet.

    Args:
        output_folder (str): Path to the output folder
        num_speakers (int): Number of speakers
    """
    # An empty folder means the current directory, which needs no creation
    # (os.makedirs("") would raise).
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, "speakers.txt"), 'w') as f:
        f.write(str(num_speakers))
|
71 |
-
|
72 |
-
def separate_vocals_speechbrain(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a SpeechBrain model

    Every estimated source is saved as
    `<output_folder>/<input name>_speaker<NNN>.wav` and the number of saved
    sources is written to `<output_folder>/speakers.txt`.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Path to the folder where the results are saved
        model: Separation model exposing `separate_file(path=...)`; its
            result is indexed as `[:, :, source]`
    """
    # Base name without folder or extension. Unlike splitting on "." and "/",
    # this works for any folder depth and for names containing extra dots.
    file = os.path.splitext(os.path.basename(input_audio_file_path))[0]

    est_sources = model.separate_file(path=input_audio_file_path)
    num_vocals = est_sources.shape[2]
    for i in range(num_vocals):
        save_file = f'{output_folder}/{file}_speaker{i:03d}.wav'
        torchaudio.save(save_file, est_sources[:, :, i].detach().cpu(), SAMPLE_RATE)

    # Write number of speakers in a txt file
    write_number_speakers_txt(output_folder, num_vocals)
|
86 |
-
|
87 |
-
def separate_vocals_modelscope(input_audio_file_path, output_folder, model):
    """
    Separate the voices of an audio file with a ModelScope pipeline

    The input is converted to SAMPLE_RATE and to mono when needed (as
    temporary files next to the input), the pipeline is run on the result,
    every separated signal is saved as
    `<output_folder>/<input name>_speaker<NNN>.wav`, and the number of saved
    signals is written to `<output_folder>/speakers.txt`. Temporary files
    are removed even if a step fails.

    Args:
        input_audio_file_path (str): Path to the input audio file
        output_folder (str): Path to the folder where the results are saved
        model: Callable taking an audio file path and returning a mapping
            whose 'output_pcm_list' entry holds one buffer per separated
            voice; each buffer is interpreted as int16 samples
    """
    # Get input and output names. os.path handles any folder depth and file
    # names containing extra dots, unlike splitting on "." and "/".
    input_folder, input_file = os.path.split(input_audio_file_path)
    input_name = os.path.splitext(input_file)[0]

    # Set input files with 8k sample rate and mono
    input_8k = os.path.join(input_folder, f"{input_name}_8k.wav")
    input_8k_mono = os.path.join(input_folder, f"{input_name}_8k_mono.wav")

    remove_8k = False
    remove_mono = False
    try:
        # Check if input has 8k sample rate, if not, change it.
        # The function's own argument is used here (not a global variable).
        sr = get_sample_rate(input_audio_file_path)
        if sr != SAMPLE_RATE:
            # Flag first so a partially written file is cleaned up as well
            remove_8k = True
            change_sample_rate(input_audio_file_path, input_8k, SAMPLE_RATE)
        else:
            input_8k = input_audio_file_path

        # Check if input is stereo, if yes, set it to mono
        if audio_is_stereo(input_8k):
            remove_mono = True
            set_mono(input_8k, input_8k_mono)
        else:
            input_8k_mono = input_8k

        # Separate audio voices
        result = model(input_8k_mono)

        # Save separated audio voices
        speakers = 0
        for i, signal in enumerate(result['output_pcm_list']):
            save_file = f'{output_folder}/{input_name}_speaker{i:03d}.wav'
            sf.write(save_file, np.frombuffer(signal, dtype=np.int16), SAMPLE_RATE)
            speakers += 1

        # Write number of speakers in a txt file
        write_number_speakers_txt(output_folder, speakers)
    finally:
        # Remove temporary files. The existence checks keep a failed
        # conversion from masking the original error with a missing-file one.
        if remove_8k and os.path.exists(input_8k):
            os.remove(input_8k)
        if remove_mono and os.path.exists(input_8k_mono):
            os.remove(input_8k_mono)
|
131 |
-
|
132 |
-
if __name__ == '__main__':
    # Command line entry point: load the configured separation backend once,
    # then separate every audio file listed in the inputs file.
    argparser = argparse.ArgumentParser(description='Separate speech from a stereo audio file')
    argparser.add_argument('inputs_file', type=str, help='File with the list of inputs')
    argparser.add_argument('device', type=str, help='Device to use for separation')
    args = argparser.parse_args()

    device = args.device
    if MODEL == MODEL_SPEECHBRAIN:
        # Map the device argument to SpeechBrain run options; for 'cpu' no
        # run options are passed at all.
        if device == 'cpu':
            run_opts = None
        elif 'cuda' in device:
            run_opts = {"device": device}
        elif device == 'gpu':
            run_opts = {"device": "cuda"}
        else:
            raise ValueError(f"Device {device} is not valid")

        hparams_kwargs = {
            "source": "speechbrain/sepformer-whamr",
            "savedir": 'pretrained_models/sepformer-whamr',
        }
        if run_opts is not None:
            hparams_kwargs["run_opts"] = run_opts
        model = separator.from_hparams(**hparams_kwargs)
    elif MODEL == MODEL_MODELSCOPE:
        separation = pipeline(Tasks.speech_separation, model='damo/speech_mossformer_separation_temporal_8k', device=device)
    else:
        raise ValueError(f"Model {MODEL} is not valid")

    # Read files from input file, ignoring blank lines
    with open(args.inputs_file, 'r') as f:
        inputs = [line for line in f.read().splitlines() if line.strip()]

    # Make sure the destination exists before any result is written
    output_folder = "vocals"
    os.makedirs(output_folder, exist_ok=True)

    # NOTE: the loop variable deliberately keeps the name `input`. It is a
    # module-level name here, so rename it only together with any code in
    # this module that reads it.
    for input in inputs:
        if MODEL == MODEL_SPEECHBRAIN:
            separate_vocals_speechbrain(input, output_folder, model)
        else:
            # MODEL was validated above, so this is the ModelScope backend
            separate_vocals_modelscope(input, output_folder, separation)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|