|
import os
|
|
import glob
|
|
import torch
|
|
import hashlib
|
|
import librosa
|
|
import base64
|
|
from glob import glob
|
|
import numpy as np
|
|
from pydub import AudioSegment
|
|
from faster_whisper import WhisperModel
|
|
import hashlib
|
|
import base64
|
|
import librosa
|
|
|
|
|
|
model_size = "medium"
|
|
|
|
model = None
|
|
def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
|
|
global model
|
|
if model is None:
|
|
model = WhisperModel(model_size, device="cuda", compute_type="float16")
|
|
audio = AudioSegment.from_file(audio_path)
|
|
max_len = len(audio)
|
|
|
|
target_folder = os.path.join(target_dir, audio_name)
|
|
|
|
segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
|
|
segments = list(segments)
|
|
|
|
|
|
os.makedirs(target_folder, exist_ok=True)
|
|
wavs_folder = os.path.join(target_folder, 'wavs')
|
|
os.makedirs(wavs_folder, exist_ok=True)
|
|
|
|
|
|
s_ind = 0
|
|
start_time = None
|
|
|
|
for k, w in enumerate(segments):
|
|
|
|
if k == 0:
|
|
start_time = max(0, w.start)
|
|
|
|
end_time = w.end
|
|
|
|
|
|
if len(w.words) > 0:
|
|
confidence = sum([s.probability for s in w.words]) / len(w.words)
|
|
else:
|
|
confidence = 0.
|
|
|
|
text = w.text.replace('...', '')
|
|
|
|
|
|
audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
|
|
|
|
|
|
fname = f"{audio_name}_seg{s_ind}.wav"
|
|
|
|
|
|
save = audio_seg.duration_seconds > 1.5 and \
|
|
audio_seg.duration_seconds < 20. and \
|
|
len(text) >= 2 and len(text) < 200
|
|
|
|
if save:
|
|
output_file = os.path.join(wavs_folder, fname)
|
|
audio_seg.export(output_file, format='wav')
|
|
|
|
if k < len(segments) - 1:
|
|
start_time = max(0, segments[k+1].start - 0.08)
|
|
|
|
s_ind = s_ind + 1
|
|
return wavs_folder
|
|
|
|
|
|
def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
|
|
SAMPLE_RATE = 16000
|
|
audio_vad = get_audio_tensor(audio_path)
|
|
segments = get_vad_segments(
|
|
audio_vad,
|
|
output_sample=True,
|
|
min_speech_duration=0.1,
|
|
min_silence_duration=1,
|
|
method="silero",
|
|
)
|
|
segments = [(seg["start"], seg["end"]) for seg in segments]
|
|
segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
|
|
print(segments)
|
|
audio_active = AudioSegment.silent(duration=0)
|
|
audio = AudioSegment.from_file(audio_path)
|
|
|
|
for start_time, end_time in segments:
|
|
audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
|
|
|
|
audio_dur = audio_active.duration_seconds
|
|
print(f'after vad: dur = {audio_dur}')
|
|
target_folder = os.path.join(target_dir, audio_name)
|
|
wavs_folder = os.path.join(target_folder, 'wavs')
|
|
os.makedirs(wavs_folder, exist_ok=True)
|
|
start_time = 0.
|
|
count = 0
|
|
num_splits = int(np.round(audio_dur / split_seconds))
|
|
assert num_splits > 0, 'input audio is too short'
|
|
interval = audio_dur / num_splits
|
|
|
|
for i in range(num_splits):
|
|
end_time = min(start_time + interval, audio_dur)
|
|
if i == num_splits - 1:
|
|
end_time = audio_dur
|
|
output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
|
|
audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
|
|
audio_seg.export(output_file, format='wav')
|
|
start_time = end_time
|
|
count += 1
|
|
return wavs_folder
|
|
|
|
def hash_numpy_array(audio_path):
|
|
array, _ = librosa.load(audio_path, sr=None, mono=True)
|
|
|
|
array_bytes = array.tobytes()
|
|
|
|
hash_object = hashlib.sha256(array_bytes)
|
|
hash_value = hash_object.digest()
|
|
|
|
base64_value = base64.b64encode(hash_value)
|
|
return base64_value.decode('utf-8')[:16].replace('/', '_^')
|
|
|
|
def get_se(audio_path, vc_model, target_dir='processed', vad=True):
|
|
device = vc_model.device
|
|
version = vc_model.version
|
|
print("OpenVoice version:", version)
|
|
|
|
audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
|
|
se_path = os.path.join(target_dir, audio_name, 'se.pth')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return vc_model.extract_se([audio_path], se_save_path=se_path), audio_name
|
|
|
|
|