import os
import re
import sys
import codecs
import librosa
import logging

import numpy as np
import soundfile as sf

from pydub import AudioSegment

sys.path.append(os.getcwd())

from main.tools import huggingface
from main.configs.config import Config

for l in ["httpx", "httpcore"]:
    logging.getLogger(l).setLevel(logging.ERROR)

translations = Config().translations
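
# check_predictors downloads the F0 predictor weights required by the chosen pitch extraction
# method (RMVPE, FCPE, the CREPE/Mangio-CREPE variants, or the WORLD model shared by harvest/dio).
# Hybrid methods such as "hybrid[rmvpe+fcpe]" are unpacked and every sub-method is fetched;
# files already present under assets/models/predictors are skipped.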
def check_predictors(method, f0_onnx=False):
    if f0_onnx and method not in ["harvest", "dio"]: method += "-onnx"

    def download(predictors):
        predictor_path = os.path.join("assets", "models", "predictors", predictors)
        if not os.path.exists(predictor_path): huggingface.HF_download_file(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/cerqvpgbef/", "rot13") + predictors, predictor_path)
    model_dict = {
        **dict.fromkeys(["rmvpe", "rmvpe-legacy"], "rmvpe.pt"),
        **dict.fromkeys(["rmvpe-onnx", "rmvpe-legacy-onnx"], "rmvpe.onnx"),
        **dict.fromkeys(["fcpe"], "fcpe.pt"),
        **dict.fromkeys(["fcpe-legacy"], "fcpe_legacy.pt"),
        **dict.fromkeys(["fcpe-onnx"], "fcpe.onnx"),
        **dict.fromkeys(["fcpe-legacy-onnx"], "fcpe_legacy.onnx"),
        **dict.fromkeys(["crepe-full", "mangio-crepe-full"], "crepe_full.pth"),
        **dict.fromkeys(["crepe-full-onnx", "mangio-crepe-full-onnx"], "crepe_full.onnx"),
        **dict.fromkeys(["crepe-large", "mangio-crepe-large"], "crepe_large.pth"),
        **dict.fromkeys(["crepe-large-onnx", "mangio-crepe-large-onnx"], "crepe_large.onnx"),
        **dict.fromkeys(["crepe-medium", "mangio-crepe-medium"], "crepe_medium.pth"),
        **dict.fromkeys(["crepe-medium-onnx", "mangio-crepe-medium-onnx"], "crepe_medium.onnx"),
        **dict.fromkeys(["crepe-small", "mangio-crepe-small"], "crepe_small.pth"),
        **dict.fromkeys(["crepe-small-onnx", "mangio-crepe-small-onnx"], "crepe_small.onnx"),
        **dict.fromkeys(["crepe-tiny", "mangio-crepe-tiny"], "crepe_tiny.pth"),
        **dict.fromkeys(["crepe-tiny-onnx", "mangio-crepe-tiny-onnx"], "crepe_tiny.onnx"),
        **dict.fromkeys(["harvest", "dio"], "world.pth")
    }
    if "hybrid" in method:
        methods_str = re.search(r"hybrid\[(.+)\]", method)
        methods = [m.strip() for m in methods_str.group(1).split("+")] if methods_str else []

        for method in methods:
            if method in model_dict: download(model_dict[method])
    elif method in model_dict: download(model_dict[method])
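
# check_embedders fetches the speech embedder (HuBERT / ContentVec family) for the requested
# backend: a .pt checkpoint for fairseq, a .onnx graph for onnx, or a model.safetensors plus
# config.json directory for transformers. The "spin" mode is remapped to the transformers backend.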
def check_embedders(hubert, embedders_mode="fairseq"):
    huggingface_url = codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/rzorqqref/", "rot13")
    if embedders_mode == "spin": embedders_mode, hubert = "transformers", "spin"

    if hubert in ["contentvec_base", "hubert_base", "japanese_hubert_base", "korean_hubert_base", "chinese_hubert_base", "portuguese_hubert_base", "spin"]:
        if embedders_mode == "fairseq": hubert += ".pt"
        elif embedders_mode == "onnx": hubert += ".onnx"

        model_path = os.path.join("assets", "models", "embedders", hubert)

        if embedders_mode == "fairseq":
            if not os.path.exists(model_path): huggingface.HF_download_file("".join([huggingface_url, "fairseq/", hubert]), model_path)
        elif embedders_mode == "onnx":
            if not os.path.exists(model_path): huggingface.HF_download_file("".join([huggingface_url, "onnx/", hubert]), model_path)
        elif embedders_mode == "transformers":
            url, hubert = ("transformers/", hubert) if hubert != "spin" else ("spin", "")
            bin_file = os.path.join(model_path, "model.safetensors")
            config_file = os.path.join(model_path, "config.json")

            os.makedirs(model_path, exist_ok=True)

            if not os.path.exists(bin_file): huggingface.HF_download_file("".join([huggingface_url, url, hubert, "/model.safetensors"]), bin_file)
            if not os.path.exists(config_file): huggingface.HF_download_file("".join([huggingface_url, url, hubert, "/config.json"]), config_file)
        else: raise ValueError(translations["option_not_valid"])
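
# check_spk_diarization downloads the Whisper checkpoint of the requested size together with
# the SpeechBrain speaker-embedding files used for diarization, skipping files already on disk.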
def check_spk_diarization(model_size):
    whisper_model = os.path.join("assets", "models", "speaker_diarization", "models", f"{model_size}.pt")
    if not os.path.exists(whisper_model): huggingface.HF_download_file("".join([codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/fcrnxre_qvnevmngvba/", "rot13"), model_size, ".pt"]), whisper_model)

    speechbrain_path = os.path.join("assets", "models", "speaker_diarization", "models", "speechbrain")
    if not os.path.exists(speechbrain_path): os.makedirs(speechbrain_path, exist_ok=True)

    for f in ["classifier.ckpt", "config.json", "embedding_model.ckpt", "hyperparams.yaml", "mean_var_norm_emb.ckpt"]:
        speechbrain_model = os.path.join(speechbrain_path, f)
        if not os.path.exists(speechbrain_model): huggingface.HF_download_file(codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/fcrnxre_qvnevmngvba/fcrrpuoenva/", "rot13") + f, speechbrain_model)
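
# check_audioldm2 mirrors the AudioLDM2 pipeline layout (feature extractor, language model,
# projection model, scheduler, text encoders, tokenizers, UNet, VAE, vocoder) under
# assets/models/audioldm2/<model> and downloads any component file that is missing.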
def check_audioldm2(model):
    for f in ["feature_extractor", "language_model", "projection_model", "scheduler", "text_encoder", "text_encoder_2", "tokenizer", "tokenizer_2", "unet", "vae", "vocoder"]:
        folder_path = os.path.join("assets", "models", "audioldm2", model, f)
        if not os.path.exists(folder_path): os.makedirs(folder_path, exist_ok=True)
    for f in [
        "feature_extractor/preprocessor_config.json",
        "language_model/config.json", "language_model/model.safetensors",
        "model_index.json",
        "projection_model/config.json", "projection_model/diffusion_pytorch_model.safetensors",
        "scheduler/scheduler_config.json",
        "text_encoder/config.json", "text_encoder/model.safetensors",
        "text_encoder_2/config.json", "text_encoder_2/model.safetensors",
        "tokenizer/merges.txt", "tokenizer/special_tokens_map.json", "tokenizer/tokenizer.json", "tokenizer/tokenizer_config.json", "tokenizer/vocab.json",
        "tokenizer_2/special_tokens_map.json", "tokenizer_2/spiece.model", "tokenizer_2/tokenizer.json", "tokenizer_2/tokenizer_config.json",
        "unet/config.json", "unet/diffusion_pytorch_model.safetensors",
        "vae/config.json", "vae/diffusion_pytorch_model.safetensors",
        "vocoder/config.json", "vocoder/model.safetensors"
    ]:
        model_path = os.path.join("assets", "models", "audioldm2", model, f)
        if not os.path.exists(model_path): huggingface.HF_download_file("".join([codecs.decode("uggcf://uhttvatsnpr.pb/NauC/Ivrganzrfr-EIP-Cebwrpg/erfbyir/znva/nhqvbyqz/", "rot13"), model, "/", f]), model_path)
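
# load_audio reads an audio file with soundfile and falls back to librosa if that fails,
# downmixes to mono, resamples to the requested rate, and optionally applies STFT-based
# formant shifting before returning the audio as a flat 1-D array.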
def load_audio(logger, file, sample_rate=16000, formant_shifting=False, formant_qfrency=0.8, formant_timbre=0.8):
    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        if not os.path.isfile(file): raise FileNotFoundError(translations["not_found"].format(name=file))

        try:
            logger.debug(translations['read_sf'])
            audio, sr = sf.read(file, dtype=np.float32)
        except Exception:
            logger.debug(translations['read_librosa'])
            audio, sr = librosa.load(file, sr=None)

        if len(audio.shape) > 1: audio = librosa.to_mono(audio.T)
        if sr != sample_rate: audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate, res_type="soxr_vhq")

        if formant_shifting:
            from main.library.algorithm.stftpitchshift import StftPitchShift

            pitchshifter = StftPitchShift(1024, 32, sample_rate)
            audio = pitchshifter.shiftpitch(audio, factors=1, quefrency=formant_qfrency * 1e-3, distortion=formant_timbre)
    except Exception as e:
        raise RuntimeError(f"{translations['errors_loading_audio']}: {e}")

    return audio.flatten()
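
# pydub_convert reinterprets an AudioSegment's raw_data as 16-bit PCM and rebuilds a new
# AudioSegment from it; the dtype check is a defensive no-op here, since np.frombuffer with
# dtype=np.int16 always yields int16 samples.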
def pydub_convert(audio):
    samples = np.frombuffer(audio.raw_data, dtype=np.int16)
    if samples.dtype != np.int16: samples = (samples * 32767).astype(np.int16)

    return AudioSegment(samples.tobytes(), frame_rate=audio.frame_rate, sample_width=samples.dtype.itemsize, channels=audio.channels)
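
# pydub_load picks the pydub decoder from the file extension (.wav/.mp3/.ogg) and falls back
# to the generic AudioSegment.from_file loader for other formats or on any decoding error.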
def pydub_load(input_path):
    try:
        if input_path.endswith(".wav"): audio = AudioSegment.from_wav(input_path)
        elif input_path.endswith(".mp3"): audio = AudioSegment.from_mp3(input_path)
        elif input_path.endswith(".ogg"): audio = AudioSegment.from_ogg(input_path)
        else: audio = AudioSegment.from_file(input_path)
    except Exception:
        audio = AudioSegment.from_file(input_path)

    return audio
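
# load_embedders_model loads the embedder selected above: fairseq checkpoints through the
# bundled fairseq loader, ONNX graphs through onnxruntime, and transformers models through a
# HubertModel subclass that restores the final projection layer. It returns
# (model, saved_config, file_suffix), where saved_config is only populated for fairseq models.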
def load_embedders_model(embedder_model, embedders_mode="fairseq", providers=None):
    if embedders_mode == "fairseq": embedder_model += ".pt"
    elif embedders_mode == "onnx": embedder_model += ".onnx"
    elif embedders_mode == "spin": embedders_mode, embedder_model = "transformers", "spin"

    embedder_model_path = os.path.join("assets", "models", "embedders", embedder_model)
    if not os.path.exists(embedder_model_path): raise FileNotFoundError(f"{translations['not_found'].format(name=translations['model'])}: {embedder_model}")

    try:
        if embedders_mode == "fairseq":
            from main.library.architectures import fairseq

            models, saved_cfg, _ = fairseq.load_model(embedder_model_path)
            embed_suffix = ".pt"
            hubert_model = models[0]
        elif embedders_mode == "onnx":
            import onnxruntime

            sess_options = onnxruntime.SessionOptions()
            sess_options.log_severity_level = 3
            embed_suffix, saved_cfg = ".onnx", None
            hubert_model = onnxruntime.InferenceSession(embedder_model_path, sess_options=sess_options, providers=providers)
        elif embedders_mode == "transformers":
            from torch import nn
            from transformers import HubertModel

            class HubertModelWithFinalProj(HubertModel):
                def __init__(self, config):
                    super().__init__(config)
                    self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)

            embed_suffix, saved_cfg = ".safetensors", None
            hubert_model = HubertModelWithFinalProj.from_pretrained(embedder_model_path)
        else: raise ValueError(translations["option_not_valid"])
    except Exception as e:
        raise RuntimeError(translations["read_model_error"].format(e=e))

    return hubert_model, saved_cfg, embed_suffix
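
# cut splits audio on silence with an RMS-threshold slicer derived from the preprocessing
# Slicer. Each chunk is returned as (waveform, start, end) with offsets in samples, so the
# processed chunks can later be stitched back together by restore below.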
def cut(audio, sr, db_thresh=-60, min_interval=250):
    from main.inference.preprocess import Slicer, get_rms

    class Slicer2(Slicer):
        def slice2(self, waveform):
            samples = waveform.mean(axis=0) if len(waveform.shape) > 1 else waveform
            if samples.shape[0] <= self.min_length: return [(waveform, 0, samples.shape[0])]

            rms_list = get_rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
            sil_tags = []
            silence_start, clip_start = None, 0

            for i, rms in enumerate(rms_list):
                if rms < self.threshold:
                    if silence_start is None: silence_start = i
                    continue

                if silence_start is None: continue

                is_leading_silence = silence_start == 0 and i > self.max_sil_kept
                need_slice_middle = (i - silence_start >= self.min_interval and i - clip_start >= self.min_length)

                if not is_leading_silence and not need_slice_middle:
                    silence_start = None
                    continue

                if i - silence_start <= self.max_sil_kept:
                    pos = rms_list[silence_start : i + 1].argmin() + silence_start
                    sil_tags.append((0, pos) if silence_start == 0 else (pos, pos))
                    clip_start = pos
                elif i - silence_start <= self.max_sil_kept * 2:
                    pos = rms_list[i - self.max_sil_kept : silence_start + self.max_sil_kept + 1].argmin()
                    pos += i - self.max_sil_kept
                    pos_r = (rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept)

                    if silence_start == 0:
                        sil_tags.append((0, pos_r))
                        clip_start = pos_r
                    else:
                        sil_tags.append((min((rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start), pos), max(pos_r, pos)))
                        clip_start = max(pos_r, pos)
                else:
                    pos_r = (rms_list[i - self.max_sil_kept : i + 1].argmin() + i - self.max_sil_kept)
                    sil_tags.append((0, pos_r) if silence_start == 0 else ((rms_list[silence_start : silence_start + self.max_sil_kept + 1].argmin() + silence_start), pos_r))
                    clip_start = pos_r

                silence_start = None

            total_frames = rms_list.shape[0]
            if (silence_start is not None and total_frames - silence_start >= self.min_interval): sil_tags.append((rms_list[silence_start : min(total_frames, silence_start + self.max_sil_kept) + 1].argmin() + silence_start, total_frames + 1))

            if not sil_tags: return [(waveform, 0, samples.shape[-1])]
            else:
                chunks = []
                if sil_tags[0][0] > 0: chunks.append((self._apply_slice(waveform, 0, sil_tags[0][0]), 0, sil_tags[0][0] * self.hop_size))

                for i in range(len(sil_tags) - 1):
                    chunks.append((self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]), sil_tags[i][1] * self.hop_size, sil_tags[i + 1][0] * self.hop_size))

                if sil_tags[-1][1] < total_frames: chunks.append((self._apply_slice(waveform, sil_tags[-1][1], total_frames), sil_tags[-1][1] * self.hop_size, samples.shape[-1]))

                return chunks

    slicer = Slicer2(sr=sr, threshold=db_thresh, min_interval=min_interval)
    return slicer.slice2(audio)
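
# restore reassembles processed segments into one signal of total_len samples, filling the
# gaps between segments (the silence removed by cut) with zeros.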
def restore(segments, total_len, dtype=np.float32):
    out = []
    last_end = 0

    for start, end, processed_seg in segments:
        if start > last_end: out.append(np.zeros(start - last_end, dtype=dtype))

        out.append(processed_seg)
        last_end = end

    if last_end < total_len: out.append(np.zeros(total_len - last_end, dtype=dtype))
    return np.concatenate(out, axis=-1)
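
# Illustrative round trip (sketch only: the file path and process_segment are placeholders,
# not part of this module):
#   audio = load_audio(logging.getLogger(__name__), "input.wav", sample_rate=16000)
#   segments = []
#   for chunk, start, end in cut(audio, sr=16000, db_thresh=-60, min_interval=250):
#       segments.append((start, end, process_segment(chunk)))
#   result = restore(segments, total_len=len(audio))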