"""Gradio demo that serves a so-vits-svc-fork voice-conversion model.

At import time the script downloads a generator checkpoint and its
``config.json`` from the Hugging Face Hub repo named by ``repo_id``, loads
them into an ``Svc`` model, and builds a two-tab Gradio UI (microphone input
and file upload) around :func:`predict`.
"""

import json
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import torch
from huggingface_hub import hf_hub_download, list_repo_files

from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc


def _latest_generator_step(filenames):
    """Return the highest ``<step>`` among file names shaped ``G_<step>.pth``.

    Names that do not start with ``G_`` / end with ``.pth``, or whose step
    part is not an integer (e.g. ``G_latest.pth``), are ignored instead of
    aborting the whole lookup.

    Raises:
        FileNotFoundError: if no usable generator checkpoint name is found.
    """
    steps = []
    for name in filenames:
        if not (name.startswith("G_") and name.endswith(".pth")):
            continue
        try:
            steps.append(int(Path(name).stem.split("_")[1]))
        except ValueError:
            # Non-numeric suffix: not a numbered checkpoint, skip it.
            continue
    if not steps:
        raise FileNotFoundError(
            "No generator checkpoint named 'G_<step>.pth' was found; "
            "set `ckpt_name` explicitly."
        )
    return max(steps)


##########################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME
##########################################################
repo_id = "dog/arianagrande"
ckpt_name = None  # None will pick latest
##########################################################

# Figure out the latest generator by taking highest value one.
# Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
if ckpt_name is None:
    latest_id = _latest_generator_step(list_repo_files(repo_id))
    ckpt_name = f"G_{latest_id}.pth"

generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")
hparams = HParams(**json.loads(Path(config_path).read_text()))
speakers = list(hparams.spk.keys())
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(
    net_g_path=generator_path,
    config_path=config_path,
    device=device,
    cluster_model_path=None,
)


def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    """Run voice conversion on an audio file and return the result.

    The file is loaded with ``librosa`` at ``model.target_sample`` Hz, cast
    to ``float32`` and handed to ``model.infer_silence``.

    Args:
        speaker: Target speaker; the UI offers the keys of ``hparams.spk``.
        audio: Path of the source audio file (the UI components use
            ``type="filepath"``). May be ``None`` when the form is submitted
            without a clip.
        transpose: Pitch shift; the UI labels it in semitones.
        auto_predict_f0, cluster_infer_ratio, noise_scale, f0_method,
        db_thresh, pad_seconds, chunk_seconds, absolute_thresh: Forwarded
            unchanged as keyword arguments to ``model.infer_silence``.

    Returns:
        A ``(sample_rate, audio)`` pair, where ``sample_rate`` is
        ``model.target_sample`` and ``audio`` is whatever
        ``model.infer_silence`` returned.

    Raises:
        gr.Error: if ``audio`` is ``None``.
    """
    if audio is None:
        # Surface a readable message in the UI instead of an opaque
        # failure from inside librosa.load(None).
        raise gr.Error("No source audio provided. Record or upload a clip first.")

    audio, _ = librosa.load(audio, sr=model.target_sample)
    audio = model.infer_silence(
        audio.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    return model.target_sample, audio


# Built by concatenation so the text (including the line break after
# "trained with ") stays exactly as before without relying on trailing
# whitespace inside a triple-quoted literal.
description = (
    "This app uses models trained with \n"
    "so-vits-svc-fork to clone your voice. "
    f"Model currently being used is https://hf.co/{repo_id}. "
    "To change the model being served, duplicate the space and update the "
    "`repo_id` in `app.py`."
)

# NOTE(review): this text reads like a link whose markup may have been lost;
# confirm against the deployed app before changing it.
article = """

Github Repo

""".strip()


def _build_interface(source):
    """Build one ``gr.Interface`` around :func:`predict`.

    Both tabs share identical controls and differ only in where the source
    audio comes from, so the component list lives in a single place.

    Args:
        source: Value for ``gr.Audio(source=...)``; this file uses
            ``"microphone"`` and ``"upload"``.
    """
    return gr.Interface(
        predict,
        inputs=[
            gr.Dropdown(speakers, value=speakers[0], label="Target Speaker"),
            gr.Audio(type="filepath", source=source, label="Source Audio"),
            gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
            gr.Checkbox(False, label="Auto Predict F0"),
            gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="cluster infer ratio"),
            gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
            gr.Dropdown(
                choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
                value="crepe",
                label="f0 method",
            ),
        ],
        outputs="audio",
        title="Voice Cloning",
        description=description,
        article=article,
    )


interface_mic = _build_interface("microphone")
interface_file = _build_interface("upload")
interface = gr.TabbedInterface(
    [interface_mic, interface_file],
    ["Clone From Mic", "Clone From File"],
)


if __name__ == "__main__":
    interface.launch()