melromyeah committed
Commit cbfbb5c · verified · 1 Parent(s): 9b0bb71

Update app.py

Files changed (1)
  1. app.py +387 -6
app.py CHANGED
@@ -1,8 +1,389 @@
- import os
-
- os.system("aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d . -o hubert_base.pt")
- os.system("aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d . -o rmvpe.pt")
- os.system("aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/sail-rvc/yoimiya-jp/resolve/main/model.pth -d ./weights -o yoimiya.pth")
- os.system("aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/sail-rvc/yoimiya-jp/resolve/main/model.index -d ./weights/index -o yoimiya.index")
-
- os.system("python infer.py")
+ # --- Imports ---
+ import os
+ import shutil
+ import traceback
+ import asyncio
+ import functools
+ import subprocess
+ from datetime import datetime
+
+ import gradio as gr
+ import torch
+ import numpy as np
+ import librosa
+ import soundfile as sf
+ import yt_dlp
+ import edge_tts
+ from fairseq import checkpoint_utils
+
+ # --- Local Module Imports ---
+ # Ensure these files are in your repository
+ from lib.infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from vc_infer_pipeline import VC
+ from config import Config
+
+ # --- Constants and Configuration ---
+ now_dir = os.getcwd()
+ config = Config()  # Sets device (CPU/GPU) and precision (half/full)
+
+ # Define file paths for pre-trained models and voice models.
+ # These files should be in your repository, not downloaded at runtime.
+ HUBERT_PATH = os.path.join(now_dir, "pretraineds", "hubert_base.pt")
+ RMVPE_PATH = os.path.join(now_dir, "pretraineds", "rmvpe.pt")
+ WEIGHT_ROOT = os.path.join(now_dir, "weights")
+ INDEX_ROOT = os.path.join(WEIGHT_ROOT, "index")
+
+ # Create necessary directories
+ os.makedirs(WEIGHT_ROOT, exist_ok=True)
+ os.makedirs(INDEX_ROOT, exist_ok=True)
+ os.makedirs(os.path.join(now_dir, "output"), exist_ok=True)  # For demucs output
+ os.makedirs(os.path.join(now_dir, "dl_audio"), exist_ok=True)  # For yt-dlp output
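+
+ # Fallback (illustrative sketch, not required when the files are committed):
+ # if the checkpoints are missing, e.g. on a fresh clone without the Git-LFS
+ # payloads, fetch them once at startup from the same repo that the removed
+ # aria2c commands pointed at.
+ from huggingface_hub import hf_hub_download
+ for _ckpt_path, _repo, _file in [
+     (HUBERT_PATH, "lj1995/VoiceConversionWebUI", "hubert_base.pt"),
+     (RMVPE_PATH, "lj1995/VoiceConversionWebUI", "rmvpe.pt"),
+ ]:
+     if not os.path.exists(_ckpt_path):
+         hf_hub_download(_repo, _file, local_dir=os.path.dirname(_ckpt_path))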
+
+ # Setup for temporary files
+ tmp_dir = os.path.join(now_dir, "TEMP")
+ shutil.rmtree(tmp_dir, ignore_errors=True)
+ os.makedirs(tmp_dir, exist_ok=True)
+ os.environ["TEMP"] = tmp_dir
+
+ # --- Model Loading (Cached for Performance) ---
+
+ # Note: Gradio has no `gr.cache_resource` (that is a Streamlit API);
+ # `functools.lru_cache` gives the same load-once behaviour here.
+ @functools.lru_cache(maxsize=1)
+ def load_hubert_model():
+     """Loads the Hubert model and caches it."""
+     print("Loading Hubert model...")
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task([HUBERT_PATH], suffix="")
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
+     hubert_model.eval()
+     print("Hubert model loaded.")
+     return hubert_model
+
+ hubert_model = load_hubert_model()
+
+ # --- Utility Functions ---
+
+ def get_models_and_indices():
+     """Scans the weights folders and returns lists of available models and indices."""
+     model_files = [f for f in os.listdir(WEIGHT_ROOT) if f.endswith(".pth")]
+     index_files = [
+         os.path.join(INDEX_ROOT, f)
+         for f in os.listdir(INDEX_ROOT)
+         if f.endswith(".index") and "trained" not in f
+     ]
+     return sorted(model_files), sorted(index_files)
+
+ def get_edge_tts_voices():
+     """Fetches the list of available voices for Edge-TTS."""
+     try:
+         tts_voice_list = asyncio.run(edge_tts.list_voices())
+         return [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
+     except Exception as e:
+         print(f"Error fetching TTS voices: {e}. Returning a default list.")
+         return ["en-US-AnaNeural-Female", "en-US-AriaNeural-Female", "en-GB-SoniaNeural-Female"]
+
+ # --- Core Inference Logic ---
+
+ def vc_single(
+     spk_id,  # speaker id from the UI slider (an int, not the model filename)
+     input_audio,
+     f0_up_key,
+     f0_method,
+     file_index,
+     index_rate,
+     filter_radius,
+     resample_sr,
+     rms_mix_rate,
+     protect,
+     f0_file,
+     loaded_model,  # Comes from gr.State
+ ):
+     """Main voice conversion function."""
+     if input_audio is None:
+         return "You need to upload an audio file.", None
+
+     if not loaded_model:
+         return "No model loaded. Please select a model from the dropdown and wait for it to load.", None
+
+     # Unpack the loaded model state
+     net_g = loaded_model["model"]
+     tgt_sr = loaded_model["tgt_sr"]
+     vc = loaded_model["vc"]
+     version = loaded_model["version"]
+     if_f0 = loaded_model["if_f0"]
+
+     try:
+         # gr.Audio(type="filepath") passes a path string; a (sample_rate, ndarray)
+         # tuple is also accepted for programmatic callers.
+         if isinstance(input_audio, str):
+             audio_data, _ = librosa.load(input_audio, sr=16000, mono=True)
+         else:
+             sampling_rate, audio_data = input_audio
+             # Normalize integer PCM to float32 in [-1, 1]; float input needs no scaling
+             if np.issubdtype(audio_data.dtype, np.integer):
+                 audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
+             else:
+                 audio_data = audio_data.astype(np.float32)
+             if audio_data.ndim > 1:
+                 audio_data = librosa.to_mono(audio_data.transpose(1, 0))
+             if sampling_rate != 16000:
+                 audio_data = librosa.resample(audio=audio_data, orig_sr=sampling_rate, target_sr=16000)
+
+         times = [0, 0, 0]  # for performance tracking
+
+         # Perform the pipeline conversion
+         audio_opt = vc.pipeline(
+             hubert_model, net_g, spk_id, audio_data, "dummy_path", times, int(f0_up_key),
+             f0_method, file_index, index_rate, if_f0, filter_radius, tgt_sr,
+             resample_sr, rms_mix_rate, version, protect, f0_file=f0_file
+         )
+
+         final_sr = resample_sr if resample_sr >= 16000 else tgt_sr
+         index_info = f"Using index: {os.path.basename(file_index)}" if file_index and os.path.exists(file_index) else "Index not used."
+         info = f"Success. {index_info}\nTime: npy:{times[0]:.2f}s, f0:{times[1]:.2f}s, infer:{times[2]:.2f}s"
+         print(info)
+         return info, (final_sr, audio_opt)
+
+     except Exception:
+         info = traceback.format_exc()
+         print(info)
+         return info, None
+
+
+ def load_selected_model(sid, protect_val):
+     """Loads a selected .pth model file and updates the UI accordingly."""
+     if not sid:
+         return None, gr.update(maximum=2333, visible=False), gr.update(visible=True), gr.update(value=""), gr.update(value="# <center> No model selected")
+
+     print(f"Loading model: {sid}")
+     try:
+         cpt = torch.load(os.path.join(WEIGHT_ROOT, sid), map_location="cpu")
+         tgt_sr = cpt["config"][-1]
+         cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+         if_f0 = cpt.get("f0", 1)
+         version = cpt.get("version", "v1")
+
+         # Determine the correct model class
+         if version == "v1":
+             synth_class = SynthesizerTrnMs256NSFsid if if_f0 == 1 else SynthesizerTrnMs256NSFsid_nono
+         elif version == "v2":
+             synth_class = SynthesizerTrnMs768NSFsid if if_f0 == 1 else SynthesizerTrnMs768NSFsid_nono
+         else:
+             raise ValueError(f"Unsupported RVC model version: {version}")
+
+         net_g = synth_class(*cpt["config"], is_half=config.is_half)
+         del net_g.enc_q
+         net_g.load_state_dict(cpt["weight"], strict=False)
+         net_g.eval().to(config.device)
+         net_g = net_g.half() if config.is_half else net_g.float()
+
+         vc = VC(tgt_sr, config)
+         n_spk = cpt["config"][-3]
+
+         # Prepare model state to be stored
+         loaded_model_state = {
+             "sid": sid, "model": net_g, "tgt_sr": tgt_sr, "vc": vc,
+             "version": version, "if_f0": if_f0, "n_spk": n_spk
+         }
+
+         # Find the corresponding index file
+         model_name_no_ext = os.path.splitext(sid)[0]
+         _, index_files = get_models_and_indices()
+         best_index = ""
+         for index_file in index_files:
+             if model_name_no_ext in os.path.basename(index_file):
+                 best_index = index_file
+                 break
+
+         # UI Updates
+         protect_update = gr.update(visible=(if_f0 != 0), value=protect_val)
+         spk_id_update = gr.update(maximum=n_spk - 1, visible=True)
+         model_info_update = gr.update(value=f'## <center> ✅ Loaded: {model_name_no_ext}\n### <center> RVC {version} Model')
+
+         print(f"Model {sid} loaded successfully.")
+         return loaded_model_state, spk_id_update, protect_update, gr.update(value=best_index), model_info_update
+
+     except Exception as e:
+         print(f"Error loading model: {e}")
+         return None, gr.update(visible=False), gr.update(visible=True), gr.update(value=""), gr.update(value=f"# <center> ⚠️ Error loading {sid}")
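+
+ # Illustrative end-to-end helper (sketch, not wired into the UI): load a model
+ # by filename and convert a single file. The argument defaults are placeholder
+ # names, not files shipped with this Space.
+ def convert_file_example(model_pth="my_voice.pth", wav_in="input.wav", wav_out="converted.wav"):
+     """Convert `wav_in` with the voice model `model_pth` and write `wav_out`."""
+     state = load_selected_model(model_pth, 0.33)[0]  # first return value is the model state
+     info, result = vc_single(0, wav_in, 0, "rmvpe", "", 0.7, 3, 0, 1.0, 0.33, None, state)
+     if result is not None:
+         sr, audio = result
+         sf.write(wav_out, audio, sr)  # soundfile expects (file, data, samplerate)
+     return info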
+
+ def run_tts(tts_text, tts_voice):
+     """Runs Edge-TTS and returns the audio file path."""
+     if not tts_text or not tts_voice:
+         raise gr.Error("TTS text and voice are required.")
+     output_file = os.path.join(tmp_dir, "tts_output.mp3")
+     voice_shortname = "-".join(tts_voice.split("-")[:-1])  # strip the "-Gender" suffix
+     try:
+         asyncio.run(edge_tts.Communicate(tts_text, voice_shortname).save(output_file))
+         return "TTS audio generated.", output_file
+     except Exception as e:
+         return f"TTS failed: {e}", None
+
+ def run_youtube_dl(url):
+     """Downloads audio from a YouTube URL."""
+     if not url:
+         raise gr.Error("URL is required.")
+     output_path = os.path.join(now_dir, "dl_audio", "audio.wav")
+     ydl_opts = {
+         "noplaylist": True,
+         "format": "bestaudio/best",
+         "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}],
+         # yt-dlp appends the extension itself, so the template has no ".wav"
+         "outtmpl": os.path.join(now_dir, "dl_audio", "audio"),
+         "quiet": True,
+     }
+     try:
+         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+             ydl.download([url])
+         return "Download complete.", output_path
+     except Exception as e:
+         return f"Download failed: {e}", None
+
+ def run_demucs(audio_path, model="htdemucs_ft"):
+     """Runs Demucs to separate vocals from an audio file."""
+     if not audio_path or not os.path.exists(audio_path):
+         raise gr.Error("Input audio for splitting not found.")
+
+     output_dir = os.path.join(now_dir, "output")
+     # Build the command as a list so paths containing spaces survive intact
+     command = ["demucs", "--two-stems=vocals", "-n", model, audio_path, "-o", output_dir]
+     print(f"Running command: {' '.join(command)}")
+
+     try:
+         subprocess.run(command, check=True, capture_output=True, text=True)
+
+         # Demucs writes to <output_dir>/<model>/<input-name>/{vocals,no_vocals}.wav
+         input_filename = os.path.splitext(os.path.basename(audio_path))[0]
+         vocal_path = os.path.join(output_dir, model, input_filename, "vocals.wav")
+         inst_path = os.path.join(output_dir, model, input_filename, "no_vocals.wav")
+
+         if os.path.exists(vocal_path):
+             return "Splitting complete.", vocal_path, inst_path
+         else:
+             return "Splitting failed: vocal file not found.", None, None
+     except subprocess.CalledProcessError as e:
+         error_message = f"Demucs failed: {e.stderr}"
+         print(error_message)
+         return error_message, None, None
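+
+ # Companion sketch: once the separated vocals have been converted, they can be
+ # laid back over the instrumental stem (the instrumental is resampled to the
+ # vocals' rate if needed). Path names are placeholders.
+ def mix_tracks(vocals_path, inst_path, out_path="mixed.wav"):
+     """Overlay a (converted) vocal track on its instrumental counterpart."""
+     vocals, sr = librosa.load(vocals_path, sr=None, mono=True)
+     inst, _ = librosa.load(inst_path, sr=sr, mono=True)
+     n = min(len(vocals), len(inst))
+     sf.write(out_path, vocals[:n] + inst[:n], sr)
+     return out_path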
+
+ def refresh_model_list_ui():
+     """Refreshes the UI dropdowns for models and indices."""
+     models, indices = get_models_and_indices()
+     return gr.update(choices=models), gr.update(choices=indices)
+
+ # --- Gradio UI Layout ---
+ initial_models, initial_indices = get_models_and_indices()
+ tts_voices = get_edge_tts_voices()
+
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="rose", secondary_hue="pink")) as demo:
+     gr.Markdown("# 🌺 Modernized RVC Voice Conversion 🌺")
+
+     # Stores the loaded model dictionary {sid, model, tgt_sr, ...}
+     loaded_model_state = gr.State(value=None)
+
+     with gr.Row():
+         sid = gr.Dropdown(label="1. Select Voice Model (.pth)", choices=initial_models)
+         refresh_button = gr.Button("🔄 Refresh", variant="secondary")
+
+     selected_model_info = gr.Markdown("# <center> No model selected", elem_id="model-info")
+
+     with gr.Tabs():
+         with gr.TabItem("🎙️ Main Inference"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     gr.Markdown("### Input Audio")
+                     input_audio_type = gr.Radio(
+                         ["Upload", "Microphone", "TTS", "YouTube"],
+                         value="Upload", label="Input Source"
+                     )
+
+                     # Upload/Mic
+                     audio_in = gr.Audio(label="Upload or Record Audio", type="filepath", sources=["upload", "microphone"], visible=True)
+
+                     # TTS
+                     tts_text_in = gr.Textbox(label="TTS Text", lines=3, visible=False)
+                     tts_voice_in = gr.Dropdown(label="TTS Voice", choices=tts_voices, value=tts_voices[0], visible=False)
+                     tts_gen_button = gr.Button("Generate TTS Audio", variant="primary", visible=False)
+
+                     # YouTube
+                     yt_url_in = gr.Textbox(label="YouTube URL", visible=False)
+                     yt_dl_button = gr.Button("Download from YouTube", variant="primary", visible=False)
+
+                     gr.Markdown("### (Optional) Vocal Separation")
+                     run_demucs_button = gr.Button("Separate Vocals from Input", variant="secondary")
+                     demucs_output_vocals = gr.Audio(label="Separated Vocals (for conversion)", type="filepath")
+                     demucs_output_inst = gr.Audio(label="Separated Instrumentals", type="filepath")
+                     demucs_status = gr.Textbox(label="Splitter Status", interactive=False)
+                     gr.Markdown("_Use the 'Separated Vocals' as input for the best results._")
+
+                 with gr.Column(scale=1):
+                     gr.Markdown("### Inference Settings")
+                     spk_item = gr.Slider(minimum=0, maximum=2333, step=1, label="Speaker ID", value=0, visible=False, interactive=True)
+                     vc_transform0 = gr.Number(label="Transpose (semitones)", value=0)
+                     f0method0 = gr.Radio(
+                         label="Pitch Extraction Algorithm",
+                         choices=["pm", "harvest", "crepe", "rmvpe"] if os.path.exists(RMVPE_PATH) else ["pm", "harvest", "crepe"],
+                         value="rmvpe" if os.path.exists(RMVPE_PATH) else "pm", interactive=True
+                     )
+                     file_index = gr.Dropdown(label="Feature Index File (.index)", choices=initial_indices, interactive=True)
+                     index_rate0 = gr.Slider(minimum=0, maximum=1, label="Feature Retrieval Ratio", value=0.7, interactive=True)
+                     filter_radius0 = gr.Slider(minimum=0, maximum=7, label="Median Filtering Radius (reduces breathiness)", value=3, step=1, interactive=True)
+                     resample_sr0 = gr.Slider(minimum=0, maximum=48000, label="Output Resampling (0 = keep model's rate)", value=0, step=1, interactive=True)
+                     rms_mix_rate0 = gr.Slider(minimum=0, maximum=1, label="Input/Output Volume Envelope Mix Ratio", value=1, interactive=True)
+                     protect0 = gr.Slider(minimum=0, maximum=0.5, label="Voice Protection (for breathiness)", value=0.33, step=0.01, interactive=True)
+                     f0_file0 = gr.File(label="Optional F0 Curve File (.txt)", file_count="single")
+
+                 with gr.Column(scale=1):
+                     gr.Markdown("### Output")
+                     convert_button = gr.Button("✨ Convert", variant="primary")
+                     vc_log = gr.Textbox(label="Output Information", interactive=False)
+                     vc_output = gr.Audio(label="Converted Audio", interactive=False)
+
+         with gr.TabItem("📚 Add New Models"):
+             gr.Markdown(
+                 """
+                 ## How to Add New Models
+                 The old 'Model Downloader' has been removed to make this Space faster and more reliable.
+                 Here is the modern way to add your own RVC models:
+
+                 1. **Go to the 'Files' tab** at the top of this Hugging Face Space.
+                 2. **Navigate to the `weights` folder.**
+                 3. Click **'Upload file'** to add your model files.
+                    - Your model `.pth` file goes directly into the `weights` folder.
+                    - Your index `.index` file goes into the `weights/index` folder.
+                 4. Once uploaded, come back to the 'Main Inference' tab and **click the '🔄 Refresh' button** next to the model dropdown. Your new model will appear!
+
+                 This process uses Git-LFS to handle large files correctly and ensures your models are always available without needing to be re-downloaded.
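+
+                 If you prefer to upload from a script, a minimal sketch using the
+                 `huggingface_hub` client looks like this (the file names and repo id
+                 are placeholders for your own):
+
+                 ```python
+                 from huggingface_hub import upload_file
+
+                 upload_file(
+                     path_or_fileobj="my_model.pth",       # local file to upload
+                     path_in_repo="weights/my_model.pth",  # destination inside the Space
+                     repo_id="your-username/your-space",   # placeholder Space id
+                     repo_type="space",
+                 )
+                 ```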
+                 """
+             )
+
+     # --- Event Listeners ---
+
+     # Load model when dropdown changes
+     sid.change(
+         load_selected_model,
+         inputs=[sid, protect0],
+         outputs=[loaded_model_state, spk_item, protect0, file_index, selected_model_info]
+     )
+
+     # Refresh button
+     refresh_button.click(refresh_model_list_ui, None, [sid, file_index])
+
+     # Main conversion
+     # Conversion always reads the 'Separated Vocals' component; audio coming
+     # from Upload/TTS/YouTube should first be run through the separator below.
+     convert_button.click(
+         vc_single,
+         [spk_item, demucs_output_vocals, vc_transform0, f0method0, file_index, index_rate0, filter_radius0, resample_sr0, rms_mix_rate0, protect0, f0_file0, loaded_model_state],
+         [vc_log, vc_output]
+     )
+
+     # Input type visibility
+     def update_input_visibility(choice):
+         return {
+             audio_in: gr.update(visible=choice in ["Upload", "Microphone"]),
+             tts_text_in: gr.update(visible=choice == "TTS"),
+             tts_voice_in: gr.update(visible=choice == "TTS"),
+             tts_gen_button: gr.update(visible=choice == "TTS"),
+             yt_url_in: gr.update(visible=choice == "YouTube"),
+             yt_dl_button: gr.update(visible=choice == "YouTube"),
+         }
+     input_audio_type.change(update_input_visibility, input_audio_type, [audio_in, tts_text_in, tts_voice_in, tts_gen_button, yt_url_in, yt_dl_button])
+
+     # Generators for input audio
+     tts_gen_button.click(run_tts, [tts_text_in, tts_voice_in], [demucs_status, audio_in])
+     yt_dl_button.click(run_youtube_dl, [yt_url_in], [demucs_status, audio_in])
+
+     # Vocal separator
+     run_demucs_button.click(run_demucs, [audio_in], [demucs_status, demucs_output_vocals, demucs_output_inst])
+
+
+ # Launch the app
+ demo.queue(max_size=20).launch(debug=True)  # Enable queue for handling traffic