melromyeah committed
Commit cbfbb5c · verified · 1 Parent(s): 9b0bb71

Update app.py

Files changed (1)
  1. app.py +387 -6
app.py CHANGED
@@ -1,8 +1,389 @@
- import os
-
- os.system("aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d . -o hubert_base.pt")
- os.system("aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d . -o rmvpe.pt")
- os.system("aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/sail-rvc/yoimiya-jp/resolve/main/model.pth -d ./weights -o yoimiya.pth")
- os.system("aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/sail-rvc/yoimiya-jp/resolve/main/model.index -d ./weights/index -o yoimiya.index")
-
- os.system("python infer.py")
+ # --- Imports ---
+ import os
+ import shutil
+ import traceback
+ import asyncio
+ import functools
+ import subprocess
+ from datetime import datetime
+
+ import gradio as gr
+ import torch
+ import numpy as np
+ import librosa
+ import soundfile as sf
+ import yt_dlp
+ import edge_tts
+ from fairseq import checkpoint_utils
+
+ # --- Local Module Imports ---
+ # Ensure these files are in your repository
+ from lib.infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+ from vc_infer_pipeline import VC
+ from config import Config
+
+ # --- Constants and Configuration ---
+ now_dir = os.getcwd()
+ config = Config()  # Sets device (CPU/GPU) and precision (half/full)
+
+ # Define file paths for pre-trained models and voice models.
+ # These files should be in your repository, not downloaded at runtime.
+ HUBERT_PATH = os.path.join(now_dir, "pretraineds", "hubert_base.pt")
+ RMVPE_PATH = os.path.join(now_dir, "pretraineds", "rmvpe.pt")
+ WEIGHT_ROOT = os.path.join(now_dir, "weights")
+ INDEX_ROOT = os.path.join(WEIGHT_ROOT, "index")
+
+ # Create necessary directories
+ os.makedirs(WEIGHT_ROOT, exist_ok=True)
+ os.makedirs(INDEX_ROOT, exist_ok=True)
+ os.makedirs(os.path.join(now_dir, "output"), exist_ok=True)  # For demucs output
+ os.makedirs(os.path.join(now_dir, "dl_audio"), exist_ok=True)  # For yt-dlp output
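+
+ # Fallback (illustrative sketch, not required when the files are committed):
+ # if the checkpoints are missing, e.g. on a fresh clone without the Git-LFS
+ # payloads, fetch them once at startup from the same repo that the removed
+ # aria2c commands pointed at.
+ from huggingface_hub import hf_hub_download
+ for _ckpt_path, _repo, _file in [
+     (HUBERT_PATH, "lj1995/VoiceConversionWebUI", "hubert_base.pt"),
+     (RMVPE_PATH, "lj1995/VoiceConversionWebUI", "rmvpe.pt"),
+ ]:
+     if not os.path.exists(_ckpt_path):
+         hf_hub_download(_repo, _file, local_dir=os.path.dirname(_ckpt_path))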
+
+ # Setup for temporary files
+ tmp_dir = os.path.join(now_dir, "TEMP")
+ shutil.rmtree(tmp_dir, ignore_errors=True)
+ os.makedirs(tmp_dir, exist_ok=True)
+ os.environ["TEMP"] = tmp_dir
+
+ # --- Model Loading (Cached for Performance) ---
+
+ # Note: Gradio has no `gr.cache_resource` (that is a Streamlit API);
+ # `functools.lru_cache` gives the same load-once behaviour here.
+ @functools.lru_cache(maxsize=1)
+ def load_hubert_model():
+     """Loads the Hubert model and caches it."""
+     print("Loading Hubert model...")
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task([HUBERT_PATH], suffix="")
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
+     hubert_model.eval()
+     print("Hubert model loaded.")
+     return hubert_model
+
+ hubert_model = load_hubert_model()
+
+ # --- Utility Functions ---
+
+ def get_models_and_indices():
+     """Scans the weights folders and returns lists of available models and indices."""
+     model_files = [f for f in os.listdir(WEIGHT_ROOT) if f.endswith(".pth")]
+     index_files = [
+         os.path.join(INDEX_ROOT, f)
+         for f in os.listdir(INDEX_ROOT)
+         if f.endswith(".index") and "trained" not in f
+     ]
+     return sorted(model_files), sorted(index_files)
+
+ def get_edge_tts_voices():
+     """Fetches the list of available voices for Edge-TTS."""
+     try:
+         tts_voice_list = asyncio.run(edge_tts.list_voices())
+         return [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
+     except Exception as e:
+         print(f"Error fetching TTS voices: {e}. Returning a default list.")
+         return ["en-US-AnaNeural-Female", "en-US-AriaNeural-Female", "en-GB-SoniaNeural-Female"]
+
+ # --- Core Inference Logic ---
+
+ def vc_single(
+     spk_id,  # speaker id from the UI slider (an int, not the model filename)
+     input_audio,
+     f0_up_key,
+     f0_method,
+     file_index,
+     index_rate,
+     filter_radius,
+     resample_sr,
+     rms_mix_rate,
+     protect,
+     f0_file,
+     loaded_model,  # Comes from gr.State
+ ):
+     """Main voice conversion function."""
+     if input_audio is None:
+         return "You need to upload an audio file.", None
+
+     if not loaded_model:
+         return "No model loaded. Please select a model from the dropdown and wait for it to load.", None
+
+     # Unpack the loaded model state
+     net_g = loaded_model["model"]
+     tgt_sr = loaded_model["tgt_sr"]
+     vc = loaded_model["vc"]
+     version = loaded_model["version"]
+     if_f0 = loaded_model["if_f0"]
+
+     try:
+         # gr.Audio(type="filepath") passes a path string; a (sample_rate, ndarray)
+         # tuple is also accepted for programmatic callers.
+         if isinstance(input_audio, str):
+             audio_data, _ = librosa.load(input_audio, sr=16000, mono=True)
+         else:
+             sampling_rate, audio_data = input_audio
+             # Normalize integer PCM to float32 in [-1, 1]; float input needs no scaling
+             if np.issubdtype(audio_data.dtype, np.integer):
+                 audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
+             else:
+                 audio_data = audio_data.astype(np.float32)
+             if audio_data.ndim > 1:
+                 audio_data = librosa.to_mono(audio_data.transpose(1, 0))
+             if sampling_rate != 16000:
+                 audio_data = librosa.resample(audio=audio_data, orig_sr=sampling_rate, target_sr=16000)
+
+         times = [0, 0, 0]  # for performance tracking
+
+         # Perform the pipeline conversion
+         audio_opt = vc.pipeline(
+             hubert_model, net_g, spk_id, audio_data, "dummy_path", times, int(f0_up_key),
+             f0_method, file_index, index_rate, if_f0, filter_radius, tgt_sr,
+             resample_sr, rms_mix_rate, version, protect, f0_file=f0_file
+         )
+
+         final_sr = resample_sr if resample_sr >= 16000 else tgt_sr
+         index_info = f"Using index: {os.path.basename(file_index)}" if file_index and os.path.exists(file_index) else "Index not used."
+         info = f"Success. {index_info}\nTime: npy:{times[0]:.2f}s, f0:{times[1]:.2f}s, infer:{times[2]:.2f}s"
+         print(info)
+         return info, (final_sr, audio_opt)
+
+     except Exception:
+         info = traceback.format_exc()
+         print(info)
+         return info, None
+
+
+ def load_selected_model(sid, protect_val):
+     """Loads a selected .pth model file and updates the UI accordingly."""
+     if not sid:
+         return None, gr.update(maximum=2333, visible=False), gr.update(visible=True), gr.update(value=""), gr.update(value="# <center> No model selected")
+
+     print(f"Loading model: {sid}")
+     try:
+         cpt = torch.load(os.path.join(WEIGHT_ROOT, sid), map_location="cpu")
+         tgt_sr = cpt["config"][-1]
+         cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
+         if_f0 = cpt.get("f0", 1)
+         version = cpt.get("version", "v1")
+
+         # Determine the correct model class
+         if version == "v1":
+             synth_class = SynthesizerTrnMs256NSFsid if if_f0 == 1 else SynthesizerTrnMs256NSFsid_nono
+         elif version == "v2":
+             synth_class = SynthesizerTrnMs768NSFsid if if_f0 == 1 else SynthesizerTrnMs768NSFsid_nono
+         else:
+             raise ValueError(f"Unsupported RVC model version: {version}")
+
+         net_g = synth_class(*cpt["config"], is_half=config.is_half)
+         del net_g.enc_q
+         net_g.load_state_dict(cpt["weight"], strict=False)
+         net_g.eval().to(config.device)
+         net_g = net_g.half() if config.is_half else net_g.float()
+
+         vc = VC(tgt_sr, config)
+         n_spk = cpt["config"][-3]
+
+         # Prepare model state to be stored
+         loaded_model_state = {
+             "sid": sid, "model": net_g, "tgt_sr": tgt_sr, "vc": vc,
+             "version": version, "if_f0": if_f0, "n_spk": n_spk
+         }
+
+         # Find the corresponding index file
+         model_name_no_ext = os.path.splitext(sid)[0]
+         _, index_files = get_models_and_indices()
+         best_index = ""
+         for index_file in index_files:
+             if model_name_no_ext in os.path.basename(index_file):
+                 best_index = index_file
+                 break
+
+         # UI Updates
+         protect_update = gr.update(visible=(if_f0 != 0), value=protect_val)
+         spk_id_update = gr.update(maximum=n_spk - 1, visible=True)
+         model_info_update = gr.update(value=f'## <center> ✅ Loaded: {model_name_no_ext}\n### <center> RVC {version} Model')
+
+         print(f"Model {sid} loaded successfully.")
+         return loaded_model_state, spk_id_update, protect_update, gr.update(value=best_index), model_info_update
+
+     except Exception as e:
+         print(f"Error loading model: {e}")
+         return None, gr.update(visible=False), gr.update(visible=True), gr.update(value=""), gr.update(value=f"# <center> ⚠️ Error loading {sid}")
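+
+ # Illustrative end-to-end helper (sketch, not wired into the UI): load a model
+ # by filename and convert a single file. The argument defaults are placeholder
+ # names, not files shipped with this Space.
+ def convert_file_example(model_pth="my_voice.pth", wav_in="input.wav", wav_out="converted.wav"):
+     """Convert `wav_in` with the voice model `model_pth` and write `wav_out`."""
+     state = load_selected_model(model_pth, 0.33)[0]  # first return value is the model state
+     info, result = vc_single(0, wav_in, 0, "rmvpe", "", 0.7, 3, 0, 1.0, 0.33, None, state)
+     if result is not None:
+         sr, audio = result
+         sf.write(wav_out, audio, sr)  # soundfile expects (file, data, samplerate)
+     return info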
+
+ def run_tts(tts_text, tts_voice):
+     """Runs Edge-TTS and returns the audio file path."""
+     if not tts_text or not tts_voice:
+         raise gr.Error("TTS text and voice are required.")
+     output_file = os.path.join(tmp_dir, "tts_output.mp3")
+     voice_shortname = "-".join(tts_voice.split("-")[:-1])  # strip the "-Gender" suffix
+     try:
+         asyncio.run(edge_tts.Communicate(tts_text, voice_shortname).save(output_file))
+         return "TTS audio generated.", output_file
+     except Exception as e:
+         return f"TTS failed: {e}", None
+
+ def run_youtube_dl(url):
+     """Downloads audio from a YouTube URL."""
+     if not url:
+         raise gr.Error("URL is required.")
+     output_path = os.path.join(now_dir, "dl_audio", "audio.wav")
+     ydl_opts = {
+         "noplaylist": True,
+         "format": "bestaudio/best",
+         "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "wav"}],
+         # yt-dlp appends the extension itself, so the template has no ".wav"
+         "outtmpl": os.path.join(now_dir, "dl_audio", "audio"),
+         "quiet": True,
+     }
+     try:
+         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+             ydl.download([url])
+         return "Download complete.", output_path
+     except Exception as e:
+         return f"Download failed: {e}", None
+
+ def run_demucs(audio_path, model="htdemucs_ft"):
+     """Runs Demucs to separate vocals from an audio file."""
+     if not audio_path or not os.path.exists(audio_path):
+         raise gr.Error("Input audio for splitting not found.")
+
+     output_dir = os.path.join(now_dir, "output")
+     # Build the command as a list so paths containing spaces survive intact
+     command = ["demucs", "--two-stems=vocals", "-n", model, audio_path, "-o", output_dir]
+     print(f"Running command: {' '.join(command)}")
+
+     try:
+         subprocess.run(command, check=True, capture_output=True, text=True)
+
+         # Demucs writes to <output_dir>/<model>/<input-name>/{vocals,no_vocals}.wav
+         input_filename = os.path.splitext(os.path.basename(audio_path))[0]
+         vocal_path = os.path.join(output_dir, model, input_filename, "vocals.wav")
+         inst_path = os.path.join(output_dir, model, input_filename, "no_vocals.wav")
+
+         if os.path.exists(vocal_path):
+             return "Splitting complete.", vocal_path, inst_path
+         else:
+             return "Splitting failed: vocal file not found.", None, None
+     except subprocess.CalledProcessError as e:
+         error_message = f"Demucs failed: {e.stderr}"
+         print(error_message)
+         return error_message, None, None
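+
+ # Companion sketch: once the separated vocals have been converted, they can be
+ # laid back over the instrumental stem (the instrumental is resampled to the
+ # vocals' rate if needed). Path names are placeholders.
+ def mix_tracks(vocals_path, inst_path, out_path="mixed.wav"):
+     """Overlay a (converted) vocal track on its instrumental counterpart."""
+     vocals, sr = librosa.load(vocals_path, sr=None, mono=True)
+     inst, _ = librosa.load(inst_path, sr=sr, mono=True)
+     n = min(len(vocals), len(inst))
+     sf.write(out_path, vocals[:n] + inst[:n], sr)
+     return out_path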
+
+ def refresh_model_list_ui():
+     """Refreshes the UI dropdowns for models and indices."""
+     models, indices = get_models_and_indices()
+     return gr.update(choices=models), gr.update(choices=indices)
+
+ # --- Gradio UI Layout ---
+ initial_models, initial_indices = get_models_and_indices()
+ tts_voices = get_edge_tts_voices()
+
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue="rose", secondary_hue="pink")) as demo:
+     gr.Markdown("# 🌺 Modernized RVC Voice Conversion 🌺")
+
+     # Stores the loaded model dictionary {sid, model, tgt_sr, ...}
+     loaded_model_state = gr.State(value=None)
+
+     with gr.Row():
+         sid = gr.Dropdown(label="1. Select Voice Model (.pth)", choices=initial_models)
+         refresh_button = gr.Button("🔄 Refresh", variant="secondary")
+
+     selected_model_info = gr.Markdown("# <center> No model selected", elem_id="model-info")
+
+     with gr.Tabs():
+         with gr.TabItem("🎙️ Main Inference"):
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     gr.Markdown("### Input Audio")
+                     input_audio_type = gr.Radio(
+                         ["Upload", "Microphone", "TTS", "YouTube"],
+                         value="Upload", label="Input Source"
+                     )
+
+                     # Upload/Mic
+                     audio_in = gr.Audio(label="Upload or Record Audio", type="filepath", sources=["upload", "microphone"], visible=True)
+
+                     # TTS
+                     tts_text_in = gr.Textbox(label="TTS Text", lines=3, visible=False)
+                     tts_voice_in = gr.Dropdown(label="TTS Voice", choices=tts_voices, value=tts_voices[0], visible=False)
+                     tts_gen_button = gr.Button("Generate TTS Audio", variant="primary", visible=False)
+
+                     # YouTube
+                     yt_url_in = gr.Textbox(label="YouTube URL", visible=False)
+                     yt_dl_button = gr.Button("Download from YouTube", variant="primary", visible=False)
+
+                     gr.Markdown("### (Optional) Vocal Separation")
+                     run_demucs_button = gr.Button("Separate Vocals from Input", variant="secondary")
+                     demucs_output_vocals = gr.Audio(label="Separated Vocals (for conversion)", type="filepath")
+                     demucs_output_inst = gr.Audio(label="Separated Instrumentals", type="filepath")
+                     demucs_status = gr.Textbox(label="Splitter Status", interactive=False)
+                     gr.Markdown("_Use the 'Separated Vocals' as input for the best results._")
+
+                 with gr.Column(scale=1):
+                     gr.Markdown("### Inference Settings")
+                     spk_item = gr.Slider(minimum=0, maximum=2333, step=1, label="Speaker ID", value=0, visible=False, interactive=True)
+                     vc_transform0 = gr.Number(label="Transpose (semitones)", value=0)
+                     f0method0 = gr.Radio(
+                         label="Pitch Extraction Algorithm",
+                         choices=["pm", "harvest", "crepe", "rmvpe"] if os.path.exists(RMVPE_PATH) else ["pm", "harvest", "crepe"],
+                         value="rmvpe" if os.path.exists(RMVPE_PATH) else "pm", interactive=True
+                     )
+                     file_index = gr.Dropdown(label="Feature Index File (.index)", choices=initial_indices, interactive=True)
+                     index_rate0 = gr.Slider(minimum=0, maximum=1, label="Feature Retrieval Ratio", value=0.7, interactive=True)
+                     filter_radius0 = gr.Slider(minimum=0, maximum=7, label="Median Filtering Radius (reduces breathiness)", value=3, step=1, interactive=True)
+                     resample_sr0 = gr.Slider(minimum=0, maximum=48000, label="Output Resampling (0 = keep model's rate)", value=0, step=1, interactive=True)
+                     rms_mix_rate0 = gr.Slider(minimum=0, maximum=1, label="Input/Output Volume Envelope Mix Ratio", value=1, interactive=True)
+                     protect0 = gr.Slider(minimum=0, maximum=0.5, label="Voice Protection (for breathiness)", value=0.33, step=0.01, interactive=True)
+                     f0_file0 = gr.File(label="Optional F0 Curve File (.txt)", file_count="single")
+
+                 with gr.Column(scale=1):
+                     gr.Markdown("### Output")
+                     convert_button = gr.Button("✨ Convert", variant="primary")
+                     vc_log = gr.Textbox(label="Output Information", interactive=False)
+                     vc_output = gr.Audio(label="Converted Audio", interactive=False)
+
+         with gr.TabItem("📚 Add New Models"):
+             gr.Markdown(
+                 """
+                 ## How to Add New Models
+                 The old 'Model Downloader' has been removed to make this Space faster and more reliable.
+                 Here is the modern way to add your own RVC models:
+
+                 1. **Go to the 'Files' tab** at the top of this Hugging Face Space.
+                 2. **Navigate to the `weights` folder.**
+                 3. Click **'Upload file'** to add your model files.
+                    - Your model `.pth` file goes directly into the `weights` folder.
+                    - Your index `.index` file goes into the `weights/index` folder.
+                 4. Once uploaded, come back to the 'Main Inference' tab and **click the '🔄 Refresh' button** next to the model dropdown. Your new model will appear!
+
+                 This process uses Git-LFS to handle large files correctly and ensures your models are always available without needing to be re-downloaded.
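+
+                 If you prefer to upload from a script, a minimal sketch using the
+                 `huggingface_hub` client looks like this (the file names and repo id
+                 are placeholders for your own):
+
+                 ```python
+                 from huggingface_hub import upload_file
+
+                 upload_file(
+                     path_or_fileobj="my_model.pth",       # local file to upload
+                     path_in_repo="weights/my_model.pth",  # destination inside the Space
+                     repo_id="your-username/your-space",   # placeholder Space id
+                     repo_type="space",
+                 )
+                 ```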
+                 """
+             )
+
+     # --- Event Listeners ---
+
+     # Load model when dropdown changes
+     sid.change(
+         load_selected_model,
+         inputs=[sid, protect0],
+         outputs=[loaded_model_state, spk_item, protect0, file_index, selected_model_info]
+     )
+
+     # Refresh button
+     refresh_button.click(refresh_model_list_ui, None, [sid, file_index])
+
+     # Main conversion
+     # Conversion always reads the 'Separated Vocals' component; audio coming
+     # from Upload/TTS/YouTube should first be run through the separator below.
+     convert_button.click(
+         vc_single,
+         [spk_item, demucs_output_vocals, vc_transform0, f0method0, file_index, index_rate0, filter_radius0, resample_sr0, rms_mix_rate0, protect0, f0_file0, loaded_model_state],
+         [vc_log, vc_output]
+     )
+
+     # Input type visibility
+     def update_input_visibility(choice):
+         return {
+             audio_in: gr.update(visible=choice in ["Upload", "Microphone"]),
+             tts_text_in: gr.update(visible=choice == "TTS"),
+             tts_voice_in: gr.update(visible=choice == "TTS"),
+             tts_gen_button: gr.update(visible=choice == "TTS"),
+             yt_url_in: gr.update(visible=choice == "YouTube"),
+             yt_dl_button: gr.update(visible=choice == "YouTube"),
+         }
+     input_audio_type.change(update_input_visibility, input_audio_type, [audio_in, tts_text_in, tts_voice_in, tts_gen_button, yt_url_in, yt_dl_button])
+
+     # Generators for input audio
+     tts_gen_button.click(run_tts, [tts_text_in, tts_voice_in], [demucs_status, audio_in])
+     yt_dl_button.click(run_youtube_dl, [yt_url_in], [demucs_status, audio_in])
+
+     # Vocal separator
+     run_demucs_button.click(run_demucs, [audio_in], [demucs_status, demucs_output_vocals, demucs_output_inst])
+
+
+ # Launch the app
+ demo.queue(max_size=20).launch(debug=True)  # Enable queue for handling traffic