melromyeah committed
Commit 1ec8bdb · verified · 1 Parent(s): 1b7ac6c

Update app.py

Files changed (1):
  app.py  +62 -249
app.py CHANGED
@@ -16,7 +16,6 @@ import edge_tts
 from fairseq import checkpoint_utils
 
 # --- Local Module Imports ---
-# Ensure these files are in your repository
 from lib.infer_pack.models import (
     SynthesizerTrnMs256NSFsid,
     SynthesizerTrnMs256NSFsid_nono,
@@ -28,27 +27,28 @@ from config import Config
 
 # --- Constants and Configuration ---
 now_dir = os.getcwd()
-config = Config()  # Sets device (CPU/GPU) and precision (half/full)
+config = Config()
 
-# Define file paths for pre-trained models and voice models
-# These files should be in your repository, not downloaded at runtime.
+# Define paths for read-only models from the repository
 HUBERT_PATH = os.path.join(now_dir, "pretraineds", "hubert_base.pt")
 RMVPE_PATH = os.path.join(now_dir, "pretraineds", "rmvpe.pt")
 WEIGHT_ROOT = os.path.join(now_dir, "weights")
 INDEX_ROOT = os.path.join(WEIGHT_ROOT, "index")
 
-# Create necessary directories
-os.makedirs(os.path.join(now_dir, "output"), exist_ok=True)  # For demucs output
-os.makedirs(os.path.join(now_dir, "dl_audio"), exist_ok=True)  # For youtube-dl output
+# The /app directory is read-only in the Docker environment.
+main_tmp_dir = "/tmp/rvc_app"
+output_dir = os.path.join(main_tmp_dir, "output")  # For demucs output
+dl_audio_dir = os.path.join(main_tmp_dir, "dl_audio")  # For youtube-dl output
+tts_audio_dir = os.path.join(main_tmp_dir, "tts")  # For tts output
 
-# Setup for temporary files
-tmp_dir = os.path.join(now_dir, "TEMP")
-shutil.rmtree(tmp_dir, ignore_errors=True)
-os.makedirs(tmp_dir, exist_ok=True)
-os.environ["TEMP"] = tmp_dir
+# Create all necessary temporary directories at startup
+shutil.rmtree(main_tmp_dir, ignore_errors=True)
+os.makedirs(output_dir, exist_ok=True)
+os.makedirs(dl_audio_dir, exist_ok=True)
+os.makedirs(tts_audio_dir, exist_ok=True)
+os.environ["TEMP"] = main_tmp_dir  # Set for any underlying libraries
 
 # --- Model Loading (Cached for Performance) ---
-
 @gr.cache_resource
 def load_hubert_model():
     """Loads the Hubert model and caches it."""
@@ -64,7 +64,6 @@ def load_hubert_model():
 hubert_model = load_hubert_model()
 
 # --- Utility Functions ---
-
 def get_models_and_indices():
     """Scans the weights folders and returns lists of available models and indices."""
     model_files = [f for f in os.listdir(WEIGHT_ROOT) if f.endswith(".pth")]
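A caveat on the (unchanged) `@gr.cache_resource` decorator above: Gradio does not document a `cache_resource` helper (the similarly named API is Streamlit's `st.cache_resource`), so whether that attribute exists depends on the installed Gradio version. If it is missing, `functools.lru_cache` gives the same load-once behavior. A sketch under that assumption; the body follows the usual RVC Hubert-loading pattern via fairseq rather than the elided body in this file:

```python
from functools import lru_cache

@lru_cache(maxsize=1)  # first call loads the model; later calls return the cached instance
def load_hubert_model():
    models, _, _ = checkpoint_utils.load_model_ensemble_and_task([HUBERT_PATH], suffix="")
    model = models[0].to(config.device)
    model = model.half() if config.is_half else model.float()
    return model.eval()
```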
@@ -80,179 +79,78 @@ def get_edge_tts_voices():
         print(f"Error fetching TTS voices: {e}. Returning a default list.")
         return ["en-US-AnaNeural-Female", "en-US-AriaNeural-Female", "en-GB-SoniaNeural-Female"]
 
-# --- Core Inference Logic ---
-
-def vc_single(
-    sid,
-    input_audio_tuple,
-    f0_up_key,
-    f0_method,
-    file_index,
-    index_rate,
-    filter_radius,
-    resample_sr,
-    rms_mix_rate,
-    protect,
-    f0_file,
-    loaded_model  # Comes from gr.State
-):
-    """Main voice conversion function."""
-    if not input_audio_tuple:
-        return "You need to upload an audio file.", None
-
-    if not loaded_model or loaded_model["sid"] != sid:
-        return "Model not loaded or selected model mismatch. Please select a model from the dropdown and wait for it to load.", None
-
-    # Unpack the loaded model state
-    net_g = loaded_model["model"]
-    tgt_sr = loaded_model["tgt_sr"]
-    vc = loaded_model["vc"]
-    version = loaded_model["version"]
-    if_f0 = loaded_model["if_f0"]
-
+# --- Core Logic (Updated with correct paths) ---
+def vc_single(sid, input_audio_tuple, f0_up_key, f0_method, file_index, index_rate, filter_radius, resample_sr, rms_mix_rate, protect, f0_file, loaded_model):
+    if not input_audio_tuple: return "You need to upload an audio file.", None
+    if not loaded_model or loaded_model["sid"] != sid: return "Model not loaded. Please select a model from the dropdown.", None
+    net_g, tgt_sr, vc, version, if_f0 = loaded_model["model"], loaded_model["tgt_sr"], loaded_model["vc"], loaded_model["version"], loaded_model["if_f0"]
     try:
         sampling_rate, audio_data = input_audio_tuple
-        audio_data = (audio_data / np.iinfo(audio_data.dtype).max).astype(np.float32)  # Normalize audio
-        if len(audio_data.shape) > 1:
-            audio_data = librosa.to_mono(audio_data.transpose(1, 0))
-        if sampling_rate != 16000:
-            audio_data = librosa.resample(audio=audio_data, orig_sr=sampling_rate, target_sr=16000)
-
-        times = [0, 0, 0]  # for performance tracking
-
-        # Perform the pipeline conversion
-        audio_opt = vc.pipeline(
-            hubert_model, net_g, sid, audio_data, "dummy_path", times, int(f0_up_key),
-            f0_method, file_index, index_rate, if_f0, filter_radius, tgt_sr,
-            resample_sr, rms_mix_rate, version, protect, f0_file=f0_file
-        )
-
+        audio_data = (audio_data / np.iinfo(audio_data.dtype).max).astype(np.float32)
+        if len(audio_data.shape) > 1: audio_data = librosa.to_mono(audio_data.transpose(1, 0))
+        if sampling_rate != 16000: audio_data = librosa.resample(audio=audio_data, orig_sr=sampling_rate, target_sr=16000)
+        times = [0, 0, 0]
+        audio_opt = vc.pipeline(hubert_model, net_g, sid, audio_data, "dummy_path", times, int(f0_up_key), f0_method, file_index, index_rate, if_f0, filter_radius, tgt_sr, resample_sr, rms_mix_rate, version, protect, f0_file=f0_file)
         final_sr = resample_sr if resample_sr >= 16000 else tgt_sr
         index_info = f"Using index: {os.path.basename(file_index)}" if file_index and os.path.exists(file_index) else "Index not used."
         info = f"Success. {index_info}\nTime: npy:{times[0]:.2f}s, f0:{times[1]:.2f}s, infer:{times[2]:.2f}s"
-        print(info)
         return info, (final_sr, audio_opt)
-
-    except Exception as e:
-        info = traceback.format_exc()
-        print(info)
-        return info, None
-
+    except Exception: return traceback.format_exc(), None
 
 def load_selected_model(sid, protect_val):
-    """Loads a selected .pth model file and updates the UI accordingly."""
-    if not sid:
-        return None, gr.update(maximum=2333, visible=False), gr.update(visible=True), gr.update(value=""), gr.update(value="# <center> No model selected")
-
+    if not sid: return None, gr.update(maximum=2333, visible=False), gr.update(visible=True), gr.update(value=""), gr.update(value="# <center> No model selected")
     print(f"Loading model: {sid}")
     try:
         cpt = torch.load(os.path.join(WEIGHT_ROOT, sid), map_location="cpu")
-        tgt_sr = cpt["config"][-1]
-        cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
-        if_f0 = cpt.get("f0", 1)
-        version = cpt.get("version", "v1")
-
-        # Determine the correct model class
-        synth_class = None
-        if version == "v1":
-            synth_class = SynthesizerTrnMs256NSFsid if if_f0 == 1 else SynthesizerTrnMs256NSFsid_nono
-        elif version == "v2":
-            synth_class = SynthesizerTrnMs768NSFsid if if_f0 == 1 else SynthesizerTrnMs768NSFsid_nono
-
+        tgt_sr, n_spk = cpt["config"][-1], cpt["weight"]["emb_g.weight"].shape[0]
+        cpt["config"][-3] = n_spk
+        if_f0, version = cpt.get("f0", 1), cpt.get("version", "v1")
+        synth_class = {"v1": {1: SynthesizerTrnMs256NSFsid, 0: SynthesizerTrnMs256NSFsid_nono}, "v2": {1: SynthesizerTrnMs768NSFsid, 0: SynthesizerTrnMs768NSFsid_nono}}[version][if_f0]
         net_g = synth_class(*cpt["config"], is_half=config.is_half)
         del net_g.enc_q
         net_g.load_state_dict(cpt["weight"], strict=False)
        net_g.eval().to(config.device)
         net_g = net_g.half() if config.is_half else net_g.float()
-
         vc = VC(tgt_sr, config)
-        n_spk = cpt["config"][-3]
-
-        # Prepare model state to be stored
-        loaded_model_state = {
-            "sid": sid, "model": net_g, "tgt_sr": tgt_sr, "vc": vc,
-            "version": version, "if_f0": if_f0, "n_spk": n_spk
-        }
-
-        # Find the corresponding index file
-        model_name_no_ext = os.path.splitext(sid)[0]
-        _, index_files = get_models_and_indices()
-        best_index = ""
-        for index_file in index_files:
-            if model_name_no_ext in os.path.basename(index_file):
-                best_index = index_file
-                break
-
-        # UI Updates
-        protect_update = gr.update(visible=(if_f0 != 0), value=protect_val)
-        spk_id_update = gr.update(maximum=n_spk - 1, visible=True)
-        model_info_update = gr.update(value=f'## <center> ✅ Loaded: {model_name_no_ext}\n### <center> RVC {version} Model')
-
-        print(f"Model {sid} loaded successfully.")
-        return loaded_model_state, spk_id_update, protect_update, gr.update(value=best_index), model_info_update
-
-    except Exception as e:
-        print(f"Error loading model: {e}")
-        return None, gr.update(visible=False), gr.update(visible=True), gr.update(value=""), gr.update(value=f"# <center> ⚠️ Error loading {sid}")
+        loaded_model_state = {"sid": sid, "model": net_g, "tgt_sr": tgt_sr, "vc": vc, "version": version, "if_f0": if_f0, "n_spk": n_spk}
+        model_name_no_ext, (_, index_files) = os.path.splitext(sid)[0], get_models_and_indices()
+        best_index = next((index_file for index_file in index_files if model_name_no_ext in os.path.basename(index_file)), "")
+        return loaded_model_state, gr.update(maximum=n_spk - 1, visible=True), gr.update(visible=(if_f0 != 0), value=protect_val), gr.update(value=best_index), gr.update(value=f'## <center> ✅ Loaded: {model_name_no_ext}\n### <center> RVC {version} Model')
+    except Exception: return None, gr.update(visible=False), gr.update(visible=True), gr.update(value=""), gr.update(value=f"# <center> ⚠️ Error loading {sid}")
 
 def run_tts(tts_text, tts_voice):
-    """Runs Edge-TTS and returns the audio file path."""
-    if not tts_text or not tts_voice:
-        raise gr.Error("TTS text and voice are required.")
-    output_file = os.path.join(tmp_dir, "tts_output.mp3")
-    voice_shortname = "-".join(tts_voice.split('-')[:-1])
+    if not tts_text or not tts_voice: raise gr.Error("TTS text and voice are required.")
+    output_file = os.path.join(tts_audio_dir, "tts_output.mp3")
     try:
-        asyncio.run(edge_tts.Communicate(tts_text, voice_shortname).save(output_file))
+        asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(output_file))
         return "TTS audio generated.", output_file
-    except Exception as e:
-        return f"TTS failed: {e}", None
+    except Exception as e: return f"TTS failed: {e}", None
 
 def run_youtube_dl(url):
-    """Downloads audio from a YouTube URL."""
-    if not url:
-        raise gr.Error("URL is required.")
-    output_path = os.path.join(now_dir, "dl_audio", "audio.wav")
-    ydl_opts = {
-        'noplaylist': True,
-        'format': 'bestaudio/best',
-        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav'}],
-        "outtmpl": os.path.join(now_dir, "dl_audio", "audio"),
-        'quiet': True,
-    }
+    if not url: raise gr.Error("URL is required.")
+    output_path = os.path.join(dl_audio_dir, "audio.wav")
+    ydl_opts = {'noplaylist': True, 'format': 'bestaudio/best', 'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav'}], "outtmpl": os.path.join(dl_audio_dir, "audio"), 'quiet': True}
     try:
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            ydl.download([url])
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl: ydl.download([url])
         return "Download complete.", output_path
-    except Exception as e:
-        return f"Download failed: {e}", None
+    except Exception as e: return f"Download failed: {e}", None
 
 def run_demucs(audio_path, model="htdemucs_ft"):
-    """Runs Demucs to separate vocals from an audio file."""
-    if not audio_path or not os.path.exists(audio_path):
-        raise gr.Error("Input audio for splitting not found.")
-
-    output_dir = os.path.join(now_dir, "output")
+    if not audio_path or not os.path.exists(audio_path): raise gr.Error("Input audio for splitting not found.")
     command = f"demucs --two-stems=vocals -n {model} \"{audio_path}\" -o \"{output_dir}\""
     print(f"Running command: {command}")
-
     try:
         subprocess.run(command.split(), check=True, capture_output=True, text=True)
-
         input_filename = os.path.splitext(os.path.basename(audio_path))[0]
         vocal_path = os.path.join(output_dir, model, input_filename, "vocals.wav")
         inst_path = os.path.join(output_dir, model, input_filename, "no_vocals.wav")
-
-        if os.path.exists(vocal_path):
-            return "Splitting complete.", vocal_path, inst_path
-        else:
-            return "Splitting failed: vocal file not found.", None, None
+        if os.path.exists(vocal_path): return "Splitting complete.", vocal_path, inst_path
+        else: return "Splitting failed: vocal file not found.", None, None
     except subprocess.CalledProcessError as e:
         error_message = f"Demucs failed: {e.stderr}"
-        print(error_message)
         return error_message, None, None
 
 def refresh_model_list_ui():
-    """Refreshes the UI dropdowns for models and indices."""
     models, indices = get_models_and_indices()
     return gr.update(choices=models), gr.update(choices=indices)
 
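One pre-existing wart survives the rewrite: `run_demucs` builds a shell-style command string with escaped quotes and then calls `command.split()`. Because `subprocess.run` receives a list (no shell), the literal `"` characters stay inside the argument values, and a path containing a space is split into several tokens. Building the argument list directly avoids quoting altogether — a hedged sketch (`run_demucs_args` is a hypothetical name, not in the commit):

```python
import subprocess

def run_demucs_args(audio_path: str, out_dir: str, model: str = "htdemucs_ft"):
    # An argv list needs no shell quoting; paths with spaces arrive intact.
    cmd = ["demucs", "--two-stems=vocals", "-n", model, audio_path, "-o", out_dir]
    return subprocess.run(cmd, check=True, capture_output=True, text=True)
```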
@@ -262,126 +160,41 @@ tts_voices = get_edge_tts_voices()
 
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="rose", secondary_hue="pink")) as demo:
     gr.Markdown("# 🌺 Modernized RVC Voice Conversion 🌺")
-
-    # Stores the loaded model dictionary {sid, model, tgt_sr, ...}
     loaded_model_state = gr.State(value=None)
-
     with gr.Row():
         sid = gr.Dropdown(label="1. Select Voice Model (.pth)", choices=initial_models)
         refresh_button = gr.Button("🔄 Refresh", variant="secondary")
-
     selected_model_info = gr.Markdown("# <center> No model selected", elem_id="model-info")
-
     with gr.Tabs():
         with gr.TabItem("🎙️ Main Inference"):
             with gr.Row():
                 with gr.Column(scale=1):
-                    gr.Markdown("### Input Audio")
-                    input_audio_type = gr.Radio(
-                        ["Upload", "Microphone", "TTS", "YouTube"],
-                        value="Upload", label="Input Source"
-                    )
-
-                    # Upload/Mic
+                    gr.Markdown("### Input Audio"); input_audio_type = gr.Radio(["Upload", "Microphone", "TTS", "YouTube"], value="Upload", label="Input Source")
                     audio_in = gr.Audio(label="Upload or Record Audio", type="filepath", sources=["upload", "microphone"], visible=True)
-
-                    # TTS
-                    tts_text_in = gr.Textbox(label="TTS Text", lines=3, visible=False)
-                    tts_voice_in = gr.Dropdown(label="TTS Voice", choices=tts_voices, value=tts_voices[0], visible=False)
-                    tts_gen_button = gr.Button("Generate TTS Audio", variant="primary", visible=False)
-
-                    # YouTube
-                    yt_url_in = gr.Textbox(label="YouTube URL", visible=False)
-                    yt_dl_button = gr.Button("Download from YouTube", variant="primary", visible=False)
-
-                    gr.Markdown("### (Optional) Vocal Separation")
-                    run_demucs_button = gr.Button("Separate Vocals from Input", variant="secondary")
-                    demucs_output_vocals = gr.Audio(label="Separated Vocals (for conversion)", type="filepath")
-                    demucs_output_inst = gr.Audio(label="Separated Instrumentals", type="filepath")
-                    demucs_status = gr.Textbox(label="Splitter Status", interactive=False)
+                    tts_text_in, tts_voice_in, tts_gen_button = gr.Textbox(label="TTS Text", lines=3, visible=False), gr.Dropdown(label="TTS Voice", choices=tts_voices, value=tts_voices[0], visible=False), gr.Button("Generate TTS Audio", variant="primary", visible=False)
+                    yt_url_in, yt_dl_button = gr.Textbox(label="YouTube URL", visible=False), gr.Button("Download from YouTube", variant="primary", visible=False)
+                    gr.Markdown("### (Optional) Vocal Separation"); run_demucs_button = gr.Button("Separate Vocals from Input", variant="secondary")
+                    demucs_output_vocals, demucs_output_inst, demucs_status = gr.Audio(label="Separated Vocals (for conversion)", type="filepath"), gr.Audio(label="Separated Instrumentals", type="filepath"), gr.Textbox(label="Splitter Status", interactive=False)
                     gr.Markdown("_Use the 'Separated Vocals' as input for the best results._")
-
                 with gr.Column(scale=1):
-                    gr.Markdown("### Inference Settings")
-                    spk_item = gr.Slider(minimum=0, maximum=2333, step=1, label="Speaker ID", value=0, visible=False, interactive=True)
+                    gr.Markdown("### Inference Settings"); spk_item = gr.Slider(minimum=0, maximum=2333, step=1, label="Speaker ID", value=0, visible=False, interactive=True)
                     vc_transform0 = gr.Number(label="Transpose (semitones)", value=0)
-                    f0method0 = gr.Radio(
-                        label="Pitch Extraction Algorithm",
-                        choices=["pm", "harvest", "crepe", "rmvpe"] if os.path.exists(RMVPE_PATH) else ["pm", "harvest", "crepe"],
-                        value="rmvpe" if os.path.exists(RMVPE_PATH) else "pm", interactive=True
-                    )
-                    file_index = gr.Dropdown(label="Feature Index File (.index)", choices=initial_indices, interactive=True)
-                    index_rate0 = gr.Slider(minimum=0, maximum=1, label="Feature Retrieval Ratio", value=0.7, interactive=True)
-                    filter_radius0 = gr.Slider(minimum=0, maximum=7, label="Median Filtering Radius (reduces breathiness)", value=3, step=1, interactive=True)
-                    resample_sr0 = gr.Slider(minimum=0, maximum=48000, label="Output Resampling (0 for auto)", value=0, step=1, interactive=True)
-                    rms_mix_rate0 = gr.Slider(minimum=0, maximum=1, label="Input/Output Volume Envelope Mix Ratio", value=1, interactive=True)
-                    protect0 = gr.Slider(minimum=0, maximum=0.5, label="Voice Protection (for breathiness)", value=0.33, step=0.01, interactive=True)
+                    f0method0 = gr.Radio(label="Pitch Extraction Algorithm", choices=["pm", "harvest", "crepe", "rmvpe"] if os.path.exists(RMVPE_PATH) else ["pm", "harvest", "crepe"], value="rmvpe" if os.path.exists(RMVPE_PATH) else "pm", interactive=True)
+                    file_index, index_rate0, filter_radius0 = gr.Dropdown(label="Feature Index File (.index)", choices=initial_indices, interactive=True), gr.Slider(minimum=0, maximum=1, label="Feature Retrieval Ratio", value=0.7, interactive=True), gr.Slider(minimum=0, maximum=7, label="Median Filtering Radius", value=3, step=1, interactive=True)
+                    resample_sr0, rms_mix_rate0, protect0 = gr.Slider(minimum=0, maximum=48000, label="Output Resampling", value=0, step=1, interactive=True), gr.Slider(minimum=0, maximum=1, label="Volume Envelope Mix Ratio", value=1, interactive=True), gr.Slider(minimum=0, maximum=0.5, label="Voice Protection", value=0.33, step=0.01, interactive=True)
                    f0_file0 = gr.File(label="Optional F0 Curve File (.txt)", file_count="single")
-
                 with gr.Column(scale=1):
-                    gr.Markdown("### Output")
-                    convert_button = gr.Button(" Convert", variant="primary")
-                    vc_log = gr.Textbox(label="Output Information", interactive=False)
-                    vc_output = gr.Audio(label="Converted Audio", interactive=False)
-
+                    gr.Markdown("### Output"); convert_button = gr.Button("✨ Convert", variant="primary")
+                    vc_log, vc_output = gr.Textbox(label="Output Information", interactive=False), gr.Audio(label="Converted Audio", interactive=False)
         with gr.TabItem("📚 Add New Models"):
-            gr.Markdown(
-                """
-                ## How to Add New Models
-                The old 'Model Downloader' has been removed to make this Space faster and more reliable.
-                Here's the modern way to add your own RVC models:
-
-                1. **Go to the 'Files' tab** at the top of this Hugging Face Space.
-                2. **Navigate to the `weights` folder.**
-                3. Click **'Upload file'** to add your model files.
-                   - Your model `.pth` file should go directly into the `weights` folder.
-                   - Your index `.index` file should go into the `weights/index` folder.
-                4. Once uploaded, come back to this 'Inference' tab and **click the '🔄 Refresh' button** next to the model dropdown. Your new model will appear!
-
-                This process uses Git-LFS to handle large files correctly and ensures your models are always available without needing to be re-downloaded.
-                """
-            )
-
-    # --- Event Listeners ---
-
-    # Load model when dropdown changes
-    sid.change(
-        load_selected_model,
-        inputs=[sid, protect0],
-        outputs=[loaded_model_state, spk_item, protect0, file_index, selected_model_info]
-    )
-
-    # Refresh button
+            gr.Markdown("## How to Add New Models\n1. Go to the 'Files' tab of this Space.\n2. Navigate to the `weights` folder.\n3. Click 'Upload file' to add your `.pth` model file.\n4. Navigate to `weights/index` to upload your `.index` file.\n5. Come back here and click '🔄 Refresh'.")
+    sid.change(load_selected_model, [sid, protect0], [loaded_model_state, spk_item, protect0, file_index, selected_model_info])
     refresh_button.click(refresh_model_list_ui, None, [sid, file_index])
-
-    # Main conversion
-    # The source audio is chosen based on which one was last interacted with or generated.
-    # Gradio automatically picks the most recent one if multiple gr.Audio inputs are provided.
-    convert_button.click(
-        vc_single,
-        [spk_item, demucs_output_vocals, vc_transform0, f0method0, file_index, index_rate0, filter_radius0, resample_sr0, rms_mix_rate0, protect0, f0_file0, loaded_model_state],
-        [vc_log, vc_output]
-    )
-
-    # Input type visibility
-    def update_input_visibility(choice):
-        return {
-            audio_in: gr.update(visible=choice in ["Upload", "Microphone"]),
-            tts_text_in: gr.update(visible=choice == "TTS"),
-            tts_voice_in: gr.update(visible=choice == "TTS"),
-            tts_gen_button: gr.update(visible=choice == "TTS"),
-            yt_url_in: gr.update(visible=choice == "YouTube"),
-            yt_dl_button: gr.update(visible=choice == "YouTube"),
-        }
+    convert_button.click(vc_single, [spk_item, demucs_output_vocals, vc_transform0, f0method0, file_index, index_rate0, filter_radius0, resample_sr0, rms_mix_rate0, protect0, f0_file0, loaded_model_state], [vc_log, vc_output])
+    def update_input_visibility(c): return {audio_in: gr.update(visible=c in ["Upload", "Microphone"]), tts_text_in: gr.update(visible=c=="TTS"), tts_voice_in: gr.update(visible=c=="TTS"), tts_gen_button: gr.update(visible=c=="TTS"), yt_url_in: gr.update(visible=c=="YouTube"), yt_dl_button: gr.update(visible=c=="YouTube")}
     input_audio_type.change(update_input_visibility, input_audio_type, [audio_in, tts_text_in, tts_voice_in, tts_gen_button, yt_url_in, yt_dl_button])
-
-    # Generators for input audio
     tts_gen_button.click(run_tts, [tts_text_in, tts_voice_in], [demucs_status, audio_in])
     yt_dl_button.click(run_youtube_dl, [yt_url_in], [demucs_status, audio_in])
-
-    # Vocal separator
     run_demucs_button.click(run_demucs, [audio_in], [demucs_status, demucs_output_vocals, demucs_output_inst])
 
-
-# Launch the app
-demo.queue(max_size=20).launch(debug=True)  # Enable queue for handling traffic
+demo.queue(max_size=20).launch()
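A final robustness note on `vc_single`: the normalization step `audio_data / np.iinfo(audio_data.dtype).max` assumes integer PCM, and `np.iinfo` raises a `ValueError` on float dtypes. Gradio typically delivers int16 arrays here, but a defensive variant costs little — an illustrative helper, not part of the commit:

```python
import numpy as np

def to_float32_mono(audio: np.ndarray) -> np.ndarray:
    """Scale integer PCM into [-1, 1]; pass float audio through unchanged."""
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    else:
        audio = audio.astype(np.float32)
    if audio.ndim > 1:  # (samples, channels) -> mono
        audio = audio.mean(axis=1)
    return audio
```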
 