_Noxty committed on
Commit
292372b
·
verified ·
1 Parent(s): 24dac05

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +721 -597
app.py CHANGED
@@ -1,7 +1,188 @@
 
 
 
 
 
 
1
  import gradio as gr
2
- from separwator import *
3
- import os, subprocess
4
- import gc, re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def leaderboard(list_filter, list_limit):
7
  try:
@@ -23,614 +204,557 @@ def leaderboard(list_filter, list_limit):
23
  except Exception as e:
24
  return f"Error: {e}"
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
 
 
27
 
28
- with gr.Blocks(theme=gr.themes.Base(), title = "🎵 Audio Separator UI 🎵") as app:
29
- with gr.Row():
30
- gr.Markdown("<h1><center> 🎵 Audio Separator UI 🎵")
31
- with gr.Row():
32
- with gr.Tabs():
33
- with gr.TabItem("BS/Mel Roformer"):
34
- with gr.Row():
35
- roformer_model = gr.Dropdown(
36
- label = "Select the model",
37
- choices = list(roformer_models.keys()),
38
- value = lambda : None,
39
- interactive = True
40
- )
41
- roformer_output_format = gr.Dropdown(
42
- label = "Select the output format",
43
- choices = output_format,
44
- value = lambda : None,
45
- interactive = True
46
- )
47
- with gr.Row():
48
- roformer_audio = gr.Audio(
49
- label = "Input audio",
50
- type = "filepath",
51
- interactive = True
52
- )
53
- with gr.Column():
54
- with gr.Accordion("Advanced settings", open = False):
55
- with gr.Group():
56
- with gr.Row():
57
- roformer_segment_size = gr.Slider(
58
- label = "Segment size",
59
- info = "Larger consumes more resources, but may give better results",
60
- minimum = 32,
61
- maximum = 4000,
62
- step = 32,
63
- value = 256,
64
- interactive = True
65
- )
66
-
67
- roformer_override_segment_size = gr.Checkbox(
68
- label = "Override segment size",
69
- info = "Override model default segment size instead of using the model default value",
70
- value = False,
71
- interactive = True
72
- )
73
- with gr.Row():
74
- roformer_overlap = gr.Slider(
75
- label = "Overlap",
76
- info = "Amount of overlap between prediction windows",
77
- minimum = 2,
78
- maximum = 10,
79
- step = 1,
80
- value = 8,
81
- interactive = True
82
- )
83
- roformer_batch_size = gr.Slider(
84
- label = "Batch size",
85
- info = "Larger consumes more RAM but may process slightly faster",
86
- minimum = 1,
87
- maximum = 16,
88
- step = 1,
89
- value = 1,
90
- interactive = True
91
- )
92
- with gr.Row():
93
- roformer_normalization_threshold = gr.Slider(
94
- label = "Normalization threshold",
95
- info = "The threshold for audio normalization",
96
- minimum = 0.1,
97
- maximum = 1,
98
- step = 0.1,
99
- value = 0.1,
100
- interactive = True
101
- )
102
- roformer_amplification_threshold = gr.Slider(label = "Amplification threshold",info = "The threshold for audio amplification", minimum = 0.1,maximum = 1,step = 0.1,value = 0.1,interactive = True)
103
-
104
-
105
-
106
-
107
-
108
-
109
-
110
-
111
- with gr.Row():
112
- roformer_button = gr.Button("Separate!", variant = "primary")
113
- with gr.Row():
114
- roformer_stem1 = gr.Audio(
115
- show_download_button = True,
116
- interactive = False,
117
- label = "Stem 1",
118
- type = "filepath"
119
- )
120
- roformer_stem2 = gr.Audio(
121
- show_download_button = True,
122
- interactive = False,
123
- label = "Stem 2",
124
- type = "filepath"
125
- )
126
-
127
- roformer_button.click(roformer_separator, [roformer_audio, roformer_model, roformer_output_format, roformer_segment_size, roformer_override_segment_size, roformer_overlap, roformer_batch_size, roformer_normalization_threshold, roformer_amplification_threshold], [roformer_stem1, roformer_stem2])
128
-
129
- with gr.TabItem("MDX23C"):
130
- with gr.Row():
131
- mdx23c_model = gr.Dropdown(
132
- label = "Select the model",
133
- choices = mdx23c_models,
134
- value = lambda : None,
135
- interactive = True
136
- )
137
- mdx23c_output_format = gr.Dropdown(
138
- label = "Select the output format",
139
- choices = output_format,
140
- value = lambda : None,
141
- interactive = True
142
- )
143
- with gr.Row():
144
- mdx23c_audio = gr.Audio(
145
- label = "Input audio",
146
- type = "filepath",
147
- interactive = True
148
- )
149
- with gr.Accordion("Advanced settings", open = False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  with gr.Group():
 
151
  with gr.Row():
152
- mdx23c_segment_size = gr.Slider(
153
- minimum = 32,
154
- maximum = 4000,
155
- step = 32,
156
- label = "Segment size",
157
- info = "Larger consumes more resources, but may give better results",
158
- value = 256,
159
- interactive = True
160
- )
161
- mdx23c_override_segment_size = gr.Checkbox(
162
- label = "Override segment size",
163
- info = "Override model default segment size instead of using the model default value",
164
- value = False,
165
- interactive = True
166
- )
167
- with gr.Row():
168
- mdx23c_overlap = gr.Slider(
169
- minimum = 2,
170
- maximum = 50,
171
- step = 1,
172
- label = "Overlap",
173
- info = "Amount of overlap between prediction windows",
174
- value = 8,
175
- interactive = True
176
- )
177
- mdx23c_batch_size = gr.Slider(
178
- label = "Batch size",
179
- info = "Larger consumes more RAM but may process slightly faster",
180
- minimum = 1,
181
- maximum = 16,
182
- step = 1,
183
- value = 1,
184
- interactive = True
185
- )
186
- with gr.Row():
187
- mdx23c_normalization_threshold = gr.Slider(
188
- label = "Normalization threshold",
189
- info = "The threshold for audio normalization",
190
- minimum = 0.1,
191
- maximum = 1,
192
- step = 0.1,
193
- value = 0.1,
194
- interactive = True
195
- )
196
- mdx23c_amplification_threshold = gr.Slider(
197
- label = "Amplification threshold",
198
- info = "The threshold for audio amplification",
199
- minimum = 0.1,
200
- maximum = 1,
201
- step = 0.1,
202
- value = 0.1,
203
- interactive = True
204
- )
205
-
206
-
207
- with gr.Row():
208
- mdx23c_button = gr.Button("Separate!", variant = "primary")
209
- with gr.Row():
210
- mdx23c_stem1 = gr.Audio(
211
- show_download_button = True,
212
- interactive = False,
213
- label = "Stem 1",
214
- type = "filepath"
215
- )
216
- mdx23c_stem2 = gr.Audio(
217
- show_download_button = True,
218
- interactive = False,
219
- label = "Stem 2",
220
- type = "filepath"
221
- )
222
-
223
- mdx23c_button.click(mdxc_separator, [mdx23c_audio, mdx23c_model, mdx23c_output_format, mdx23c_segment_size, mdx23c_override_segment_size, mdx23c_overlap, mdx23c_batch_size, mdx23c_normalization_threshold, mdx23c_amplification_threshold], [mdx23c_stem1, mdx23c_stem2])
224
-
225
- with gr.TabItem("MDX-NET"):
226
- with gr.Row():
227
- mdxnet_model = gr.Dropdown(
228
- label = "Select the model",
229
- choices = mdxnet_models,
230
- value = lambda : None,
231
- interactive = True
232
- )
233
- mdxnet_output_format = gr.Dropdown(
234
- label = "Select the output format",
235
- choices = output_format,
236
- value = lambda : None,
237
- interactive = True
238
- )
239
- with gr.Row():
240
- mdxnet_audio = gr.Audio(
241
- label = "Input audio",
242
- type = "filepath",
243
- interactive = True
244
- )
245
- with gr.Accordion("Advanced settings", open = False):
246
  with gr.Group():
247
  with gr.Row():
248
- mdxnet_hop_length = gr.Slider(
249
- label = "Hop length",
250
- info = "Usually called stride in neural networks; only change if you know what you're doing",
251
- minimum = 32,
252
- maximum = 2048,
253
- step = 32,
254
- value = 1024,
255
- interactive = True
256
- )
257
- mdxnet_segment_size = gr.Slider(
258
- minimum = 32,
259
- maximum = 4000,
260
- step = 32,
261
- label = "Segment size",
262
- info = "Larger consumes more resources, but may give better results",
263
- value = 256,
264
- interactive = True
265
- )
266
- mdxnet_denoise = gr.Checkbox(
267
- label = "Denoise",
268
- info = "Enable denoising during separation",
269
- value = True,
270
- interactive = True
271
- )
272
  with gr.Row():
273
- mdxnet_overlap = gr.Slider(
274
- label = "Overlap",
275
- info = "Amount of overlap between prediction windows",
276
- minimum = 0.001,
277
- maximum = 0.999,
278
- step = 0.001,
279
- value = 0.25,
280
- interactive = True
281
- )
282
- mdxnet_batch_size = gr.Slider(
283
- label = "Batch size",
284
- info = "Larger consumes more RAM but may process slightly faster",
285
- minimum = 1,
286
- maximum = 16,
287
- step = 1,
288
- value = 1,
289
- interactive = True
290
- )
291
  with gr.Row():
292
- mdxnet_normalization_threshold = gr.Slider(
293
- label = "Normalization threshold",
294
- info = "The threshold for audio normalization",
295
- minimum = 0.1,
296
- maximum = 1,
297
- step = 0.1,
298
- value = 0.1,
299
- interactive = True
300
- )
301
- mdxnet_amplification_threshold = gr.Slider(
302
- label = "Amplification threshold",
303
- info = "The threshold for audio amplification",
304
- minimum = 0.1,
305
- maximum = 1,
306
- step = 0.1,
307
- value = 0.1,
308
- interactive = True
309
- )
310
-
311
-
312
-
313
- with gr.Row():
314
- mdxnet_button = gr.Button("Separate!", variant = "primary")
315
- with gr.Row():
316
- mdxnet_stem1 = gr.Audio(
317
- show_download_button = True,
318
- interactive = False,
319
- label = "Stem 1",
320
- type = "filepath"
321
- )
322
- mdxnet_stem2 = gr.Audio(
323
- show_download_button = True,
324
- interactive = False,
325
- label = "Stem 2",
326
- type = "filepath"
327
- )
328
-
329
- mdxnet_button.click(mdxnet_separator, [mdxnet_audio, mdxnet_model, mdxnet_output_format, mdxnet_hop_length, mdxnet_segment_size, mdxnet_denoise, mdxnet_overlap, mdxnet_batch_size, mdxnet_normalization_threshold, mdxnet_amplification_threshold], [mdxnet_stem1, mdxnet_stem2])
330
-
331
- with gr.TabItem("VR ARCH"):
332
- with gr.Row():
333
- vrarch_model = gr.Dropdown(
334
- label = "Select the model",
335
- choices = vrarch_models,
336
- value = lambda : None,
337
- interactive = True
338
- )
339
- vrarch_output_format = gr.Dropdown(
340
- label = "Select the output format",
341
- choices = output_format,
342
- value = lambda : None,
343
- interactive = True
344
- )
345
- with gr.Row():
346
- vrarch_audio = gr.Audio(
347
- label = "Input audio",
348
- type = "filepath",
349
- interactive = True
350
- )
351
-
352
- with gr.Accordion("Advanced settings", open = False):
353
  with gr.Group():
 
354
  with gr.Row():
355
- vrarch_window_size = gr.Slider(
356
- label = "Window size",
357
- info = "Balance quality and speed. 1024 = fast but lower, 320 = slower but better quality",
358
- minimum=320,
359
- maximum=1024,
360
- step=32,
361
- value = 512,
362
- interactive = True
363
- )
364
- vrarch_agression = gr.Slider(
365
- minimum = 1,
366
- maximum = 50,
367
- step = 1,
368
- label = "Agression",
369
- info = "Intensity of primary stem extraction",
370
- value = 5,
371
- interactive = True
372
- )
373
- vrarch_tta = gr.Checkbox(
374
- label = "TTA",
375
- info = "Enable Test-Time-Augmentation; slow but improves quality",
376
- value = True,
377
- visible = True,
378
- interactive = True
379
- )
380
  with gr.Row():
381
- vrarch_post_process = gr.Checkbox(
382
- label = "Post process",
383
- info = "Identify leftover artifacts within vocal output; may improve separation for some songs",
384
- value = False,
385
- visible = True,
386
- interactive = True
387
- )
388
- vrarch_post_process_threshold = gr.Slider(
389
- label = "Post process threshold",
390
- info = "Threshold for post-processing",
391
- minimum = 0.1,
392
- maximum = 0.3,
393
- step = 0.1,
394
- value = 0.2,
395
- interactive = True
396
- )
 
 
 
397
  with gr.Row():
398
- vrarch_high_end_process = gr.Checkbox(
399
- label = "High end process",
400
- info = "Mirror the missing frequency range of the output",
401
- value = False,
402
- visible = True,
403
- interactive = True,
404
- )
405
- vrarch_batch_size = gr.Slider(
406
- label = "Batch size",
407
- info = "Larger consumes more RAM but may process slightly faster",
408
- minimum = 1,
409
- maximum = 16,
410
- step = 1,
411
- value = 1,
412
- interactive = True
413
- )
414
  with gr.Row():
415
- vrarch_normalization_threshold = gr.Slider(
416
- label = "Normalization threshold",
417
- info = "The threshold for audio normalization",
418
- minimum = 0.1,
419
- maximum = 1,
420
- step = 0.1,
421
- value = 0.1,
422
- interactive = True
423
- )
424
- vrarch_amplification_threshold = gr.Slider(
425
- label = "Amplification threshold",
426
- info = "The threshold for audio amplification",
427
- minimum = 0.1,
428
- maximum = 1,
429
- step = 0.1,
430
- value = 0.1,
431
- interactive = True
432
- )
433
-
434
-
435
- with gr.Row():
436
- vrarch_button = gr.Button("Separate!", variant = "primary")
437
- with gr.Row():
438
- vrarch_stem1 = gr.Audio(
439
- show_download_button = True,
440
- interactive = False,
441
- type = "filepath",
442
- label = "Stem 1"
443
- )
444
- vrarch_stem2 = gr.Audio(
445
- show_download_button = True,
446
- interactive = False,
447
- type = "filepath",
448
- label = "Stem 2"
449
- )
450
-
451
- vrarch_button.click(vrarch_separator, [vrarch_audio, vrarch_model, vrarch_output_format, vrarch_window_size, vrarch_agression, vrarch_tta, vrarch_post_process, vrarch_post_process_threshold, vrarch_high_end_process, vrarch_batch_size, vrarch_normalization_threshold, vrarch_amplification_threshold], [vrarch_stem1, vrarch_stem2])
452
-
453
- with gr.TabItem("Demucs"):
454
- with gr.Row():
455
- demucs_model = gr.Dropdown(
456
- label = "Select the model",
457
- choices = demucs_models,
458
- value = lambda : None,
459
- interactive = True
460
- )
461
- demucs_output_format = gr.Dropdown(
462
- label = "Select the output format",
463
- choices = output_format,
464
- value = lambda : None,
465
- interactive = True
466
- )
467
- with gr.Row():
468
- demucs_audio = gr.Audio(
469
- label = "Input audio",
470
- type = "filepath",
471
- interactive = True
472
- )
473
- with gr.Accordion("Advanced settings", open = False):
474
  with gr.Group():
475
  with gr.Row():
476
- demucs_shifts = gr.Slider(
477
- label = "Shifts",
478
- info = "Number of predictions with random shifts, higher = slower but better quality",
479
- minimum = 1,
480
- maximum = 20,
481
- step = 1,
482
- value = 2,
483
- interactive = True
484
- )
485
- demucs_segment_size = gr.Slider(
486
- label = "Segment size",
487
- info = "Size of segments into which the audio is split. Higher = slower but better quality",
488
- minimum = 1,
489
- maximum = 100,
490
- step = 1,
491
- value = 40,
492
- interactive = True
493
- )
494
- demucs_segments_enabled = gr.Checkbox(
495
- label = "Segment-wise processing",
496
- info = "Enable segment-wise processing",
497
- value = True,
498
- interactive = True
499
- )
500
  with gr.Row():
501
- demucs_overlap = gr.Slider(
502
- label = "Overlap",
503
- info = "Overlap between prediction windows. Higher = slower but better quality",
504
- minimum=0.001,
505
- maximum=0.999,
506
- step=0.001,
507
- value = 0.25,
508
- interactive = True
509
- )
510
- demucs_batch_size = gr.Slider(
511
- label = "Batch size",
512
- info = "Larger consumes more RAM but may process slightly faster",
513
- minimum = 1,
514
- maximum = 16,
515
- step = 1,
516
- value = 1,
517
- interactive = True
518
- )
519
  with gr.Row():
520
- demucs_normalization_threshold = gr.Slider(
521
- label = "Normalization threshold",
522
- info = "The threshold for audio normalization",
523
- minimum = 0.1,
524
- maximum = 1,
525
- step = 0.1,
526
- value = 0.1,
527
- interactive = True
528
- )
529
- demucs_amplification_threshold = gr.Slider(
530
- label = "Amplification threshold",
531
- info = "The threshold for audio amplification",
532
- minimum = 0.1,
533
- maximum = 1,
534
- step = 0.1,
535
- value = 0.1,
536
- interactive = True
537
- )
538
-
539
-
540
- with gr.Row():
541
- gr.Markdown("You can paste the link to the video/audio from many sites, check the complete list [here](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md)")
542
- with gr.Row():
543
- demucs_download_button = gr.Button(
544
- "Download!",
545
- variant = "primary"
546
- )
547
-
548
-
549
-
550
-
551
-
552
-
553
- with gr.Row():
554
- demucs_button = gr.Button("Separate!", variant = "primary")
555
- with gr.Row():
556
- demucs_stem1 = gr.Audio(
557
- show_download_button = True,
558
- interactive = False,
559
- type = "filepath",
560
- label = "Stem 1"
561
- )
562
- demucs_stem2 = gr.Audio(
563
- show_download_button = True,
564
- interactive = False,
565
- type = "filepath",
566
- label = "Stem 2"
567
- )
568
- with gr.Row():
569
- demucs_stem3 = gr.Audio(
570
- show_download_button = True,
571
- interactive = False,
572
- type = "filepath",
573
- label = "Stem 3"
574
- )
575
- demucs_stem4 = gr.Audio(
576
- show_download_button = True,
577
- interactive = False,
578
- type = "filepath",
579
- label = "Stem 4"
580
- )
581
- with gr.Row(visible=False) as stem6:
582
- demucs_stem5 = gr.Audio(
583
- show_download_button = True,
584
- interactive = False,
585
- type = "filepath",
586
- label = "Stem 5"
587
- )
588
- demucs_stem6 = gr.Audio(
589
- show_download_button = True,
590
- interactive = False,
591
- type = "filepath",
592
- label = "Stem 6"
593
- )
594
-
595
- demucs_model.change(update_stems, inputs=[demucs_model], outputs=stem6)
596
-
597
- demucs_button.click(demucs_separator, [demucs_audio, demucs_model, demucs_output_format, demucs_shifts, demucs_segment_size, demucs_segments_enabled, demucs_overlap, demucs_batch_size, demucs_normalization_threshold, demucs_amplification_threshold], [demucs_stem1, demucs_stem2, demucs_stem3, demucs_stem4, demucs_stem5, demucs_stem6])
598
-
599
-
600
- with gr.Tab("Leaderboard"):
601
- with gr.Row(equal_height=True):
602
- list_filter = gr.Dropdown(value="vocals", choices=["vocals", "instrumental", "drums", "bass", "guitar", "piano", "other"], label="List filter", info="Filter and sort the model list by 'stem'")
603
- list_limit = gr.Slider(minimum=1, maximum=10, step=1, value=5, label="List limit", info="Limit the number of models shown.")
604
- list_button = gr.Button("Show list", variant="primary")
605
- output_list = gr.HTML(label="Leaderboard")
606
-
607
- list_button.click(leaderboard, inputs=[list_filter, list_limit], outputs=output_list)
608
-
609
-
610
- with gr.TabItem("Credits"):
611
- gr.Markdown(
612
- """
613
- audio separator UI created by [Eddycrack 864](https://github.com/Eddycrack864) & [_noxty](https://huggingface.co/theNeofr).
614
- * python-audio-separator by [beveradb](https://github.com/beveradb).
615
- * Thanks to [Mikus](https://github.com/cappuch) for the help with the code.
616
- * Thanks to [Nick088](https://huggingface.co/Nick088) for the help to fix roformers.
617
- * Thanks to [ArisDev](https://github.com/aris-py) for porting UVR5 UI to Kaggle and improvements.
618
- * Thanks to [Bebra777228](https://github.com/Bebra777228)'s code for guiding me to improve my code.
619
-
620
-
621
- You can donate to the original UVR5 project here:
622
- [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://www.buymeacoffee.com/uvr5)
623
- """
624
- )
625
- gr.Markdown(
626
- """
627
- <h1> similar of this project\n
628
- [Audio_separator by r3gm](https://huggingface.co/spaces/r3gm/Audio_separator)\n
629
- [Audio-Separator by Politrees](https://huggingface.co/spaces/Politrees/Audio-Separator)\n
630
- [UVR5 UI by Eddycrack 864](https://huggingface.co/spaces/Eddycrack864/UVR5-UI)
631
- """
632
- )
633
-
634
-
635
- app.queue()
636
- app.launch(share=True, debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import torch
4
+ import shutil
5
+ import logging
6
+ import subprocess
7
  import gradio as gr
8
+
9
+ from audio_separator.separator import Separator
10
+
11
+ device = "cuda" if torch.cuda.is_available() else "cpu"
12
+ use_autocast = device == "cuda"
13
+
14
+ #=========================#
15
+ # Roformer Models #
16
+ #=========================#
17
+ ROFORMER_MODELS = {
18
+ # BS Roformer
19
+ 'BS-Roformer-Viperx-1053': 'model_bs_roformer_ep_937_sdr_10.5309.ckpt',
20
+ 'BS-Roformer-Viperx-1296': 'model_bs_roformer_ep_368_sdr_12.9628.ckpt',
21
+ 'BS-Roformer-Viperx-1297': 'model_bs_roformer_ep_317_sdr_12.9755.ckpt',
22
+ 'BS-Roformer-De-Reverb': 'deverb_bs_roformer_8_384dim_10depth.ckpt',
23
+ 'BS Roformer | Chorus Male-Female by Sucial': 'model_chorus_bs_roformer_ep_267_sdr_24.1275.ckpt',
24
+
25
+ # MelBand Roformer
26
+ 'Mel-Roformer-Crowd-Aufr33-Viperx': 'mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144.ckpt',
27
+ 'Mel-Roformer-Karaoke-Aufr33-Viperx': 'mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt',
28
+ 'Mel-Roformer-Viperx-1143': 'model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt',
29
+ 'MelBand Roformer | De-Reverb-Echo by Sucial': 'dereverb-echo_mel_band_roformer_sdr_10.0169.ckpt',
30
+ 'MelBand Roformer | De-Reverb-Echo V2 by Sucial': 'dereverb-echo_mel_band_roformer_sdr_13.4843_v2.ckpt',
31
+ 'MelBand Roformer | Aspiration Less Aggressive by Sucial': 'aspiration_mel_band_roformer_less_aggr_sdr_18.1201.ckpt',
32
+ 'MelBand Roformer | Aspiration by Sucial': 'aspiration_mel_band_roformer_sdr_18.9845.ckpt',
33
+ 'MelBand Roformer | De-Reverb Less Aggressive by anvuew': 'dereverb_mel_band_roformer_less_aggressive_anvuew_sdr_18.8050.ckpt',
34
+ 'MelBand Roformer | De-Reverb by anvuew': 'dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt',
35
+ 'MelBand Roformer | Vocals by Kimberley Jensen': 'vocals_mel_band_roformer.ckpt',
36
+ 'Mel-Roformer-Denoise-Aufr33-Aggr': 'denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt',
37
+ 'Mel-Roformer-Denoise-Aufr33': 'denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt',
38
+ 'MelBand Roformer | Bleed Suppressor V1 by unwa-97chris': 'mel_band_roformer_bleed_suppressor_v1.ckpt',
39
+
40
+ # MelBand Roformer Kim
41
+ 'MelBand Roformer Kim | FT by unwa': 'mel_band_roformer_kim_ft_unwa.ckpt',
42
+ 'MelBand Roformer Kim | Big Beta 4 FT by unwa': 'melband_roformer_big_beta4.ckpt',
43
+ 'MelBand Roformer Kim | Big Beta 5e FT by unwa': 'melband_roformer_big_beta5e.ckpt',
44
+ 'MelBand Roformer Kim | Inst V1 by Unwa': 'melband_roformer_inst_v1.ckpt',
45
+ 'MelBand Roformer Kim | Inst V1 (E) by Unwa': 'melband_roformer_inst_v1e.ckpt',
46
+ 'MelBand Roformer Kim | Inst V2 by Unwa': 'melband_roformer_inst_v2.ckpt',
47
+ 'MelBand Roformer Kim | InstVoc Duality V1 by Unwa': 'melband_roformer_instvoc_duality_v1.ckpt',
48
+ 'MelBand Roformer Kim | InstVoc Duality V2 by Unwa': 'melband_roformer_instvox_duality_v2.ckpt',
49
+ 'MelBand Roformer Kim | SYHFT by SYH99999': 'MelBandRoformerSYHFT.ckpt',
50
+ 'MelBand Roformer Kim | SYHFT V2 by SYH99999': 'MelBandRoformerSYHFTV2.ckpt',
51
+ 'MelBand Roformer Kim | SYHFT V2.5 by SYH99999': 'MelBandRoformerSYHFTV2.5.ckpt',
52
+ 'MelBand Roformer Kim | SYHFT V3 by SYH99999': 'MelBandRoformerSYHFTV3Epsilon.ckpt',
53
+ 'MelBand Roformer Kim | Big SYHFT V1 by SYH99999': 'MelBandRoformerBigSYHFTV1.ckpt',
54
+ }
55
+ #=========================#
56
+ # MDX23C Models #
57
+ #=========================#
58
+ MDX23C_MODELS = {
59
+ 'MDX23C DrumSep by aufr33-jarredou': 'MDX23C-DrumSep-aufr33-jarredou.ckpt',
60
+ 'MDX23C De-Reverb by aufr33-jarredou': 'MDX23C-De-Reverb-aufr33-jarredou.ckpt',
61
+ 'MDX23C-InstVoc HQ': 'MDX23C-8KFFT-InstVoc_HQ.ckpt',
62
+ 'VIP | MDX23C-InstVoc HQ 2': 'MDX23C-8KFFT-InstVoc_HQ_2.ckpt',
63
+ 'VIP | MDX23C_D1581': 'MDX23C_D1581.ckpt',
64
+ }
65
+ #=========================#
66
+ # MDX-NET Models #
67
+ #=========================#
68
+ MDXNET_MODELS = {
69
+ 'UVR-MDX-NET 1': 'UVR_MDXNET_1_9703.onnx',
70
+ 'UVR-MDX-NET 2': 'UVR_MDXNET_2_9682.onnx',
71
+ 'UVR-MDX-NET 3': 'UVR_MDXNET_3_9662.onnx',
72
+ 'UVR_MDXNET_9482': 'UVR_MDXNET_9482.onnx',
73
+ 'UVR-MDX-NET Inst 1': 'UVR-MDX-NET-Inst_1.onnx',
74
+ 'UVR-MDX-NET Inst 2': 'UVR-MDX-NET-Inst_2.onnx',
75
+ 'UVR-MDX-NET Inst 3': 'UVR-MDX-NET-Inst_3.onnx',
76
+ 'UVR-MDX-NET Inst HQ 1': 'UVR-MDX-NET-Inst_HQ_1.onnx',
77
+ 'UVR-MDX-NET Inst HQ 2': 'UVR-MDX-NET-Inst_HQ_2.onnx',
78
+ 'UVR-MDX-NET Inst HQ 3': 'UVR-MDX-NET-Inst_HQ_3.onnx',
79
+ 'UVR-MDX-NET Inst HQ 4': 'UVR-MDX-NET-Inst_HQ_4.onnx',
80
+ 'UVR-MDX-NET Inst HQ 5': 'UVR-MDX-NET-Inst_HQ_5.onnx',
81
+ 'UVR-MDX-NET Inst Main': 'UVR-MDX-NET-Inst_Main.onnx',
82
+ 'UVR-MDX-NET Karaoke': 'UVR_MDXNET_KARA.onnx',
83
+ 'UVR-MDX-NET Karaoke 2': 'UVR_MDXNET_KARA_2.onnx',
84
+ 'UVR-MDX-NET Main': 'UVR_MDXNET_Main.onnx',
85
+ 'UVR-MDX-NET Voc FT': 'UVR-MDX-NET-Voc_FT.onnx',
86
+ 'Kim Inst': 'Kim_Inst.onnx',
87
+ 'Kim Vocal 1': 'Kim_Vocal_1.onnx',
88
+ 'Kim Vocal 2': 'Kim_Vocal_2.onnx',
89
+ 'kuielab_a_bass': 'kuielab_a_bass.onnx',
90
+ 'kuielab_a_drums': 'kuielab_a_drums.onnx',
91
+ 'kuielab_a_other': 'kuielab_a_other.onnx',
92
+ 'kuielab_a_vocals': 'kuielab_a_vocals.onnx',
93
+ 'kuielab_b_bass': 'kuielab_b_bass.onnx',
94
+ 'kuielab_b_drums': 'kuielab_b_drums.onnx',
95
+ 'kuielab_b_other': 'kuielab_b_other.onnx',
96
+ 'kuielab_b_vocals': 'kuielab_b_vocals.onnx',
97
+ 'Reverb HQ By FoxJoy': 'Reverb_HQ_By_FoxJoy.onnx',
98
+ 'VIP | UVR-MDX-NET_Inst_82_beta': 'UVR-MDX-NET_Inst_82_beta.onnx',
99
+ 'VIP | UVR-MDX-NET_Inst_90_beta': 'UVR-MDX-NET_Inst_90_beta.onnx',
100
+ 'VIP | UVR-MDX-NET_Inst_187_beta': 'UVR-MDX-NET_Inst_187_beta.onnx',
101
+ 'VIP | UVR-MDX-NET-Inst_full_292': 'UVR-MDX-NET-Inst_full_292.onnx',
102
+ 'VIP | UVR-MDX-NET_Main_340': 'UVR-MDX-NET_Main_340.onnx',
103
+ 'VIP | UVR-MDX-NET_Main_390': 'UVR-MDX-NET_Main_390.onnx',
104
+ 'VIP | UVR-MDX-NET_Main_406': 'UVR-MDX-NET_Main_406.onnx',
105
+ 'VIP | UVR-MDX-NET_Main_427': 'UVR-MDX-NET_Main_427.onnx',
106
+ 'VIP | UVR-MDX-NET_Main_438': 'UVR-MDX-NET_Main_438.onnx',
107
+ }
108
+ #========================#
109
+ # VR-ARCH Models #
110
+ #========================#
111
+ VR_ARCH_MODELS = {
112
+ '1_HP-UVR': '1_HP-UVR.pth',
113
+ '2_HP-UVR': '2_HP-UVR.pth',
114
+ '3_HP-Vocal-UVR': '3_HP-Vocal-UVR.pth',
115
+ '4_HP-Vocal-UVR': '4_HP-Vocal-UVR.pth',
116
+ '5_HP-Karaoke-UVR': '5_HP-Karaoke-UVR.pth',
117
+ '6_HP-Karaoke-UVR': '6_HP-Karaoke-UVR.pth',
118
+ '7_HP2-UVR': '7_HP2-UVR.pth',
119
+ '8_HP2-UVR': '8_HP2-UVR.pth',
120
+ '9_HP2-UVR': '9_HP2-UVR.pth',
121
+ '10_SP-UVR-2B-32000-1': '10_SP-UVR-2B-32000-1.pth',
122
+ '11_SP-UVR-2B-32000-2': '11_SP-UVR-2B-32000-2.pth',
123
+ '12_SP-UVR-3B-44100': '12_SP-UVR-3B-44100.pth',
124
+ '13_SP-UVR-4B-44100-1': '13_SP-UVR-4B-44100-1.pth',
125
+ '14_SP-UVR-4B-44100-2': '14_SP-UVR-4B-44100-2.pth',
126
+ '15_SP-UVR-MID-44100-1': '15_SP-UVR-MID-44100-1.pth',
127
+ '16_SP-UVR-MID-44100-2': '16_SP-UVR-MID-44100-2.pth',
128
+ '17_HP-Wind_Inst-UVR': '17_HP-Wind_Inst-UVR.pth',
129
+ 'MGM_HIGHEND_v4': 'MGM_HIGHEND_v4.pth',
130
+ 'MGM_LOWEND_A_v4': 'MGM_LOWEND_A_v4.pth',
131
+ 'MGM_LOWEND_B_v4': 'MGM_LOWEND_B_v4.pth',
132
+ 'MGM_MAIN_v4': 'MGM_MAIN_v4.pth',
133
+ 'UVR-BVE-4B_SN-44100-1': 'UVR-BVE-4B_SN-44100-1.pth',
134
+ 'UVR-De-Reverb by aufr33-jarredou': 'UVR-De-Reverb-aufr33-jarredou.pth',
135
+ 'UVR-De-Echo-Aggressive by FoxJoy': 'UVR-De-Echo-Aggressive.pth',
136
+ 'UVR-De-Echo-Normal by FoxJoy': 'UVR-De-Echo-Normal.pth',
137
+ 'UVR-DeEcho-DeReverb by FoxJoy': 'UVR-DeEcho-DeReverb.pth',
138
+ 'UVR-DeNoise-Lite by FoxJoy': 'UVR-DeNoise-Lite.pth',
139
+ 'UVR-DeNoise by FoxJoy': 'UVR-DeNoise.pth',
140
+ }
141
+ #=======================#
142
+ # DEMUCS Models #
143
+ #=======================#
144
+ DEMUCS_MODELS = {
145
+ 'htdemucs': 'htdemucs.yaml',
146
+ 'htdemucs_6s': 'htdemucs_6s.yaml',
147
+ 'htdemucs_ft': 'htdemucs_ft.yaml',
148
+ 'hdemucs_mmi': 'hdemucs_mmi.yaml',
149
+ }
150
+
151
+ OUTPUT_FORMAT = ["wav", "flac", "mp3", "ogg", "opus", "m4a", "aiff", "ac3"]
152
+
153
def print_message(input_file, model_name):
    """Write a short banner to stdout announcing the upcoming separation.

    Args:
        input_file: Path of the audio file; only its base name without the
            extension is shown.
        model_name: Label of the separation model, printed as given.
    """
    file_stem, _extension = os.path.splitext(os.path.basename(input_file))
    # Each tuple is one print() call; the lone "\n" yields two blank lines
    # because print() appends its own newline.
    report = (
        ("\n",),
        ("🎵 Audio-Separator 🎵",),
        ("Input audio:", file_stem),
        ("Separation Model:", model_name),
        ("Audio Separation Process...",),
    )
    for fields in report:
        print(*fields)
161
+
162
def prepare_output_dir(input_file, output_dir):
    """Create an empty directory for the output files of one input file.

    The directory is ``output_dir/<base name of input_file without extension>``.
    If it already exists it is removed with all of its contents and recreated.

    Args:
        input_file: Path of the audio file the outputs belong to.
        output_dir: Parent directory under which the per-file directory lives.

    Returns:
        The path of the freshly created directory.

    Raises:
        RuntimeError: If the input file has no usable base name, or if the
            directory could not be removed or created (the underlying error
            is attached as ``__cause__``).
    """
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    # An empty, "." or ".." name would make the target resolve to output_dir
    # itself or its parent, and rmtree below would then wipe that directory.
    if base_name in ("", ".", ".."):
        raise RuntimeError(
            f"Failed to prepare output directory: {input_file!r} has no usable file name"
        )
    out_dir = os.path.join(output_dir, base_name)
    try:
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir)
    except Exception as e:
        # Chain explicitly so the original traceback is reported as the cause.
        raise RuntimeError(f"Failed to prepare output directory {out_dir}: {e}") from e
    return out_dir
173
+
174
def rename_stems(audio, vocals_stem, instrumental_stem, other_stem, drums_stem, bass_stem, guitar_stem, piano_stem, model):
    """Build the output file name for every stem from its naming template.

    Each template may contain the placeholders ``NAME`` (base name of
    ``audio`` without extension), ``STEM`` (the stem label, e.g. ``Vocals``)
    and ``MODEL`` (the ``model`` argument).

    All placeholders are substituted in a single pass, so text inserted for
    one placeholder is never re-scanned. A file called ``MODEL_STEM.wav``
    therefore keeps its name instead of having ``STEM``/``MODEL`` replaced
    inside it.

    Args:
        audio: Path of the input audio file.
        vocals_stem, instrumental_stem, other_stem, drums_stem, bass_stem,
            guitar_stem, piano_stem: Naming template for the respective stem.
        model: Model label substituted for ``MODEL``.

    Returns:
        A dict mapping the stem labels ``Vocals``, ``Instrumental``,
        ``Drums``, ``Bass``, ``Other``, ``Guitar`` and ``Piano`` (in that
        order) to the filled-in templates.
    """
    base_name = os.path.splitext(os.path.basename(audio))[0]
    templates = {
        "Vocals": vocals_stem,
        "Instrumental": instrumental_stem,
        "Drums": drums_stem,
        "Bass": bass_stem,
        "Other": other_stem,
        "Guitar": guitar_stem,
        "Piano": piano_stem,
    }
    placeholder = re.compile("NAME|STEM|MODEL")

    def fill(template, stem):
        values = {"NAME": base_name, "STEM": stem, "MODEL": model}
        # A callable replacement keeps backslashes in the values literal.
        return placeholder.sub(lambda match: values[match.group(0)], template)

    return {stem: fill(template, stem) for stem, template in templates.items()}
186
 
187
  def leaderboard(list_filter, list_limit):
188
  try:
 
204
  except Exception as e:
205
  return f"Error: {e}"
206
 
207
def roformer_separator(audio, model_key, seg_size, override_seg_size, overlap, pitch_shift, model_dir, out_dir, out_format, norm_thresh, amp_thresh, batch_size, vocals_stem, instrumental_stem, other_stem, drums_stem, bass_stem, guitar_stem, piano_stem, progress=gr.Progress(track_tqdm=True)):
    """Split ``audio`` with the Roformer model selected by ``model_key``.

    The model file name is looked up in ``ROFORMER_MODELS``, a fresh output
    directory is prepared below ``out_dir``, and a ``Separator`` configured
    with the given settings loads the model and processes the file.

    Returns:
        The first two output files, each joined onto the prepared output
        directory, as a 2-tuple.

    Raises:
        KeyError: If ``model_key`` is not in ``ROFORMER_MODELS`` (the lookup
            happens before the guarded section).
        RuntimeError: Wraps any exception raised while preparing the
            directory, loading the model or separating the audio.
    """
    stem_names = rename_stems(
        audio,
        vocals_stem,
        instrumental_stem,
        other_stem,
        drums_stem,
        bass_stem,
        guitar_stem,
        piano_stem,
        model_key,
    )
    print_message(audio, model_key)
    checkpoint = ROFORMER_MODELS[model_key]
    try:
        target_dir = prepare_output_dir(audio, out_dir)
        # Roformer checkpoints are configured through the "mdxc_params" group.
        architecture_params = {
            "segment_size": seg_size,
            "override_model_segment_size": override_seg_size,
            "batch_size": batch_size,
            "overlap": overlap,
            "pitch_shift": pitch_shift,
        }
        engine = Separator(
            log_level=logging.WARNING,
            model_file_dir=model_dir,
            output_dir=target_dir,
            output_format=out_format,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            use_autocast=use_autocast,
            mdxc_params=architecture_params,
        )

        progress(0.2, desc="Model loaded...")
        engine.load_model(model_filename=checkpoint)

        progress(0.7, desc="Audio separated...")
        produced = engine.separate(audio, stem_names)
        print(f"Separation complete!\nResults: {', '.join(produced)}")

        resolved = []
        for file_name in produced:
            resolved.append(os.path.join(target_dir, file_name))
        return resolved[0], resolved[1]
    except Exception as e:
        raise RuntimeError(f"Roformer separation failed: {e}") from e
242
+
243
def mdx23c_separator(audio, model_key, seg_size, override_seg_size, overlap, pitch_shift, model_dir, out_dir, out_format, norm_thresh, amp_thresh, batch_size, vocals_stem, instrumental_stem, other_stem, drums_stem, bass_stem, guitar_stem, piano_stem, progress=gr.Progress(track_tqdm=True)):
    """Split ``audio`` with the MDX23C model selected by ``model_key``.

    The model file name is looked up in ``MDX23C_MODELS``, a fresh output
    directory is prepared below ``out_dir``, and a ``Separator`` configured
    with the given settings loads the model and processes the file.

    Returns:
        The first two output files, each joined onto the prepared output
        directory, as a 2-tuple.

    Raises:
        KeyError: If ``model_key`` is not in ``MDX23C_MODELS`` (the lookup
            happens before the guarded section).
        RuntimeError: Wraps any exception raised while preparing the
            directory, loading the model or separating the audio.
    """
    stem_names = rename_stems(
        audio,
        vocals_stem,
        instrumental_stem,
        other_stem,
        drums_stem,
        bass_stem,
        guitar_stem,
        piano_stem,
        model_key,
    )
    print_message(audio, model_key)
    checkpoint = MDX23C_MODELS[model_key]
    try:
        target_dir = prepare_output_dir(audio, out_dir)
        architecture_params = {
            "segment_size": seg_size,
            "override_model_segment_size": override_seg_size,
            "batch_size": batch_size,
            "overlap": overlap,
            "pitch_shift": pitch_shift,
        }
        engine = Separator(
            log_level=logging.WARNING,
            model_file_dir=model_dir,
            output_dir=target_dir,
            output_format=out_format,
            normalization_threshold=norm_thresh,
            amplification_threshold=amp_thresh,
            use_autocast=use_autocast,
            mdxc_params=architecture_params,
        )

        progress(0.2, desc="Model loaded...")
        engine.load_model(model_filename=checkpoint)

        progress(0.7, desc="Audio separated...")
        produced = engine.separate(audio, stem_names)
        print(f"Separation complete!\nResults: {', '.join(produced)}")

        resolved = []
        for file_name in produced:
            resolved.append(os.path.join(target_dir, file_name))
        return resolved[0], resolved[1]
    except Exception as e:
        raise RuntimeError(f"MDX23C separation failed: {e}") from e
278
+
279
+ def mdx_separator(audio, model_key, hop_length, seg_size, overlap, denoise, model_dir, out_dir, out_format, norm_thresh, amp_thresh, batch_size, vocals_stem, instrumental_stem, other_stem, drums_stem, bass_stem, guitar_stem, piano_stem, progress=gr.Progress(track_tqdm=True)):
280
+ """Separate audio using MDX-NET model."""
281
+ stemname = rename_stems(audio, vocals_stem, instrumental_stem, other_stem, drums_stem, bass_stem, guitar_stem, piano_stem, model_key)
282
+ print_message(audio, model_key)
283
+ model = MDXNET_MODELS[model_key]
284
+ try:
285
+ out_dir = prepare_output_dir(audio, out_dir)
286
+ separator = Separator(
287
+ log_level=logging.WARNING,
288
+ model_file_dir=model_dir,
289
+ output_dir=out_dir,
290
+ output_format=out_format,
291
+ normalization_threshold=norm_thresh,
292
+ amplification_threshold=amp_thresh,
293
+ use_autocast=use_autocast,
294
+ mdx_params={
295
+ "hop_length": hop_length,
296
+ "segment_size": seg_size,
297
+ "overlap": overlap,
298
+ "batch_size": batch_size,
299
+ "enable_denoise": denoise,
300
+ }
301
+ )
302
+
303
+ progress(0.2, desc="Model loaded...")
304
+ separator.load_model(model_filename=model)
305
+
306
+ progress(0.7, desc="Audio separated...")
307
+ separation = separator.separate(audio, stemname)
308
+ print(f"Separation complete!\nResults: {', '.join(separation)}")
309
+
310
+ stems = [os.path.join(out_dir, file_name) for file_name in separation]
311
+ return stems[0], stems[1]
312
+ except Exception as e:
313
+ raise RuntimeError(f"MDX-NET separation failed: {e}") from e
314
+
315
+ def vr_separator(audio, model_key, window_size, aggression, tta, post_process, post_process_threshold, high_end_process, model_dir, out_dir, out_format, norm_thresh, amp_thresh, batch_size, vocals_stem, instrumental_stem, other_stem, drums_stem, bass_stem, guitar_stem, piano_stem, progress=gr.Progress(track_tqdm=True)):
316
+ """Separate audio using VR ARCH model."""
317
+ stemname = rename_stems(audio, vocals_stem, instrumental_stem, other_stem, drums_stem, bass_stem, guitar_stem, piano_stem, model_key)
318
+ print_message(audio, model_key)
319
+ model = VR_ARCH_MODELS[model_key]
320
+ try:
321
+ out_dir = prepare_output_dir(audio, out_dir)
322
+ separator = Separator(
323
+ log_level=logging.WARNING,
324
+ model_file_dir=model_dir,
325
+ output_dir=out_dir,
326
+ output_format=out_format,
327
+ normalization_threshold=norm_thresh,
328
+ amplification_threshold=amp_thresh,
329
+ use_autocast=use_autocast,
330
+ vr_params={
331
+ "batch_size": batch_size,
332
+ "window_size": window_size,
333
+ "aggression": aggression,
334
+ "enable_tta": tta,
335
+ "enable_post_process": post_process,
336
+ "post_process_threshold": post_process_threshold,
337
+ "high_end_process": high_end_process,
338
+ }
339
+ )
340
+
341
+ progress(0.2, desc="Model loaded...")
342
+ separator.load_model(model_filename=model)
343
+
344
+ progress(0.7, desc="Audio separated...")
345
+ separation = separator.separate(audio, stemname)
346
+ print(f"Separation complete!\nResults: {', '.join(separation)}")
347
+
348
+ stems = [os.path.join(out_dir, file_name) for file_name in separation]
349
+ return stems[0], stems[1]
350
+ except Exception as e:
351
+ raise RuntimeError(f"VR ARCH separation failed: {e}") from e
352
+
353
+ def demucs_separator(audio, model_key, seg_size, shifts, overlap, segments_enabled, model_dir, out_dir, out_format, norm_thresh, amp_thresh, vocals_stem, instrumental_stem, other_stem, drums_stem, bass_stem, guitar_stem, piano_stem, progress=gr.Progress(track_tqdm=True)):
354
+ """Separate audio using Demucs model."""
355
+ stemname = rename_stems(audio, vocals_stem, instrumental_stem, other_stem, drums_stem, bass_stem, guitar_stem, piano_stem, model_key)
356
+ print_message(audio, model_key)
357
+ model = DEMUCS_MODELS[model_key]
358
+ try:
359
+ out_dir = prepare_output_dir(audio, out_dir)
360
+ separator = Separator(
361
+ log_level=logging.WARNING,
362
+ model_file_dir=model_dir,
363
+ output_dir=out_dir,
364
+ output_format=out_format,
365
+ normalization_threshold=norm_thresh,
366
+ amplification_threshold=amp_thresh,
367
+ use_autocast=use_autocast,
368
+ demucs_params={
369
+ "segment_size": seg_size,
370
+ "shifts": shifts,
371
+ "overlap": overlap,
372
+ "segments_enabled": segments_enabled,
373
+ }
374
+ )
375
+
376
+ progress(0.2, desc="Model loaded...")
377
+ separator.load_model(model_filename=model)
378
+
379
+ progress(0.7, desc="Audio separated...")
380
+ separation = separator.separate(audio, stemname)
381
+ print(f"Separation complete!\nResults: {', '.join(separation)}")
382
+
383
+ stems = [os.path.join(out_dir, file_name) for file_name in separation]
384
+
385
+ if model_key == "htdemucs_6s":
386
+ return stems[0], stems[1], stems[2], stems[3], stems[4], stems[5]
387
+ else:
388
+ return stems[0], stems[1], stems[2], stems[3], None, None
389
+ except Exception as e:
390
+ raise RuntimeError(f"Demucs separation failed: {e}") from e
391
+
392
def update_stems(model):
    """Show the extra stem outputs only when the six-stem Demucs model is selected.

    Returns a Gradio update whose ``visible`` flag is True for
    ``"htdemucs_6s"`` and False for every other model name.
    """
    needs_extra_stems = model == "htdemucs_6s"
    return gr.update(visible=needs_extra_stems)
398
+
399
def show_hide_params(param):
    """Make a dependent control visible exactly when its checkbox is ticked.

    ``param`` is the checkbox value; it is passed straight through as the
    ``visible`` flag of the returned Gradio update.
    """
    visibility_update = gr.update(visible=param)
    return visibility_update
402
+
403
# Gradio UI: one tab per separation architecture, plus shared settings,
# a model leaderboard and credits, followed by the event wiring.
#
# NOTE(review): this block was recovered from a flattened diff view in which
# all indentation was lost. Statement order is exact; the nesting of the
# ``with`` containers is reconstructed from that order -- confirm against the
# real file before relying on the precise layout.
with gr.Blocks(
    title="🎵 Audio-Separator by Politrees 🎵",
    css="footer{display:none !important}",
) as app:
    gr.HTML("<h1><center> 🎵 Audio-Separator HF Demo 🎵 </center></h1>")

    with gr.Tab("Roformer"):
        with gr.Group():
            with gr.Row():
                roformer_model = gr.Dropdown(value="MelBand Roformer Kim | Big Beta 5e FT by unwa", label="Select the Model", choices=list(ROFORMER_MODELS.keys()), scale=3)
                roformer_output_format = gr.Dropdown(value="wav", choices=OUTPUT_FORMAT, label="Output Format", info="The format of the output audio file.", scale=1)
            with gr.Accordion("Advanced settings", open=False):
                with gr.Column(variant='panel'):
                    with gr.Group():
                        roformer_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
                        with gr.Row():
                            # Hidden until the override checkbox above is ticked.
                            roformer_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.", visible=False)
                            roformer_overlap = gr.Slider(minimum=2, maximum=10, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Lower is better but slower.")
                            roformer_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
                with gr.Column(variant='panel'):
                    with gr.Group():
                        with gr.Row():
                            roformer_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
                            roformer_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
                            roformer_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
        with gr.Row():
            roformer_audio = gr.Audio(label="Input Audio", type="filepath")
        with gr.Row():
            roformer_button = gr.Button("Separate!", variant="primary")
        with gr.Row():
            roformer_stem1 = gr.Audio(label="Stem 1", type="filepath", interactive=False)
            roformer_stem2 = gr.Audio(label="Stem 2", type="filepath", interactive=False)

    with gr.Tab("MDX23C"):
        with gr.Group():
            with gr.Row():
                mdx23c_model = gr.Dropdown(value="MDX23C-InstVoc HQ", label="Select the Model", choices=list(MDX23C_MODELS.keys()), scale=3)
                mdx23c_output_format = gr.Dropdown(value="wav", choices=OUTPUT_FORMAT, label="Output Format", info="The format of the output audio file.", scale=1)
            with gr.Accordion("Advanced settings", open=False):
                with gr.Column(variant='panel'):
                    with gr.Group():
                        mdx23c_override_seg_size = gr.Checkbox(value=False, label="Override segment size", info="Override model default segment size instead of using the model default value.")
                        with gr.Row():
                            # Hidden until the override checkbox above is ticked.
                            mdx23c_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.", visible=False)
                            mdx23c_overlap = gr.Slider(minimum=2, maximum=50, step=1, value=8, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
                            mdx23c_pitch_shift = gr.Slider(minimum=-24, maximum=24, step=1, value=0, label="Pitch shift", info="Shift audio pitch by a number of semitones while processing. may improve output for deep/high vocals.")
                with gr.Column(variant='panel'):
                    with gr.Group():
                        with gr.Row():
                            mdx23c_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
                            mdx23c_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
                            mdx23c_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
        with gr.Row():
            mdx23c_audio = gr.Audio(label="Input Audio", type="filepath")
        with gr.Row():
            mdx23c_button = gr.Button("Separate!", variant="primary")
        with gr.Row():
            mdx23c_stem1 = gr.Audio(label="Stem 1", type="filepath", interactive=False)
            mdx23c_stem2 = gr.Audio(label="Stem 2", type="filepath", interactive=False)

    with gr.Tab("MDX-NET"):
        with gr.Group():
            with gr.Row():
                mdx_model = gr.Dropdown(value="UVR-MDX-NET Inst HQ 5", label="Select the Model", choices=list(MDXNET_MODELS.keys()), scale=3)
                mdx_output_format = gr.Dropdown(value="wav", choices=OUTPUT_FORMAT, label="Output Format", info="The format of the output audio file.", scale=1)
            with gr.Accordion("Advanced settings", open=False):
                with gr.Column(variant='panel'):
                    with gr.Group():
                        mdx_denoise = gr.Checkbox(value=False, label="Denoise", info="Enable denoising after separation.")
                        with gr.Row():
                            mdx_hop_length = gr.Slider(minimum=32, maximum=2048, step=32, value=1024, label="Hop Length", info="Usually called stride in neural networks; only change if you know what you're doing.")
                            mdx_seg_size = gr.Slider(minimum=32, maximum=4000, step=32, value=256, label="Segment Size", info="Larger consumes more resources, but may give better results.")
                            mdx_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Amount of overlap between prediction windows. Higher is better but slower.")
                with gr.Column(variant='panel'):
                    with gr.Group():
                        with gr.Row():
                            mdx_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
                            mdx_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
                            mdx_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
        with gr.Row():
            mdx_audio = gr.Audio(label="Input Audio", type="filepath")
        with gr.Row():
            mdx_button = gr.Button("Separate!", variant="primary")
        with gr.Row():
            mdx_stem1 = gr.Audio(label="Stem 1", type="filepath", interactive=False)
            mdx_stem2 = gr.Audio(label="Stem 2", type="filepath", interactive=False)

    with gr.Tab("VR ARCH"):
        with gr.Group():
            with gr.Row():
                vr_model = gr.Dropdown(value="1_HP-UVR", label="Select the Model", choices=list(VR_ARCH_MODELS.keys()), scale=3)
                vr_output_format = gr.Dropdown(value="wav", choices=OUTPUT_FORMAT, label="Output Format", info="The format of the output audio file.", scale=1)
            with gr.Accordion("Advanced settings", open=False):
                with gr.Column(variant='panel'):
                    with gr.Group():
                        with gr.Row():
                            vr_post_process = gr.Checkbox(value=False, label="Post Process", info="Identify leftover artifacts within vocal output; may improve separation for some songs.")
                            vr_tta = gr.Checkbox(value=False, label="TTA", info="Enable Test-Time-Augmentation; slow but improves quality.")
                            vr_high_end_process = gr.Checkbox(value=False, label="High End Process", info="Mirror the missing frequency range of the output.")
                        with gr.Row():
                            # Hidden until the "Post Process" checkbox is ticked.
                            vr_post_process_threshold = gr.Slider(minimum=0.1, maximum=0.3, step=0.1, value=0.2, label="Post Process Threshold", info="Threshold for post-processing.", visible=False)
                            vr_window_size = gr.Slider(minimum=320, maximum=1024, step=32, value=512, label="Window Size", info="Balance quality and speed. 1024 = fast but lower, 320 = slower but better quality.")
                            # Label spelling corrected from "Agression".
                            vr_aggression = gr.Slider(minimum=1, maximum=100, step=1, value=5, label="Aggression", info="Intensity of primary stem extraction.")
                with gr.Column(variant='panel'):
                    with gr.Group():
                        with gr.Row():
                            vr_batch_size = gr.Slider(minimum=1, maximum=16, step=1, value=1, label="Batch Size", info="Larger consumes more RAM but may process slightly faster.")
                            vr_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
                            vr_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
        with gr.Row():
            vr_audio = gr.Audio(label="Input Audio", type="filepath")
        with gr.Row():
            vr_button = gr.Button("Separate!", variant="primary")
        with gr.Row():
            vr_stem1 = gr.Audio(label="Stem 1", type="filepath", interactive=False)
            vr_stem2 = gr.Audio(label="Stem 2", type="filepath", interactive=False)

    with gr.Tab("Demucs"):
        with gr.Group():
            with gr.Row():
                demucs_model = gr.Dropdown(value="htdemucs_ft", label="Select the Model", choices=list(DEMUCS_MODELS.keys()), scale=3)
                demucs_output_format = gr.Dropdown(value="wav", choices=OUTPUT_FORMAT, label="Output Format", info="The format of the output audio file.", scale=1)
            with gr.Accordion("Advanced settings", open=False):
                with gr.Column(variant='panel'):
                    with gr.Group():
                        demucs_segments_enabled = gr.Checkbox(value=True, label="Segment-wise processing", info="Enable segment-wise processing.")
                        with gr.Row():
                            demucs_seg_size = gr.Slider(minimum=1, maximum=100, step=1, value=40, label="Segment Size", info="Size of segments into which the audio is split. Higher = slower but better quality.")
                            demucs_overlap = gr.Slider(minimum=0.001, maximum=0.999, step=0.001, value=0.25, label="Overlap", info="Overlap between prediction windows. Higher = slower but better quality.")
                            demucs_shifts = gr.Slider(minimum=0, maximum=20, step=1, value=2, label="Shifts", info="Number of predictions with random shifts, higher = slower but better quality.")
                with gr.Column(variant='panel'):
                    with gr.Group():
                        with gr.Row():
                            demucs_norm_threshold = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.9, label="Normalization threshold", info="The threshold for audio normalization.")
                            demucs_amp_threshold = gr.Slider(minimum=0.0, maximum=1, step=0.1, value=0.0, label="Amplification threshold", info="The threshold for audio amplification.")
        with gr.Row():
            demucs_audio = gr.Audio(label="Input Audio", type="filepath")
        with gr.Row():
            demucs_button = gr.Button("Separate!", variant="primary")
        with gr.Row():
            demucs_stem1 = gr.Audio(label="Stem 1", type="filepath", interactive=False)
            demucs_stem2 = gr.Audio(label="Stem 2", type="filepath", interactive=False)
        with gr.Row():
            demucs_stem3 = gr.Audio(label="Stem 3", type="filepath", interactive=False)
            demucs_stem4 = gr.Audio(label="Stem 4", type="filepath", interactive=False)
        # Extra row for stems 5 and 6; toggled by update_stems when the model changes.
        with gr.Row(visible=False) as stem6:
            demucs_stem5 = gr.Audio(label="Stem 5", type="filepath", interactive=False)
            demucs_stem6 = gr.Audio(label="Stem 6", type="filepath", interactive=False)

    with gr.Tab("Settings"):
        with gr.Group():
            with gr.Row():
                model_file_dir = gr.Textbox(value="/tmp/audio-separator-models/", label="Directory to cache model files", info="The directory where model files are stored.", placeholder="/tmp/audio-separator-models/")
                output_dir = gr.Textbox(value="output", label="File output directory", info="The directory where output files will be saved.", placeholder="output")

        with gr.Accordion("Rename Stems", open=False):
            gr.Markdown(
                """
                Keys for automatic determination of input file names, stems, and models to simplify the construction of output file names.

                Keys:
                * **NAME** - Input File Name
                * **STEM** - Stem Name (e.g., Vocals, Instrumental)
                * **MODEL** - Model Name (e.g., BS-Roformer-Viperx-1297)

                > Example:
                > * **Usage:** NAME_(STEM)_MODEL
                > * **Output File Name:** Music_(Vocals)_BS-Roformer-Viperx-1297
                """
            )
            with gr.Row():
                vocals_stem = gr.Textbox(value="NAME_(STEM)_MODEL", label="Vocals Stem", info="Output example: Music_(Vocals)_BS-Roformer-Viperx-1297", placeholder="NAME_(STEM)_MODEL")
                instrumental_stem = gr.Textbox(value="NAME_(STEM)_MODEL", label="Instrumental Stem", info="Output example: Music_(Instrumental)_BS-Roformer-Viperx-1297", placeholder="NAME_(STEM)_MODEL")
                other_stem = gr.Textbox(value="NAME_(STEM)_MODEL", label="Other Stem", info="Output example: Music_(Other)_BS-Roformer-Viperx-1297", placeholder="NAME_(STEM)_MODEL")
            with gr.Row():
                drums_stem = gr.Textbox(value="NAME_(STEM)_MODEL", label="Drums Stem", info="Output example: Music_(Drums)_BS-Roformer-Viperx-1297", placeholder="NAME_(STEM)_MODEL")
                bass_stem = gr.Textbox(value="NAME_(STEM)_MODEL", label="Bass Stem", info="Output example: Music_(Bass)_BS-Roformer-Viperx-1297", placeholder="NAME_(STEM)_MODEL")
            with gr.Row():
                guitar_stem = gr.Textbox(value="NAME_(STEM)_MODEL", label="Guitar Stem", info="Output example: Music_(Guitar)_BS-Roformer-Viperx-1297", placeholder="NAME_(STEM)_MODEL")
                piano_stem = gr.Textbox(value="NAME_(STEM)_MODEL", label="Piano Stem", info="Output example: Music_(Piano)_BS-Roformer-Viperx-1297", placeholder="NAME_(STEM)_MODEL")

    with gr.Tab("Leaderboard"):
        with gr.Group():
            with gr.Row(equal_height=True):
                list_filter = gr.Dropdown(value="vocals", choices=["vocals", "instrumental", "drums", "bass", "guitar", "piano", "other"], label="List filter", info="Filter and sort the model list by 'stem'")
                list_limit = gr.Slider(minimum=1, maximum=10, step=1, value=5, label="List limit", info="Limit the number of models shown.")
                list_button = gr.Button("Show list", variant="primary")

        output_list = gr.HTML(label="Leaderboard")

    with gr.Tab("Credits"):
        gr.Markdown(
            """
            This Space created by **[Politrees](https://github.com/Bebra777228) forked by [NeoFr](https://github.com/TheNeodev)**.
            * python-audio-separator by **[beveradb](https://github.com/beveradb)**.
            * Thanks to **[Hev832](https://huggingface.co/Hev832)** for the help with the code.
            """
        )

    # --- Event wiring -------------------------------------------------------

    # Checkboxes that reveal a dependent slider.
    roformer_override_seg_size.change(show_hide_params, inputs=[roformer_override_seg_size], outputs=[roformer_seg_size])
    mdx23c_override_seg_size.change(show_hide_params, inputs=[mdx23c_override_seg_size], outputs=[mdx23c_seg_size])
    vr_post_process.change(show_hide_params, inputs=[vr_post_process], outputs=[vr_post_process_threshold])

    demucs_model.change(update_stems, inputs=[demucs_model], outputs=stem6)

    list_button.click(leaderboard, inputs=[list_filter, list_limit], outputs=output_list)

    # The seven stem-name templates are the trailing inputs of every separator
    # handler, always in this order; build the list once and reuse it.
    stem_name_inputs = [
        vocals_stem,
        instrumental_stem,
        other_stem,
        drums_stem,
        bass_stem,
        guitar_stem,
        piano_stem,
    ]

    # Input order below must match each handler's positional parameters.
    roformer_button.click(
        roformer_separator,
        inputs=[
            roformer_audio,
            roformer_model,
            roformer_seg_size,
            roformer_override_seg_size,
            roformer_overlap,
            roformer_pitch_shift,
            model_file_dir,
            output_dir,
            roformer_output_format,
            roformer_norm_threshold,
            roformer_amp_threshold,
            roformer_batch_size,
            *stem_name_inputs,
        ],
        outputs=[
            roformer_stem1,
            roformer_stem2,
        ], concurrency_limit=1,
    )
    mdx23c_button.click(
        mdx23c_separator,
        inputs=[
            mdx23c_audio,
            mdx23c_model,
            mdx23c_seg_size,
            mdx23c_override_seg_size,
            mdx23c_overlap,
            mdx23c_pitch_shift,
            model_file_dir,
            output_dir,
            mdx23c_output_format,
            mdx23c_norm_threshold,
            mdx23c_amp_threshold,
            mdx23c_batch_size,
            *stem_name_inputs,
        ],
        outputs=[
            mdx23c_stem1,
            mdx23c_stem2,
        ], concurrency_limit=1,
    )
    mdx_button.click(
        mdx_separator,
        inputs=[
            mdx_audio,
            mdx_model,
            mdx_hop_length,
            mdx_seg_size,
            mdx_overlap,
            mdx_denoise,
            model_file_dir,
            output_dir,
            mdx_output_format,
            mdx_norm_threshold,
            mdx_amp_threshold,
            mdx_batch_size,
            *stem_name_inputs,
        ],
        outputs=[
            mdx_stem1,
            mdx_stem2,
        ], concurrency_limit=1,
    )
    vr_button.click(
        vr_separator,
        inputs=[
            vr_audio,
            vr_model,
            vr_window_size,
            vr_aggression,
            vr_tta,
            vr_post_process,
            vr_post_process_threshold,
            vr_high_end_process,
            model_file_dir,
            output_dir,
            vr_output_format,
            vr_norm_threshold,
            vr_amp_threshold,
            vr_batch_size,
            *stem_name_inputs,
        ],
        outputs=[
            vr_stem1,
            vr_stem2,
        ], concurrency_limit=1,
    )
    demucs_button.click(
        demucs_separator,
        inputs=[
            demucs_audio,
            demucs_model,
            demucs_seg_size,
            demucs_shifts,
            demucs_overlap,
            demucs_segments_enabled,
            model_file_dir,
            output_dir,
            demucs_output_format,
            demucs_norm_threshold,
            demucs_amp_threshold,
            *stem_name_inputs,
        ],
        outputs=[
            demucs_stem1,
            demucs_stem2,
            demucs_stem3,
            demucs_stem4,
            demucs_stem5,
            demucs_stem6,
        ], concurrency_limit=1,
    )
755
+
756
def main():
    """Enable request queuing on the app and launch it with a share link in debug mode."""
    queued_app = app.queue()
    queued_app.launch(share=True, debug=True)


# Launch the UI only when this file is run as a script.
if __name__ == "__main__":
    main()