Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,669 Bytes
02c7bdf bdaf47a 02c7bdf 7b02833 02c7bdf a1655f3 ed2aa07 02c7bdf 1311e01 341eb54 ed2aa07 341eb54 ed2aa07 341eb54 ed2aa07 4c3cc25 6e4d760 75a5cbb 341eb54 6e4d760 3192961 6e4d760 b78b7d0 e805751 7c1bd00 b78b7d0 e805751 b78b7d0 9f61737 6e4d760 8310825 43e8301 b78b7d0 7c1bd00 b78b7d0 99710ec 863f583 49effbd 0f8dddd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice
import os
import random
@spaces.GPU
def fn_clearvoice_sr(input_wav, apply_se):
    """Run speech super resolution on an audio file and return the result path.

    The audio is processed with the ``MossFormer2_SR_48K`` model. When
    ``apply_se`` is true, the file is first run through the
    ``MossFormer2_SE_48K`` speech-enhancement model and the enhanced file is
    used as the super-resolution input.

    Args:
        input_wav: Path of the audio file to process.
        apply_se: Whether to apply speech enhancement before super resolution.

    Returns:
        Path of the WAV file (written at 48 kHz) holding the processed audio.

    Raises:
        gr.Error: If no input audio was supplied.
    """
    # Imported locally so the file-level import block does not need to change.
    import tempfile

    if not input_wav:
        # Without this guard a missing input fails later with an opaque
        # AttributeError instead of a message the user can act on.
        raise gr.Error("Please provide an input audio file.")

    fs = 48000

    # Every request gets its own scratch directory so that concurrent or
    # repeated calls can never overwrite each other's intermediate/output files.
    work_dir = tempfile.mkdtemp(prefix='clearvoice_sr_')
    enhanced_wav = None

    myClearVoice = ClearVoice(task='speech_super_resolution', model_names=['MossFormer2_SR_48K'])
    try:
        if apply_se:
            # Build the intermediate name from the input's stem and always end
            # it in ".wav", whatever extension the uploaded file had.
            stem = os.path.splitext(os.path.basename(input_wav))[0]
            enhanced_wav = os.path.join(work_dir, stem + '_se.wav')
            myClearVoice_se = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
            myClearVoice_se(input_path=input_wav, online_write=True, output_path=enhanced_wav)
            input_wav = enhanced_wav

        output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    finally:
        # The enhanced file is only an intermediate; remove it once super
        # resolution has consumed it (or failed) so it does not pile up on disk.
        if enhanced_wav is not None and os.path.isfile(enhanced_wav):
            os.remove(enhanced_wav)

    # The call may hand back either a mapping of results or the result itself;
    # in the mapping case the first entry is used.
    if isinstance(output_wav_dict, dict):
        output_wav = output_wav_dict[next(iter(output_wav_dict))]
    else:
        output_wav = output_wav_dict

    # NOTE(review): indexing assumes a 2-D array whose first row is the
    # waveform to save -- confirm against the ClearVoice return format.
    output_path = os.path.join(work_dir, 'enhanced_high_res.wav')
    sf.write(output_path, output_wav[0, :], fs)
    return output_path
# Top-level Blocks container that hosts the tabbed interface.
demo = gr.Blocks()

# Interface wiring fn_clearvoice_sr to an audio-file input, an enhancement
# checkbox and an audio-file output.
sr_demo = gr.Interface(
    fn=fn_clearvoice_sr,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Checkbox(label="Apply Speech Enhancement", value=True),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    # The anchor is closed with a proper </a>; the previous "<a/>" left the
    # link element unclosed in the rendered title.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Super Resolution",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and transform low-resolution audio (effective sampling rate ≥ 16 kHz) into crystal-clear, high-resolution audio at 48 kHz. It supports most of audio types. "
                 "To try it, simply upload your audio, or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
    # Each example row is [input audio path, apply speech enhancement].
    examples=[
        ["examples/mandarin_speech_16kHz.wav", True],
        ["examples/LJSpeech-001-0001-22k.wav", True],
        ["examples/LibriTTS_986_129388_24k.wav", True],
        ["examples/english_speech_48kHz.wav", True],
    ],
    # Ask Gradio to cache the outputs for the examples above.
    cache_examples=True,
)
# Place the interface inside the Blocks container as a single named tab.
with demo:
    gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])

# Start the Gradio app.
demo.launch()