ClearVoice-SR

Running on Zero

File size: 2,669 Bytes

02c7bdf
bdaf47a
02c7bdf
7b02833
02c7bdf
a1655f3
ed2aa07
02c7bdf
1311e01
341eb54
ed2aa07
341eb54
 
 
ed2aa07
341eb54
ed2aa07
 
4c3cc25
6e4d760
 
 
 
 
75a5cbb
 
341eb54
6e4d760
3192961
 
6e4d760
 
b78b7d0
e805751
7c1bd00
b78b7d0
 
e805751
b78b7d0
9f61737
6e4d760
8310825
43e8301
 
b78b7d0
7c1bd00
 
 
 
b78b7d0
 
 
 
99710ec
863f583
49effbd
0f8dddd

import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice
import os
import random

@spaces.GPU
def fn_clearvoice_sr(input_wav, apply_se):
    """Run speech super resolution on an audio file, optionally denoising first.

    Args:
        input_wav: Path of the audio file to process.
        apply_se: When truthy, the ``MossFormer2_SE_48K`` speech-enhancement
            model is run first and its output is fed to the
            ``MossFormer2_SR_48K`` super-resolution model.

    Returns:
        Path of a newly created WAV file, written at 48 kHz, holding the
        super-resolved audio.

    Raises:
        gr.Error: If no input audio path was supplied.
    """
    # Imported here so the function does not depend on edits to the
    # file-level import block.
    import tempfile

    if not input_wav:
        # Without this guard a missing input surfaces as an opaque
        # AttributeError from the path handling below.
        raise gr.Error("Please upload or select an input audio file.")

    sample_rate = 48000
    sr_model = ClearVoice(task='speech_super_resolution', model_names=['MossFormer2_SR_48K'])

    # All intermediate artefacts live in a private scratch directory: it is
    # unique per call (no name collisions between requests) and is removed
    # when the block exits, whatever ClearVoice wrote inside it.
    with tempfile.TemporaryDirectory() as work_dir:
        sr_input = input_wav
        if apply_se:
            # splitext/basename handle any extension and any path separator,
            # unlike replacing the literal '.wav' substring.
            stem = os.path.splitext(os.path.basename(input_wav))[0]
            sr_input = os.path.join(work_dir, f'{stem}_enhanced.wav')
            se_model = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
            # online_write=True: the enhanced audio is written to disk
            # rather than returned, so the return value is not used.
            se_model(input_path=input_wav, online_write=True, output_path=sr_input)

        # The result must be fully in memory before the scratch directory
        # (which may hold the model's input) is deleted.
        sr_result = sr_model(input_path=sr_input, online_write=False)

    # The call may hand back a dict of results; use its first entry.
    if isinstance(sr_result, dict):
        sr_result = next(iter(sr_result.values()))

    # A unique output file per call, so one request cannot overwrite the
    # result another request is still serving.
    fd, output_path = tempfile.mkstemp(prefix='enhanced_high_res_', suffix='.wav')
    os.close(fd)
    # Only the first row of the result is written.
    # NOTE(review): presumably the array is (channels, samples) - confirm.
    sf.write(output_path, sr_result[0, :], sample_rate)
    return output_path
    
# Outer container that hosts the tabbed layout and is launched at the end.
demo = gr.Blocks()

# Single-function UI: an audio file plus a checkbox in, one audio file out.
sr_demo = gr.Interface(
    fn=fn_clearvoice_sr,
    inputs=[
        # type="filepath" means the callback receives a path, not samples.
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Checkbox(label="Apply Speech Enhancement", value=True),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    # The anchor is closed with </a>; the previous "<a/>" was invalid HTML.
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice</a>: Speech Super Resolution",
    description=("ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and transform low-resolution audio (effective sampling rate ≥ 16 kHz) into crystal-clear, high-resolution audio at 48 kHz. It supports most of audio types. "
                 "To try it, simply upload your audio, or click one of the examples. "),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"),
    # Each example row is [input path, apply_se], matching `inputs` order.
    examples=[
        ["examples/mandarin_speech_16kHz.wav", True],
        ["examples/LJSpeech-001-0001-22k.wav", True],
        ["examples/LibriTTS_986_129388_24k.wav", True],
        ["examples/english_speech_48kHz.wav", True],
    ],
    cache_examples=True,
)

# Mount the interface as the only tab of the outer Blocks app.
with demo:
    gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])

demo.launch()