"""Gradio interface for a DelightfulUnivnet checkpoint with optional VoiceFixer restoration."""

from pathlib import Path
import tempfile

from gradio import Checkbox, Dropdown, Interface, Textbox
import soundfile as sf
import torch
from voicefixer import VoiceFixer

from models.delightful_univnet import DelightfulUnivnet
from training.datasets.hifi_libri_dataset import speakers_hifi_ids

from .config import speakers_delightful_22050

# Checkpoint handed to DelightfulUnivnet below.
# NOTE(review): the path is relative — presumably resolved against the
# process working directory; confirm for the deployment environment.
delightful_checkpoint_path = "epoch=5816-step=390418.ckpt"

# Inference is pinned to the CPU; the VoiceFixer call below also passes cuda=False.
device = torch.device("cpu")

delightfulunivnet_22050 = DelightfulUnivnet(
    delightful_checkpoint_path=delightful_checkpoint_path,
).to(device)

voicefixer = VoiceFixer()


def _restore_with_voicefixer(wav, sampling_rate):
    """Round-trip *wav* through VoiceFixer via temporary wav files.

    VoiceFixer's ``restore`` works on file paths, so the waveform is written
    to disk, restored into a second file, and read back.

    Args:
        wav: Waveform to restore, in a form accepted by ``soundfile.write``.
        sampling_rate: Sampling rate used when writing ``wav`` to disk.

    Returns:
        A ``(sampling_rate, waveform)`` tuple as read back from the restored
        file by ``soundfile.read``.
    """
    # A temporary directory with plain paths is used instead of open
    # NamedTemporaryFile handles: reopening such a file by name while it is
    # still open is platform-dependent and fails on Windows.
    with tempfile.TemporaryDirectory() as tmp_dir:
        input_path = str(Path(tmp_dir) / "input.wav")
        output_path = str(Path(tmp_dir) / "output.wav")

        sf.write(input_path, wav, sampling_rate)

        voicefixer.restore(
            input=input_path,  # low quality .wav/.flac file
            output=output_path,  # save file path
            cuda=False,  # GPU acceleration off
            mode=0,
        )

        # Read the data fully before the directory (and both files) is removed.
        restored_wav, restored_rate = sf.read(output_path)

    return restored_rate, restored_wav


def generate_audio(text: str, speaker_name: str, fix_voice: bool):
    """Synthesize audio for *text* with the selected speaker.

    Args:
        text: Text passed to the model's ``forward``.
        speaker_name: Key into ``speakers_delightful_22050``; the mapped value
            is wrapped in a one-element tensor and passed as the speaker.
        fix_voice: When true, the generated waveform is additionally passed
            through VoiceFixer and the restored audio is returned instead.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, the format consumed by the
        ``"audio"`` output of the interface below.

    Raises:
        KeyError: If ``speaker_name`` is not in ``speakers_delightful_22050``.
    """
    speaker = torch.tensor(
        [speakers_delightful_22050[speaker_name]],
        device=device,
    )

    with torch.no_grad():
        wav = delightfulunivnet_22050.forward(text, speaker)
        # No .detach() needed: nothing computed under no_grad tracks gradients.
        wav = wav.squeeze().cpu().numpy()

    if fix_voice:
        return _restore_with_voicefixer(wav, delightfulunivnet_22050.sampling_rate)

    return delightfulunivnet_22050.sampling_rate, wav


interfaceDelightfulUnuvnet22050 = Interface(
    generate_audio,
    [
        Textbox(
            label="Text",
            value="As the snake shook its head, a deafening shout behind Harry made both of them jump.",
        ),
        Dropdown(
            label="Speaker",
            choices=list(speakers_delightful_22050.keys()),
            # NOTE(review): the default comes from speakers_hifi_ids while the
            # choices come from speakers_delightful_22050 — confirm the first
            # hifi entry is always one of the offered choices.
            value=speakers_hifi_ids[0],
        ),
        Checkbox(
            label="Fix voice (Voicefixer)",
            value=False,
        ),
    ],
    outputs="audio",
    title=(
        f"Delightful UnivNet, Sampling Rate: {delightfulunivnet_22050.sampling_rate}. "
        "When Voicefixer is enabled, the Sampling Rate is 44100."
    ),
)