Spaces:
Sleeping
Sleeping
File size: 2,404 Bytes
bbd9e13 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import tempfile
from gradio import Checkbox, Dropdown, Interface, Textbox
import soundfile as sf
import torch
from voicefixer import VoiceFixer
from models.delightful_univnet import DelightfulUnivnet
from training.datasets.hifi_libri_dataset import speakers_hifi_ids
from .config import speakers_delightful_22050
# Checkpoint file handed to the DelightfulUnivnet constructor below.
delightful_checkpoint_path = "epoch=5816-step=390418.ckpt"

# Device the model and the speaker tensors are placed on.
device = torch.device("cpu")

# Module-level model instance shared by every call to generate_audio.
delightfulunivnet_22050 = DelightfulUnivnet(delightful_checkpoint_path=delightful_checkpoint_path)
delightfulunivnet_22050 = delightfulunivnet_22050.to(device)

# Module-level VoiceFixer instance used for the optional restoration step.
voicefixer = VoiceFixer()
def generate_audio(text: str, speaker_name: str, fix_voice: bool):
    """Synthesize speech for ``text`` and optionally restore it with VoiceFixer.

    Args:
        text: Text to synthesize.
        speaker_name: Key into ``speakers_delightful_22050`` selecting the
            speaker id passed to the model.
        fix_voice: When true, the synthesized audio is written to disk, run
            through ``voicefixer.restore`` and read back before returning.

    Returns:
        A ``(sampling_rate, waveform)`` tuple. Without ``fix_voice`` the rate
        is ``delightfulunivnet_22050.sampling_rate`` and the waveform is the
        squeezed model output as a NumPy array. With ``fix_voice`` both values
        are whatever ``soundfile.read`` returns for the restored file.

    Raises:
        KeyError: If ``speaker_name`` is not in ``speakers_delightful_22050``.
    """
    import os

    speaker = torch.tensor(
        [speakers_delightful_22050[speaker_name]],
        device=device,
    )

    with torch.no_grad():
        wav = delightfulunivnet_22050.forward(text, speaker)
        wav = wav.squeeze().detach().cpu().numpy()

    if not fix_voice:
        return delightfulunivnet_22050.sampling_rate, wav

    # VoiceFixer works on file paths, so the audio is round-tripped through
    # disk. A temporary directory with plain paths is used instead of open
    # NamedTemporaryFile handles: reopening such a file by name while it is
    # still open is not portable (it fails on Windows).
    with tempfile.TemporaryDirectory() as tmp_dir:
        input_path = os.path.join(tmp_dir, "input.wav")
        output_path = os.path.join(tmp_dir, "output.wav")

        # Write the synthesized audio where VoiceFixer can pick it up.
        sf.write(input_path, wav, delightfulunivnet_22050.sampling_rate)

        voicefixer.restore(
            input=input_path,  # low quality .wav/.flac file
            output=output_path,  # save file path
            cuda=False,  # GPU acceleration off
            mode=0,
        )

        # Load the result into memory before the directory is removed.
        wav_vf, sampling_rate = sf.read(output_path)

    return sampling_rate, wav_vf
# Gradio interface wiring generate_audio to a text box, a speaker dropdown
# and a checkbox toggling the VoiceFixer step; the output is an audio widget.
interfaceDelightfulUnuvnet22050 = Interface(
    generate_audio,
    [
        Textbox(
            label="Text",
            value="As the snake shook its head, a deafening shout behind Harry made both of them jump.",
        ),
        Dropdown(
            label="Speaker",
            choices=list(speakers_delightful_22050.keys()),
            # NOTE(review): the default comes from speakers_hifi_ids while the
            # choices come from speakers_delightful_22050 -- confirm that the
            # first hifi id is actually one of the offered choices.
            value=speakers_hifi_ids[0],
        ),
        Checkbox(
            label="Fix voice (Voicefixer)",
            value=False,
        ),
    ],
    outputs="audio",
    title=f"Delightful UnivNet, Sampling Rate: {delightfulunivnet_22050.sampling_rate}. When Voicefixer is enabled, the Sampling Rate is 44100.",
)
|