# Voice / app.py
# (Hugging Face Hub page residue, preserved as comments so the file parses:)
# Staticaliza's picture
# Update app.py
# dfe5a3d verified
# raw
# history blame
# 3.54 kB
# Imports
import gradio as gr
import spaces
import torch
import numpy as np
from kokoro import KModel, KPipeline
# Pre-Initialize
# Resolve the compute device: "auto" selects CUDA when available, else CPU.
_requested_device = "auto"
if _requested_device != "auto":
    DEVICE = _requested_device
else:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[SYSTEM] | Using {DEVICE} type compute device.")
# Cap CPU thread usage so the Space stays within its resource quota.
torch.set_num_threads(4)
# Variables
CHAR_LIMIT = 2000          # maximum characters synthesized per request
DEFAULT_INPUT = ""
DEFAULT_VOICE = "af_heart"

# Display label -> voice-pack id. The id prefix encodes accent + gender:
# first char "a" = American / "b" = British English, second char f/m.
CHOICES = {
    # American female
    "πŸ‡ΊπŸ‡Έ 🚺 Heart ❀️": "af_heart",
    "πŸ‡ΊπŸ‡Έ 🚺 Bella πŸ”₯": "af_bella",
    "πŸ‡ΊπŸ‡Έ 🚺 Nicole 🎧": "af_nicole",
    "πŸ‡ΊπŸ‡Έ 🚺 Aoede": "af_aoede",
    "πŸ‡ΊπŸ‡Έ 🚺 Kore": "af_kore",
    "πŸ‡ΊπŸ‡Έ 🚺 Sarah": "af_sarah",
    "πŸ‡ΊπŸ‡Έ 🚺 Nova": "af_nova",
    "πŸ‡ΊπŸ‡Έ 🚺 Sky": "af_sky",
    "πŸ‡ΊπŸ‡Έ 🚺 Alloy": "af_alloy",
    "πŸ‡ΊπŸ‡Έ 🚺 Jessica": "af_jessica",
    "πŸ‡ΊπŸ‡Έ 🚺 River": "af_river",
    # American male
    "πŸ‡ΊπŸ‡Έ 🚹 Michael": "am_michael",
    "πŸ‡ΊπŸ‡Έ 🚹 Fenrir": "am_fenrir",
    "πŸ‡ΊπŸ‡Έ 🚹 Puck": "am_puck",
    "πŸ‡ΊπŸ‡Έ 🚹 Echo": "am_echo",
    "πŸ‡ΊπŸ‡Έ 🚹 Eric": "am_eric",
    "πŸ‡ΊπŸ‡Έ 🚹 Liam": "am_liam",
    "πŸ‡ΊπŸ‡Έ 🚹 Onyx": "am_onyx",
    "πŸ‡ΊπŸ‡Έ 🚹 Santa": "am_santa",
    "πŸ‡ΊπŸ‡Έ 🚹 Adam": "am_adam",
    # British female
    "πŸ‡¬πŸ‡§ 🚺 Emma": "bf_emma",
    "πŸ‡¬πŸ‡§ 🚺 Isabella": "bf_isabella",
    "πŸ‡¬πŸ‡§ 🚺 Alice": "bf_alice",
    "πŸ‡¬πŸ‡§ 🚺 Lily": "bf_lily",
    # British male
    "πŸ‡¬πŸ‡§ 🚹 George": "bm_george",
    "πŸ‡¬πŸ‡§ 🚹 Fable": "bm_fable",
    "πŸ‡¬πŸ‡§ 🚹 Lewis": "bm_lewis",
    "πŸ‡¬πŸ‡§ 🚹 Daniel": "bm_daniel",
}
# One G2P pipeline per language code: "a" = American, "b" = British English.
# model=False: the pipelines only do text processing; synthesis uses MODEL below.
PIPELINES = {code: KPipeline(lang_code=code, model=False) for code in "ab"}
# Custom gold pronunciations for the word "kokoro" in each accent.
PIPELINES["a"].g2p.lexicon.golds["kokoro"] = "kˈOkΙ™ΙΉO"
PIPELINES["b"].g2p.lexicon.golds["kokoro"] = "kˈQkΙ™ΙΉQ"
# Pre-load every voice pack; voice id prefix ("a"/"b") selects its pipeline.
VOICE_PACKS = {voice: PIPELINES[voice[0]].load_voice(voice) for voice in CHOICES.values()}
# Build the synthesis model on the chosen device in eval mode, then try to
# TorchScript-compile it for faster inference; fall back to the eager model
# when scripting fails (not all models are scriptable).
model_instance = KModel().to(DEVICE).eval()
try:
    MODEL = torch.jit.script(model_instance)
except Exception as script_error:
    print("torch.jit.script failed, using original model:", script_error)
    MODEL = model_instance
# Custom CSS: narrow the app container, center the title, hide the footer.
css = '''
.gradio-container{max-width: 560px !important}
h1{text-align:center}
footer {
visibility: hidden
}
'''
def trim_silence(audio, threshold=0.001):
    """Strip leading and trailing near-silent samples from a 1-D waveform.

    A sample counts as "loud" when its absolute value exceeds ``threshold``.
    Returns the input unchanged when no sample exceeds the threshold.
    """
    loud = np.flatnonzero(np.abs(audio) > threshold)
    if loud.size == 0:
        return audio
    first, last = loud[0], loud[-1]
    return audio[first:last + 1]
# Functions
def generate(text=DEFAULT_INPUT, voice=DEFAULT_VOICE, speed=1):
    """Synthesize `text` with the given voice id and speed.

    Returns a ``(sample_rate, waveform)`` tuple for gr.Audio (24 kHz), or
    None when the pipeline yields no segments.
    """
    # Cap input length and force terminal punctuation so the utterance
    # ends cleanly.
    text = text.strip()[:CHAR_LIMIT] + "."
    pipeline = PIPELINES[voice[0]]  # voice id prefix selects the language pipeline
    pack = VOICE_PACKS[voice]
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps) - 1]
        audio = MODEL(ps, ref_s, speed)
        # FIX: detach and move to CPU first — a CUDA tensor cannot be
        # converted with .numpy() directly; on CPU this is a no-op.
        audio_np = audio.detach().cpu().numpy()
        trimmed_audio = trim_silence(audio_np)
        # Only the first segment is synthesized (original behavior).
        return (24000, trimmed_audio)
    return None
def cloud():
    """Keep-alive hook: log that the Space was pinged for maintenance."""
    print("[CLOUD] | Space maintained.")
@spaces.GPU()
def gpu():
    """No-op whose @spaces.GPU() decorator registers GPU usage with Spaces."""
    return None
# Initialize
# Build the Gradio UI: text/voice/speed inputs, generate + keep-alive buttons,
# and an audio output.
with gr.Blocks(css=css) as main:
    with gr.Column():
        # Renamed from `input` to avoid shadowing the Python builtin.
        text_input = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Input")
        voice_input = gr.Dropdown(list(CHOICES.items()), value=DEFAULT_VOICE, label="Voice")
        speed_input = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
        submit = gr.Button("β–Ά")
        maintain = gr.Button("☁️")
    with gr.Column():
        output = gr.Audio(label="Output")
    submit.click(fn=generate, inputs=[text_input, voice_input, speed_input], outputs=output)
    # queue=False: the keep-alive ping should run immediately, not wait in queue.
    maintain.click(cloud, inputs=[], outputs=[], queue=False)
main.launch(show_api=True)