# Voice / app.py
# Staticaliza's picture
# Update app.py
# 36a226d verified
# raw
# history blame
# 2.83 kB
import spaces
from kokoro import KModel, KPipeline
import gradio as gr
# Upper bound on how many characters of the (stripped) input text are
# synthesized; the generate_* functions slice the text to this length.
CHAR_LIMIT = 5000

# One KPipeline per language code. Voice ids start with their language code,
# so callers pick a pipeline with PIPELINES[voice[0]].
# NOTE(review): model=False presumably keeps each pipeline from loading its
# own model (synthesis goes through the module-level MODEL) — confirm against
# the kokoro documentation.
PIPELINES = {lang: KPipeline(lang_code=lang, model=False) for lang in "ab"}

# Pin the phonemization of the word "kokoro" in each pipeline's lexicon.
# These are phoneme strings, so they must be the real phonetic characters
# (U+02C8, U+0259, U+0279), not a mis-decoded byte sequence.
PIPELINES["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
PIPELINES["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"
# Dropdown label -> voice id.
# The first character of every voice id ("a" or "b") is the key of the
# pipeline in PIPELINES that serves it. Labels are user-facing text and must
# be stored as real Unicode (flag / gender-sign emoji), not mis-decoded bytes.
CHOICES = {
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}
# Load every listed voice once at import time. The return value is discarded,
# so the call is made for its side effect.
# NOTE(review): presumably this primes the pipeline's voice cache so later
# load_voice() calls in the generate_* functions are cheap — confirm.
for voice_id in CHOICES.values():
    lang_code = voice_id[0]  # first character of the id is the PIPELINES key
    PIPELINES[lang_code].load_voice(voice_id)

# Single module-level model instance, switched to eval mode; all generate_*
# functions call it directly.
MODEL = KModel().eval()
def generate_first(text, voice="af_heart", speed=1):
    """Synthesize only the first segment the pipeline produces for *text*.

    Args:
        text: Input text; it is stripped and cut to ``CHAR_LIMIT`` characters.
        voice: Voice id; its first character selects the pipeline.
        speed: Speed value forwarded to both the pipeline and the model.

    Returns:
        ``(24000, samples)`` where ``samples`` is the model output converted
        with ``.numpy()``, or ``None`` when the pipeline yields nothing.
        NOTE(review): 24000 is presumably the sample rate in Hz, matching the
        ``(sample_rate, data)`` tuple accepted by ``gr.Audio`` — confirm.
    """
    clipped = text.strip()[:CHAR_LIMIT]
    pipe = PIPELINES[voice[0]]
    voice_pack = pipe.load_voice(voice)

    # Pull a single item from the pipeline; any later segments are ignored.
    first_segment = next(iter(pipe(clipped, voice, speed)), None)
    if first_segment is None:
        return None

    _, phonemes, _ = first_segment
    # The voice pack is indexed by phoneme-string length (zero-based).
    style_ref = voice_pack[len(phonemes) - 1]
    samples = MODEL(phonemes, style_ref, speed)
    return (24000, samples.numpy())
def predict(text, voice="af_heart", speed=1):
    """Thin alias: forward all arguments to ``generate_first`` and return its result."""
    first_chunk = generate_first(text, voice, speed)
    return first_chunk
def generate_all(text, voice="af_heart", speed=1):
    """Yield one synthesized chunk per segment the pipeline produces for *text*.

    Args:
        text: Input text; it is stripped and cut to ``CHAR_LIMIT`` characters.
        voice: Voice id; its first character selects the pipeline.
        speed: Speed value forwarded to both the pipeline and the model.

    Yields:
        ``(24000, samples)`` tuples, ``samples`` being the model output
        converted with ``.numpy()``. Nothing is yielded when the pipeline
        produces no segments.
    """
    clipped = text.strip()[:CHAR_LIMIT]
    pipe = PIPELINES[voice[0]]
    voice_pack = pipe.load_voice(voice)

    for segment in pipe(clipped, voice, speed):
        _, phonemes, _ = segment
        # The voice pack is indexed by phoneme-string length (zero-based).
        style_ref = voice_pack[len(phonemes) - 1]
        samples = MODEL(phonemes, style_ref, speed)
        yield 24000, samples.numpy()
@spaces.GPU()
def gpu():
    """Do nothing and return ``None``.

    NOTE(review): nothing in this file calls it; presumably it exists only so
    the Space has a ``spaces.GPU``-decorated function registered — confirm.
    """
    return None
# UI definition: three inputs, one audio output, one button.
# NOTE(review): the nesting below is reconstructed — confirm which components
# are meant to sit inside the Row.
with gr.Blocks() as app:
    with gr.Row():
        text_input = gr.Textbox(label="input text")
        # Choices are (label, voice id) pairs; the component's value is the id.
        voice_input = gr.Dropdown(
            choices=list(CHOICES.items()),
            value="af_heart",
            label="voice",
        )
        speed_input = gr.Slider(
            minimum=0.5,
            maximum=2,
            value=1,
            step=0.1,
            label="speed",
        )
    out_audio = gr.Audio(
        label="output audio",
        interactive=False,
        autoplay=True,
    )
    gen_btn = gr.Button(value="generate")
    # Only the first synthesized segment is sent to the audio player.
    gen_btn.click(
        fn=generate_first,
        inputs=[text_input, voice_input, speed_input],
        outputs=out_audio,
    )

if __name__ == "__main__":
    app.launch()