Spaces:

Staticaliza
/

Voice

Running on Zero

File size: 2,826 Bytes

944743c
b3480ec
 
944743c
33d5f3b
b3480ec
33d5f3b
 
 
b3480ec
36a226d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33d5f3b
 
 
b3480ec
 
33d5f3b
 
36a226d
 
b3480ec
33d5f3b
36a226d
b3480ec
33d5f3b
 
 
b3480ec
 
33d5f3b
 
b3480ec
 
33d5f3b
 
 
b3480ec
 
 
33d5f3b
36a226d
33d5f3b
 
 
36a226d
33d5f3b

import spaces
from kokoro import KModel, KPipeline
import gradio as gr

# Upper bound on how many characters of (stripped) input text are synthesized.
CHAR_LIMIT = 5000

# One KPipeline per language code, keyed by that code. Voice identifiers in
# this file start with the same letter, so `voice[0]` selects the pipeline.
# NOTE(review): model=False presumably stops each pipeline from loading its
# own model because a single shared MODEL is used instead — confirm in kokoro.
PIPELINES = {
    "a": KPipeline(lang_code="a", model=False),
    "b": KPipeline(lang_code="b", model=False),
}

# Register a per-language phoneme string for the word "kokoro" in each
# pipeline's G2P lexicon.
PIPELINES["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
PIPELINES["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"

# Maps the label shown in the voice dropdown to the voice identifier passed to
# the pipeline. The first character of each identifier ("a" or "b") is the key
# used to look up the matching entry in PIPELINES; the second character
# ("f" or "m") lines up with the 🚺 / 🚹 marker in the label.
CHOICES = {
    # "af_*" voices (🇺🇸 🚺 labels).
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    # "am_*" voices (🇺🇸 🚹 labels).
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    # "bf_*" voices (🇬🇧 🚺 labels).
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    # "bm_*" voices (🇬🇧 🚹 labels).
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}

# Call load_voice once per configured voice at import time. The return value
# is discarded, so this runs purely for its side effect.
# NOTE(review): presumably this downloads/caches each voice pack so the first
# request is not slowed down — confirm against kokoro's KPipeline.load_voice.
for voice_id in CHOICES.values():
    lang_code = voice_id[0]
    PIPELINES[lang_code].load_voice(voice_id)

# Single shared model instance, switched to eval mode; used by every
# generate_* function below.
MODEL = KModel().eval()

def generate_first(text, voice="af_heart", speed=1):
    """Synthesize only the first segment of *text*.

    Args:
        text: Input string. Surrounding whitespace is stripped and the
            result is cut to CHAR_LIMIT characters before processing.
        voice: Voice identifier; its first character selects the pipeline
            from PIPELINES.
        speed: Speed value forwarded to both the pipeline and MODEL.

    Returns:
        A ``(24000, samples)`` tuple for the first segment the pipeline
        yields, where ``samples`` is ``audio.numpy()``; ``None`` if the
        pipeline yields nothing. Any later segments are never synthesized.
    """
    trimmed = text.strip()[:CHAR_LIMIT]
    lang_pipeline = PIPELINES[voice[0]]
    voice_pack = lang_pipeline.load_voice(voice)

    # Pull at most one item from the pipeline; a private sentinel marks
    # "nothing was produced" without confusing it with a real item.
    nothing = object()
    first_segment = next(iter(lang_pipeline(trimmed, voice, speed)), nothing)
    if first_segment is nothing:
        return None

    _, phonemes, _ = first_segment
    # The voice pack is indexed by the segment's phoneme count minus one.
    style_ref = voice_pack[len(phonemes) - 1]
    waveform = MODEL(phonemes, style_ref, speed)
    # 24000 is the first element of the pair handed to the audio output.
    return (24000, waveform.numpy())

def predict(text, voice="af_heart", speed=1):
    """Alias for generate_first: forward all arguments and return its result."""
    result = generate_first(text=text, voice=voice, speed=speed)
    return result

def generate_all(text, voice="af_heart", speed=1):
    """Synthesize *text* segment by segment, yielding audio as it is ready.

    Args:
        text: Input string. Surrounding whitespace is stripped and the
            result is cut to CHAR_LIMIT characters before processing.
        voice: Voice identifier; its first character selects the pipeline
            from PIPELINES.
        speed: Speed value forwarded to both the pipeline and MODEL.

    Yields:
        One ``(24000, samples)`` tuple per segment produced by the
        pipeline, in pipeline order, where ``samples`` is ``audio.numpy()``.
    """
    trimmed = text.strip()[:CHAR_LIMIT]
    lang_pipeline = PIPELINES[voice[0]]
    voice_pack = lang_pipeline.load_voice(voice)

    for segment in lang_pipeline(trimmed, voice, speed):
        # Each item unpacks into three fields; only the middle one
        # (the phoneme sequence) is used here.
        _, phonemes, _ = segment
        # The voice pack is indexed by the segment's phoneme count minus one.
        style_ref = voice_pack[len(phonemes) - 1]
        waveform = MODEL(phonemes, style_ref, speed)
        yield 24000, waveform.numpy()

@spaces.GPU()
def gpu():
    """Do nothing; exists only to carry the ``@spaces.GPU()`` decorator.

    NOTE(review): nothing in this file calls it. Presumably the Space's
    ZeroGPU runtime expects at least one decorated function to be defined —
    confirm before removing.
    """
    return None

# Build the UI: one row of inputs (text, voice, speed), then the audio player
# and the button that triggers synthesis.
with gr.Blocks() as app:
    with gr.Row():
        text_input = gr.Textbox(label="input text")
        # Dropdown entries are (label, voice identifier) pairs from CHOICES.
        voice_input = gr.Dropdown(
            choices=list(CHOICES.items()),
            value="af_heart",
            label="voice",
        )
        speed_input = gr.Slider(
            minimum=0.5,
            maximum=2,
            value=1,
            step=0.1,
            label="speed",
        )

    out_audio = gr.Audio(
        label="output audio",
        interactive=False,
        autoplay=True,
    )
    gen_btn = gr.Button(value="generate")

    # Clicking the button runs generate_first, so only the first segment of
    # the input text is played back.
    gen_btn.click(
        fn=generate_first,
        inputs=[text_input, voice_input, speed_input],
        outputs=out_audio,
    )

# Start the web server only when this file is executed directly.
if __name__ == "__main__":
    app.launch()