File size: 2,826 Bytes
944743c
b3480ec
 
944743c
33d5f3b
b3480ec
33d5f3b
 
 
b3480ec
36a226d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33d5f3b
 
 
b3480ec
 
33d5f3b
 
36a226d
 
b3480ec
33d5f3b
36a226d
b3480ec
33d5f3b
 
 
b3480ec
 
33d5f3b
 
b3480ec
 
33d5f3b
 
 
b3480ec
 
 
33d5f3b
36a226d
33d5f3b
 
 
36a226d
33d5f3b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import spaces
from kokoro import KModel, KPipeline
import gradio as gr

# Maximum number of characters of (stripped) input text that the generate
# functions in this file will synthesize; longer input is cut at this length.
CHAR_LIMIT = 5000

# One KPipeline per language-code letter. The first character of a voice id
# ("a" or "b") is the key used to look a pipeline up. The pipelines are built
# with model=False; the functions in this file run synthesis through the
# separate MODEL object instead of through the pipeline.
PIPELINES = {lang: KPipeline(lang_code=lang, model=False) for lang in "ab"}

# Pin the phoneme string used for the word "kokoro" in each pipeline's lexicon.
# NOTE: these literals contain the IPA letters U+0259 (schwa) and U+0279
# (turned r). The file must be read and written as UTF-8 so they are not
# replaced by their mis-decoded byte sequences.
PIPELINES["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
PIPELINES["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"

# Human-readable dropdown label -> voice id.
# The first letter of a voice id selects the pipeline in PIPELINES ("a" or "b").
# Labels carry a flag emoji and a gender emoji; they must be stored as real
# Unicode emoji (file encoded as UTF-8), not as mis-decoded byte sequences,
# otherwise the UI shows garbage characters.
CHOICES = {
    "🇺🇸 🚺 Heart ❤️": "af_heart",
    "🇺🇸 🚺 Bella 🔥": "af_bella",
    "🇺🇸 🚺 Nicole 🎧": "af_nicole",
    "🇺🇸 🚺 Aoede": "af_aoede",
    "🇺🇸 🚺 Kore": "af_kore",
    "🇺🇸 🚺 Sarah": "af_sarah",
    "🇺🇸 🚺 Nova": "af_nova",
    "🇺🇸 🚺 Sky": "af_sky",
    "🇺🇸 🚺 Alloy": "af_alloy",
    "🇺🇸 🚺 Jessica": "af_jessica",
    "🇺🇸 🚺 River": "af_river",
    "🇺🇸 🚹 Michael": "am_michael",
    "🇺🇸 🚹 Fenrir": "am_fenrir",
    "🇺🇸 🚹 Puck": "am_puck",
    "🇺🇸 🚹 Echo": "am_echo",
    "🇺🇸 🚹 Eric": "am_eric",
    "🇺🇸 🚹 Liam": "am_liam",
    "🇺🇸 🚹 Onyx": "am_onyx",
    "🇺🇸 🚹 Santa": "am_santa",
    "🇺🇸 🚹 Adam": "am_adam",
    "🇬🇧 🚺 Emma": "bf_emma",
    "🇬🇧 🚺 Isabella": "bf_isabella",
    "🇬🇧 🚺 Alice": "bf_alice",
    "🇬🇧 🚺 Lily": "bf_lily",
    "🇬🇧 🚹 George": "bm_george",
    "🇬🇧 🚹 Fable": "bm_fable",
    "🇬🇧 🚹 Lewis": "bm_lewis",
    "🇬🇧 🚹 Daniel": "bm_daniel",
}

# At import time, call load_voice once for every voice id in CHOICES on the
# pipeline selected by the id's first letter.
# NOTE(review): presumably this warms a per-pipeline voice cache so later
# load_voice calls are cheap — confirm against the kokoro implementation.
for v in CHOICES.values():
    PIPELINES[v[0]].load_voice(v)

# Single shared model instance used by the generate functions below.
# NOTE(review): .eval() is presumably torch's inference-mode switch — confirm.
MODEL = KModel().eval()

def generate_first(text, voice="af_heart", speed=1):
    """Synthesize only the first segment the pipeline produces for ``text``.

    The text is stripped and cut to ``CHAR_LIMIT`` characters, then fed to the
    pipeline chosen by the first letter of ``voice``.

    Args:
        text: Input text.
        voice: Voice id; its first character selects the pipeline.
        speed: Passed through to both the pipeline and the model.

    Returns:
        ``(24000, samples)`` for the first segment, where ``samples`` is the
        model output converted with ``.numpy()``; ``None`` when the pipeline
        yields nothing. (24000 is presumably the sample rate in Hz — confirm.)
    """
    clipped = text.strip()[:CHAR_LIMIT]
    lang_pipeline = PIPELINES[voice[0]]
    voice_pack = lang_pipeline.load_voice(voice)

    # Pull just the first item; later segments are never computed.
    first_segment = next(iter(lang_pipeline(clipped, voice, speed)), None)
    if first_segment is None:
        return None

    _graphemes, phonemes, _audio = first_segment
    # The voice pack is indexed by phoneme-string length minus one.
    style_ref = voice_pack[len(phonemes) - 1]
    waveform = MODEL(phonemes, style_ref, speed)
    return (24000, waveform.numpy())

def predict(text, voice="af_heart", speed=1):
    """Forward all arguments to ``generate_first`` and return its result unchanged."""
    result = generate_first(text, voice, speed)
    return result

def generate_all(text, voice="af_heart", speed=1):
    """Lazily synthesize every segment the pipeline produces for ``text``.

    The text is stripped and cut to ``CHAR_LIMIT`` characters, then fed to the
    pipeline chosen by the first letter of ``voice``. Being a generator, no
    work happens until the first item is requested.

    Args:
        text: Input text.
        voice: Voice id; its first character selects the pipeline.
        speed: Passed through to both the pipeline and the model.

    Yields:
        ``(24000, samples)`` per segment, where ``samples`` is the model
        output converted with ``.numpy()``. (24000 is presumably the sample
        rate in Hz — confirm.)
    """
    clipped = text.strip()[:CHAR_LIMIT]
    lang_pipeline = PIPELINES[voice[0]]
    voice_pack = lang_pipeline.load_voice(voice)

    for segment in lang_pipeline(clipped, voice, speed):
        _graphemes, phonemes, _audio = segment
        # The voice pack is indexed by phoneme-string length minus one.
        style_ref = voice_pack[len(phonemes) - 1]
        waveform = MODEL(phonemes, style_ref, speed)
        yield 24000, waveform.numpy()

@spaces.GPU()
def gpu():
    """Do nothing; a no-op function wrapped by ``spaces.GPU()``.

    NOTE(review): presumably kept so the hosting runtime detects a
    GPU-decorated function — confirm before removing.
    """
    return None

# Build the Gradio UI: one row of inputs, an audio output, and a trigger button.
with gr.Blocks() as app:
    with gr.Row():
        text_input = gr.Textbox(label="input text")
        # Choices are the (label, voice id) pairs from CHOICES; the initial
        # selection is the voice id "af_heart".
        voice_input = gr.Dropdown(list(CHOICES.items()), value="af_heart", label="voice")
        speed_input = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="speed")
    # Output-only audio component (interactive=False) with autoplay turned on.
    out_audio = gr.Audio(label="output audio", interactive=False, autoplay=True)
    gen_btn = gr.Button("generate")
    # Only generate_first is wired to the button; predict and generate_all are
    # not referenced by this UI.
    gen_btn.click(fn=generate_first, inputs=[text_input, voice_input, speed_input], outputs=out_audio)

# Launch the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    app.launch()