# Imports
import gradio as gr
import spaces
import torch
import numpy as np
from kokoro import KModel, KPipeline

# Pre-Initialize
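# "auto" picks CUDA when available; set to "cuda" or "cpu" to pin the device.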
DEVICE = "auto"
if DEVICE == "auto":
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[SYSTEM] | Using {DEVICE} type compute device.")

torch.set_num_threads(4)  # Cap intra-op CPU threads for shared Space hardware.

# Variables
CHAR_LIMIT = 2000
DEFAULT_INPUT = ""
DEFAULT_VOICE = "af_heart"

CHOICES = {
    "πŸ‡ΊπŸ‡Έ 🚺 Heart ❀️": "af_heart",
    "πŸ‡ΊπŸ‡Έ 🚺 Bella πŸ”₯": "af_bella",
    "πŸ‡ΊπŸ‡Έ 🚺 Nicole 🎧": "af_nicole",
    "πŸ‡ΊπŸ‡Έ 🚺 Aoede": "af_aoede",
    "πŸ‡ΊπŸ‡Έ 🚺 Kore": "af_kore",
    "πŸ‡ΊπŸ‡Έ 🚺 Sarah": "af_sarah",
    "πŸ‡ΊπŸ‡Έ 🚺 Nova": "af_nova",
    "πŸ‡ΊπŸ‡Έ 🚺 Sky": "af_sky",
    "πŸ‡ΊπŸ‡Έ 🚺 Alloy": "af_alloy",
    "πŸ‡ΊπŸ‡Έ 🚺 Jessica": "af_jessica",
    "πŸ‡ΊπŸ‡Έ 🚺 River": "af_river",
    "πŸ‡ΊπŸ‡Έ 🚹 Michael": "am_michael",
    "πŸ‡ΊπŸ‡Έ 🚹 Fenrir": "am_fenrir",
    "πŸ‡ΊπŸ‡Έ 🚹 Puck": "am_puck",
    "πŸ‡ΊπŸ‡Έ 🚹 Echo": "am_echo",
    "πŸ‡ΊπŸ‡Έ 🚹 Eric": "am_eric",
    "πŸ‡ΊπŸ‡Έ 🚹 Liam": "am_liam",
    "πŸ‡ΊπŸ‡Έ 🚹 Onyx": "am_onyx",
    "πŸ‡ΊπŸ‡Έ 🚹 Santa": "am_santa",
    "πŸ‡ΊπŸ‡Έ 🚹 Adam": "am_adam",
    "πŸ‡¬πŸ‡§ 🚺 Emma": "bf_emma",
    "πŸ‡¬πŸ‡§ 🚺 Isabella": "bf_isabella",
    "πŸ‡¬πŸ‡§ 🚺 Alice": "bf_alice",
    "πŸ‡¬πŸ‡§ 🚺 Lily": "bf_lily",
    "πŸ‡¬πŸ‡§ 🚹 George": "bm_george",
    "πŸ‡¬πŸ‡§ 🚹 Fable": "bm_fable",
    "πŸ‡¬πŸ‡§ 🚹 Lewis": "bm_lewis",
    "πŸ‡¬πŸ‡§ 🚹 Daniel": "bm_daniel",
}

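# One grapheme-to-phoneme pipeline per language: "a" = American English, "b" = British English.
# The lexicon overrides below pin the pronunciation of "kokoro" in each accent.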
PIPELINES = {lang: KPipeline(lang_code=lang, model=False) for lang in "ab"}
PIPELINES["a"].g2p.lexicon.golds["kokoro"] = "kˈOkΙ™ΙΉO"
PIPELINES["b"].g2p.lexicon.golds["kokoro"] = "kˈQkΙ™ΙΉQ"

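# Preload every voice pack once at startup so generate() never loads voices mid-request.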
VOICE_PACKS = {}
for v in CHOICES.values():
    VOICE_PACKS[v] = PIPELINES[v[0]].load_voice(v)
    
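# One shared model instance; the TorchScript compilation below is best-effort and
# falls back to the eager model if scripting fails.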
model_instance = KModel().to(DEVICE).eval()

try:
    MODEL = torch.jit.script(model_instance)
except Exception as e:
    print("torch.jit.script failed, using original model:", e)
    MODEL = model_instance
    
css = '''
.gradio-container{max-width: 560px !important}
h1{text-align:center}
footer {
    visibility: hidden
}
'''

def trim_silence(audio, threshold=0.001):
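    """Strip leading/trailing samples whose absolute amplitude is at or below `threshold`."""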
    abs_audio = np.abs(audio)
    indices = np.where(abs_audio > threshold)[0]
    if len(indices) == 0:
        return audio
    start = indices[0]
    end = indices[-1] + 1
    return audio[start:end]

# Functions
def generate(text=DEFAULT_INPUT, voice=DEFAULT_VOICE, speed=1):
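    """Synthesize `text` with the selected Kokoro voice and return (sample_rate_hz, waveform)."""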
    # Truncate to the character limit and make sure the text ends with punctuation.
    text = text.strip()[:CHAR_LIMIT] + "."
    pipeline = PIPELINES[voice[0]]
    pack = VOICE_PACKS[voice]
    # The pipeline yields one item per text segment; only the first segment is synthesized.
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps) - 1]
        audio = MODEL(ps, ref_s, speed)
        audio_np = audio.detach().cpu().numpy()  # Move off the GPU before converting to NumPy.
        trimmed_audio = trim_silence(audio_np)
        return (24000, trimmed_audio)

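# Wired to the ☁️ button below; clicking appears intended to keep the Space from idling.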
def cloud():
    print("[CLOUD] | Space maintained.")

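# Empty stub decorated with @spaces.GPU, presumably so the Space registers a ZeroGPU handler.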
@spaces.GPU()
def gpu():
    return

# Initialize
with gr.Blocks(css=css) as main:
    with gr.Column():
        text_input = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Input")
        voice_input = gr.Dropdown(list(CHOICES.items()), value=DEFAULT_VOICE, label="Voice")
        speed_input = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
        submit = gr.Button("β–Ά")
        maintain = gr.Button("☁️")
    with gr.Column():
        output = gr.Audio(label="Output")
    submit.click(fn=generate, inputs=[text_input, voice_input, speed_input], outputs=output)
    maintain.click(cloud, inputs=[], outputs=[], queue=False)

main.launch(show_api=True)
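
# Example client call (hypothetical Space name; requires `pip install gradio_client`):
#   from gradio_client import Client
#   client = Client("<user>/<space-name>")
#   result = client.predict("Hello there.", "af_heart", 1, api_name="/generate")
#   # `result` is typically a filepath to the rendered audio clip.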