# Imports
import gradio as gr
import spaces
import torch
import numpy as np
from kokoro import KModel, KPipeline

# Pre-Initialize
DEVICE = "auto"
if DEVICE == "auto":
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[SYSTEM] | Using {DEVICE} type compute device.")

# Variables
SILENT_THRESHOLD = 0.01  # Absolute amplitude below which samples are treated as silence and trimmed.
CHAR_LIMIT = 2000  # Maximum number of input characters passed to the synthesizer.

DEFAULT_INPUT = ""
DEFAULT_VOICE = "af_heart"


CHOICES = {
    "πŸ‡ΊπŸ‡Έ 🚺 Heart ❀️": "af_heart",
    "πŸ‡ΊπŸ‡Έ 🚺 Bella πŸ”₯": "af_bella",
    "πŸ‡ΊπŸ‡Έ 🚺 Nicole 🎧": "af_nicole",
    "πŸ‡ΊπŸ‡Έ 🚺 Aoede": "af_aoede",
    "πŸ‡ΊπŸ‡Έ 🚺 Kore": "af_kore",
    "πŸ‡ΊπŸ‡Έ 🚺 Sarah": "af_sarah",
    "πŸ‡ΊπŸ‡Έ 🚺 Nova": "af_nova",
    "πŸ‡ΊπŸ‡Έ 🚺 Sky": "af_sky",
    "πŸ‡ΊπŸ‡Έ 🚺 Alloy": "af_alloy",
    "πŸ‡ΊπŸ‡Έ 🚺 Jessica": "af_jessica",
    "πŸ‡ΊπŸ‡Έ 🚺 River": "af_river",
    "πŸ‡ΊπŸ‡Έ 🚹 Michael": "am_michael",
    "πŸ‡ΊπŸ‡Έ 🚹 Fenrir": "am_fenrir",
    "πŸ‡ΊπŸ‡Έ 🚹 Puck": "am_puck",
    "πŸ‡ΊπŸ‡Έ 🚹 Echo": "am_echo",
    "πŸ‡ΊπŸ‡Έ 🚹 Eric": "am_eric",
    "πŸ‡ΊπŸ‡Έ 🚹 Liam": "am_liam",
    "πŸ‡ΊπŸ‡Έ 🚹 Onyx": "am_onyx",
    "πŸ‡ΊπŸ‡Έ 🚹 Santa": "am_santa",
    "πŸ‡ΊπŸ‡Έ 🚹 Adam": "am_adam",
    "πŸ‡¬πŸ‡§ 🚺 Emma": "bf_emma",
    "πŸ‡¬πŸ‡§ 🚺 Isabella": "bf_isabella",
    "πŸ‡¬πŸ‡§ 🚺 Alice": "bf_alice",
    "πŸ‡¬πŸ‡§ 🚺 Lily": "bf_lily",
    "πŸ‡¬πŸ‡§ 🚹 George": "bm_george",
    "πŸ‡¬πŸ‡§ 🚹 Fable": "bm_fable",
    "πŸ‡¬πŸ‡§ 🚹 Lewis": "bm_lewis",
    "πŸ‡¬πŸ‡§ 🚹 Daniel": "bm_daniel",
}

# One grapheme-to-phoneme pipeline per language code: "a" (American English) and "b" (British English).
PIPELINES = {lang: KPipeline(lang_code=lang, model=False) for lang in "ab"}
# Teach both lexicons the intended pronunciation of "kokoro".
PIPELINES["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
PIPELINES["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"

# Pre-load every voice pack at startup so the first request does not have to fetch it.
for v in CHOICES.values():
    PIPELINES[v[0]].load_voice(v)

# Move the model to the selected device and switch it to inference mode.
MODEL = KModel().to(DEVICE).eval()
    
css = '''
.gradio-container { max-width: 560px !important }
h1 { text-align: center }
footer { visibility: hidden }
'''

# Functions
def trim_silence(audio, threshold=SILENT_THRESHOLD):
    """Trim leading and trailing samples whose absolute amplitude falls below the threshold."""
    abs_audio = np.abs(audio)
    indices = np.where(abs_audio > threshold)[0]

    if len(indices) == 0:
        return audio

    start = indices[0]
    end = indices[-1] + 1
    return audio[start:end]

# Runs inside the GPU context when the Space provides one (e.g. ZeroGPU); elsewhere the decorator is a no-op.
@spaces.GPU()
def generate(text=DEFAULT_INPUT, voice=DEFAULT_VOICE, speed=1):
    text = text.strip()[:CHAR_LIMIT] + "."
    pipeline = PIPELINES[voice[0]]  # The first character of the voice ID selects the language pipeline.
    pack = pipeline.load_voice(voice)

    # Synthesize the first segment produced by the pipeline and return it as 24 kHz audio.
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps) - 1]
        audio = MODEL(ps, ref_s, speed)
        return (24000, trim_silence(audio.numpy()))

def cloud():
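    # Maintenance endpoint wired to the ☁️ button; it only logs that the Space was pinged.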
    print("[CLOUD] | Space maintained.")

@spaces.GPU()
def gpu():
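    # No-op placeholder so the Space exposes a @spaces.GPU-decorated function for ZeroGPU hosting.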
    return

# Initialize
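# Build the Gradio UI: text, voice, and speed inputs, generate (β–Ά) and maintenance (☁️) buttons, and an audio output.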
with gr.Blocks(css=css) as main:
    with gr.Column():
        gr.Markdown("πŸͺ„ Instantly generate realistic voices from text input.")
        
    with gr.Column():
        text_input = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Input")
        voice_input = gr.Dropdown(list(CHOICES.items()), value=DEFAULT_VOICE, label="Voice")
        speed_input = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
        submit = gr.Button("β–Ά")
        maintain = gr.Button("☁️")
        
    with gr.Column():
        output = gr.Audio(label="Output")
        
    submit.click(fn=generate, inputs=[text_input, voice_input, speed_input], outputs=output)
    maintain.click(cloud, inputs=[], outputs=[], queue=False)

main.launch(show_api=True)