# Voice / app.py
# (Hugging Face Hub page residue, preserved as comments so the file parses:)
# Staticaliza's picture
# Update app.py
# dfe5a3d verified
# raw
# history blame
# 3.54 kB
# Imports
import gradio as gr
import spaces
import torch
import numpy as np
from kokoro import KModel, KPipeline
# Pre-Initialize
# Resolve the compute device: "auto" selects CUDA when available, else CPU.
_requested_device = "auto"
if _requested_device != "auto":
    DEVICE = _requested_device
else:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[SYSTEM] | Using {DEVICE} type compute device.")
# Cap CPU thread usage so the Space stays within its resource quota.
torch.set_num_threads(4)
# Variables
CHAR_LIMIT = 2000          # maximum characters synthesized per request
DEFAULT_INPUT = ""
DEFAULT_VOICE = "af_heart"

# Display label -> voice-pack id. The id prefix encodes accent + gender:
# first char "a" = American / "b" = British English, second char f/m.
CHOICES = {
    # American female
    "πŸ‡ΊπŸ‡Έ 🚺 Heart ❀️": "af_heart",
    "πŸ‡ΊπŸ‡Έ 🚺 Bella πŸ”₯": "af_bella",
    "πŸ‡ΊπŸ‡Έ 🚺 Nicole 🎧": "af_nicole",
    "πŸ‡ΊπŸ‡Έ 🚺 Aoede": "af_aoede",
    "πŸ‡ΊπŸ‡Έ 🚺 Kore": "af_kore",
    "πŸ‡ΊπŸ‡Έ 🚺 Sarah": "af_sarah",
    "πŸ‡ΊπŸ‡Έ 🚺 Nova": "af_nova",
    "πŸ‡ΊπŸ‡Έ 🚺 Sky": "af_sky",
    "πŸ‡ΊπŸ‡Έ 🚺 Alloy": "af_alloy",
    "πŸ‡ΊπŸ‡Έ 🚺 Jessica": "af_jessica",
    "πŸ‡ΊπŸ‡Έ 🚺 River": "af_river",
    # American male
    "πŸ‡ΊπŸ‡Έ 🚹 Michael": "am_michael",
    "πŸ‡ΊπŸ‡Έ 🚹 Fenrir": "am_fenrir",
    "πŸ‡ΊπŸ‡Έ 🚹 Puck": "am_puck",
    "πŸ‡ΊπŸ‡Έ 🚹 Echo": "am_echo",
    "πŸ‡ΊπŸ‡Έ 🚹 Eric": "am_eric",
    "πŸ‡ΊπŸ‡Έ 🚹 Liam": "am_liam",
    "πŸ‡ΊπŸ‡Έ 🚹 Onyx": "am_onyx",
    "πŸ‡ΊπŸ‡Έ 🚹 Santa": "am_santa",
    "πŸ‡ΊπŸ‡Έ 🚹 Adam": "am_adam",
    # British female
    "πŸ‡¬πŸ‡§ 🚺 Emma": "bf_emma",
    "πŸ‡¬πŸ‡§ 🚺 Isabella": "bf_isabella",
    "πŸ‡¬πŸ‡§ 🚺 Alice": "bf_alice",
    "πŸ‡¬πŸ‡§ 🚺 Lily": "bf_lily",
    # British male
    "πŸ‡¬πŸ‡§ 🚹 George": "bm_george",
    "πŸ‡¬πŸ‡§ 🚹 Fable": "bm_fable",
    "πŸ‡¬πŸ‡§ 🚹 Lewis": "bm_lewis",
    "πŸ‡¬πŸ‡§ 🚹 Daniel": "bm_daniel",
}
# One G2P pipeline per language code: "a" = American, "b" = British English.
# model=False: the pipelines only do text processing; synthesis uses MODEL below.
PIPELINES = {code: KPipeline(lang_code=code, model=False) for code in "ab"}
# Custom gold pronunciations for the word "kokoro" in each accent.
PIPELINES["a"].g2p.lexicon.golds["kokoro"] = "kˈOkΙ™ΙΉO"
PIPELINES["b"].g2p.lexicon.golds["kokoro"] = "kˈQkΙ™ΙΉQ"
# Pre-load every voice pack; voice id prefix ("a"/"b") selects its pipeline.
VOICE_PACKS = {voice: PIPELINES[voice[0]].load_voice(voice) for voice in CHOICES.values()}
# Build the synthesis model on the chosen device in eval mode, then try to
# TorchScript-compile it for faster inference; fall back to the eager model
# when scripting fails (not all models are scriptable).
model_instance = KModel().to(DEVICE).eval()
try:
    MODEL = torch.jit.script(model_instance)
except Exception as script_error:
    print("torch.jit.script failed, using original model:", script_error)
    MODEL = model_instance
# Custom CSS: narrow the app container, center the title, hide the footer.
css = '''
.gradio-container{max-width: 560px !important}
h1{text-align:center}
footer {
visibility: hidden
}
'''
def trim_silence(audio, threshold=0.001):
    """Strip leading and trailing near-silent samples from a 1-D waveform.

    A sample counts as "loud" when its absolute value exceeds ``threshold``.
    Returns the input unchanged when no sample exceeds the threshold.
    """
    loud = np.flatnonzero(np.abs(audio) > threshold)
    if loud.size == 0:
        return audio
    first, last = loud[0], loud[-1]
    return audio[first:last + 1]
# Functions
def generate(text=DEFAULT_INPUT, voice=DEFAULT_VOICE, speed=1):
    """Synthesize `text` with the given voice id and speed.

    Returns a ``(sample_rate, waveform)`` tuple for gr.Audio (24 kHz), or
    None when the pipeline yields no segments.
    """
    # Cap input length and force terminal punctuation so the utterance
    # ends cleanly.
    text = text.strip()[:CHAR_LIMIT] + "."
    pipeline = PIPELINES[voice[0]]  # voice id prefix selects the language pipeline
    pack = VOICE_PACKS[voice]
    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps) - 1]
        audio = MODEL(ps, ref_s, speed)
        # FIX: detach and move to CPU first — a CUDA tensor cannot be
        # converted with .numpy() directly; on CPU this is a no-op.
        audio_np = audio.detach().cpu().numpy()
        trimmed_audio = trim_silence(audio_np)
        # Only the first segment is synthesized (original behavior).
        return (24000, trimmed_audio)
    return None
def cloud():
    """Keep-alive hook: log that the Space was pinged for maintenance."""
    print("[CLOUD] | Space maintained.")
@spaces.GPU()
def gpu():
    """No-op whose @spaces.GPU() decorator registers GPU usage with Spaces."""
    return None
# Initialize
# Build the Gradio UI: text/voice/speed inputs, generate + keep-alive buttons,
# and an audio output.
with gr.Blocks(css=css) as main:
    with gr.Column():
        # Renamed from `input` to avoid shadowing the Python builtin.
        text_input = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Input")
        voice_input = gr.Dropdown(list(CHOICES.items()), value=DEFAULT_VOICE, label="Voice")
        speed_input = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label="Speed")
        submit = gr.Button("β–Ά")
        maintain = gr.Button("☁️")
    with gr.Column():
        output = gr.Audio(label="Output")
    submit.click(fn=generate, inputs=[text_input, voice_input, speed_input], outputs=output)
    # queue=False: the keep-alive ping should run immediately, not wait in queue.
    maintain.click(cloud, inputs=[], outputs=[], queue=False)
main.launch(show_api=True)