Vokan / app.py
ButterCream
upgrade to VoPho - resolved all issues
b07d516
raw
history blame
10.7 kB
import gradio as gr
import spaces
from styletts2 import tts
import re
import numpy as np
from scipy.io.wavfile import write
import nltk
from VoPho.engine import Phonemizer
import torch
# Header markup rendered through gr.HTML at the top of the page: inline CSS for
# the logo container, the logo image, and the tagline paragraph.
# The attributes of the <p> tag are separated by whitespace only; a comma
# between HTML attributes is a parse error.
INTRO = """
<style>
.TitleContainer {
background-color: #ffff;
margin-bottom: 0rem;
margin-left: auto;
margin-right: auto;
width: 40%;
height: 30%;
border-radius: 10rem;
border: 0.5vw solid #ff593e;
text-align: center;
display: flex;
justify-content: center;
transition: .6s;
}
.TitleContainer:hover {
transform: scale(1.05);
}
.VokanLogo {
margin: auto;
display: block;
}
</style>
<div class="TitleContainer">
<img src="https://huggingface.co/spaces/ShoukanLabs/Vokan/resolve/main/Vokan.gif" class="VokanLogo">
</div>
<p align="center" style="font-size: 1vw; font-weight: bold; color: #ff593e;">A StyleTTS2 fine-tune, designed for expressiveness.</p>
<hr>
"""
# JavaScript handed to gr.Blocks(js=...). The function reads the `__theme`
# query parameter of the current URL and, when it is anything other than
# 'light', sets it to 'light' and navigates to the updated URL.
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'light') {
url.searchParams.set('__theme', 'light');
window.location.href = url.href;
}
}
"""
# Rows for gr.Examples. Each row lines up with the input components
# [voice, inp, speed, alpha, beta, embscale, steps]: reference clip path,
# text to speak, then the five numeric settings.
examples = [
    [
        "./Examples/David Attenborough.wav",
        "An understanding of the natural world is a source of not only great curiosity, but great fulfilment.",
        1, 0.2, 0.5, 1, 200,
    ],
    [
        "./Examples/Linus Tech Tips.wav",
        "sometimes I get so in the zone while building a computer it's like an out of body experience.",
        1, 0.2, 0.8, 2, 200,
    ],
    [
        "./Examples/Melina.wav",
        (
            "If you intend to claim the Frenzied Flame, I ask that you cease. It is not to be meddled with. It is chaos, "
            "devouring life and thought unending. However ruined this world has become, "
            "however mired in torment and despair, life endures."
        ),
        0.95, 0.2, 0.5, 2, 200,
    ],
    [
        "./Examples/Patrick Bateman.wav",
        "My Pain Is Constant And Sharp, And I Do Not Wish For A Better World For Anyone.",
        1, 0.1, 0.3, 2, 200,
    ],
    [
        "./Examples/Furina.ogg",
        "That's more like it! As expected, my dazzling side comes through in any situation.",
        1, 0.2, 0.8, 2, 200,
    ],
]
# Gradio theme passed to gr.Blocks: the Soft base theme with a custom primary
# palette built around #ff593e, "orange" as the secondary hue, custom radius
# sizes and a font stack that starts with the Google font 'M PLUS Rounded 1c'.
# The trailing .set(...) overrides the block background fill with the
# '*neutral_50' theme variable.
theme = gr.themes.Soft(
primary_hue=gr.themes.Color(c100="#ffd7d1", c200="#ff593e", c300="#ff593e", c400="#ff593e", c50="#fff0f0",
c500="#ff593e", c600="#ea580c", c700="#c2410c", c800="#9a3412", c900="#7c2d12",
c950="#6c2e12"),
secondary_hue="orange",
radius_size=gr.themes.Size(lg="20px", md="8px", sm="6px", xl="30px", xs="4px", xxl="40px", xxs="2px"),
font=[gr.themes.GoogleFont('M PLUS Rounded 1c'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
).set(
block_background_fill='*neutral_50'
)
def split_and_recombine_text(text, desired_length=200, max_length=300):
    """Split text into chunks of a desired length, trying to keep sentences intact.

    Args:
        text: The text to split.
        desired_length: Once a sentence boundary is reached and the current
            chunk is at least this long, the chunk is committed.
        max_length: Once the current chunk reaches this length a split is
            forced, preferably at the last recorded sentence boundary.

    Returns:
        A list of stripped chunks. Chunks that are empty or contain nothing
        but whitespace and punctuation are dropped.
    """
    # Normalize text: collapse blank lines and whitespace runs, then convert
    # curly double quotes (U+201C / U+201D) to ASCII so that the quote
    # tracking in seek() sees them. The escapes are resolved by the re module,
    # which keeps this pattern independent of the file's encoding.
    text = re.sub(r'\n\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[\u201c\u201d]', '"', text)

    rv = []            # committed chunks
    in_quote = False   # toggled every time the cursor lands on a '"'
    current = ""       # chunk being assembled
    split_pos = []     # sentence-boundary positions seen in the current chunk
    pos = -1           # cursor into text; -1 means nothing consumed yet
    end_pos = len(text) - 1

    def seek(delta):
        """Move the cursor by delta characters, keeping `current` and `in_quote` in step."""
        nonlocal pos, in_quote, current
        is_neg = delta < 0
        for _ in range(abs(delta)):
            if is_neg:
                pos -= 1
                current = current[:-1]
            else:
                pos += 1
                current += text[pos]
            if text[pos] == '"':
                in_quote = not in_quote
        return text[pos]

    def peek(delta):
        """Return the character delta positions from the cursor, or "" when out of range.

        NOTE(review): the upper bound is end_pos, so the final character of
        the text is also reported as "". Since `"" in some_string` is always
        True, the membership tests below treat that case as a match.
        """
        p = pos + delta
        return text[p] if p < end_pos and p >= 0 else ""

    def commit():
        """Store the current chunk and start a fresh one."""
        nonlocal rv, current, split_pos
        rv.append(current)
        current = ""
        split_pos = []

    while pos < end_pos:
        c = seek(1)
        # do we need to force a split?
        if len(current) >= max_length:
            if len(split_pos) > 0 and len(current) > (desired_length / 2):
                # we have at least one sentence and we are over half the
                # desired length, seek back to the last split
                d = pos - split_pos[-1]
                seek(-d)
            else:
                # no full sentences, seek back until we are not in the middle
                # of a word and split there
                while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
                    c = seek(-1)
            commit()
        # check for sentence boundaries
        elif not in_quote and (c in '!?\n' or (c == '.' and peek(1) in '\n ')):
            # seek forward if we have consecutive boundary markers but are
            # still within the max length
            while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.':
                c = seek(1)
            split_pos.append(pos)
            if len(current) >= desired_length:
                commit()
        # treat end of quote as a boundary if it is followed by a space or newline
        elif in_quote and peek(1) == '"' and peek(2) in '\n ':
            seek(2)
            split_pos.append(pos)
    rv.append(current)

    # clean up, remove lines with only whitespace or punctuation
    rv = [s.strip() for s in rv]
    rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]
    return rv
# Module-level VoPho Phonemizer instance, created once at import time and
# used by text_to_phonemes().
engine = Phonemizer()
def text_to_phonemes(text):
    """Strip surrounding whitespace from text and phonemize it with the module-level engine.

    The stripped input and the phonemized output are both printed, and the
    phonemized output is returned.
    """
    stripped = text.strip()
    print("Text before phonemization: ", stripped)
    phonemes = engine.phonemize(stripped)
    print("Text after phonemization: ", phonemes)
    return phonemes
def _s_curve(p):
    """Map p in [0, 1] onto a logistic S-curve rescaled so the ends are exactly 0 and 1."""
    assert 0 <= p and p <= 1, p
    if p == 0 or p == 1:
        return p
    # The logistic of +/-log(99) is 0.99 / 0.01; the final rescale stretches
    # that [0.01, 0.99] range onto [0, 1].
    p = (2 * p - 1) * np.log(99)
    s = 1 / (1 + np.exp(-p))
    s = (s - 0.01) * 50 / 49
    assert 0 <= s and s <= 1, s
    return s


@spaces.GPU
def generate(audio_path, ins, speed, alpha, beta, embedding, steps=200):
    """Synthesize `ins` in the style of the reference clip and return normalized audio.

    The text is split into chunks, each chunk is phonemized and synthesized
    with `other_tts.long_inference_segment` (carrying `s_prev` from one chunk
    to the next), the start and end of every chunk are silenced and faded to
    remove artefacts, and the chunks are concatenated.

    Args:
        audio_path: Reference clip, passed to `other_tts.compute_style`.
        ins: Text to synthesize.
        speed: Forwarded as `speed`.
        alpha: Forwarded as `alpha`.
        beta: Forwarded as `beta`.
        embedding: Forwarded as `embedding_scale`.
        steps: Forwarded as `diffusion_steps`.

    Returns:
        A `(24000, samples)` tuple where `samples` is an int16 array scaled so
        its peak is 32767 (presumably 24000 is the sample rate in Hz that
        gr.Audio expects; confirm against the model config).

    Raises:
        gr.Error: If no reference clip was supplied or the text contains
            nothing to synthesize.
    """
    # Fail early with a message the UI can display instead of an opaque error
    # from the model or from np.max() on an empty array.
    if audio_path is None:
        raise gr.Error("Please provide a reference voice clip.")
    texts = split_and_recombine_text(ins or "")
    if not texts:
        raise gr.Error("Please enter some text to synthesize.")

    CUT_SAMPLES = 20000  # max samples to cut, in practice only 4-6k are actually cut
    lead_percent = 0.008
    trail_percent = 0.0085

    ref_s = other_tts.compute_style(audio_path)
    print(ref_s.size())
    s_prev = None
    audio = np.array([])
    for chunk in texts:
        phonemes = text_to_phonemes(chunk)
        synthaud, s_prev = other_tts.long_inference_segment(phonemes, diffusion_steps=steps,
                                                            alpha=alpha, beta=beta, is_phonemes=True,
                                                            embedding_scale=embedding, prev_s=s_prev, ref_s=ref_s,
                                                            speed=speed, t=0.8)

        # Post-processing
        # NOTE(review): the index arithmetic below assumes the chunk contains
        # a sample louder than the threshold near each end; very short or
        # silent chunks may index out of range. Confirm against real model output.
        thresh = np.percentile(np.abs(synthaud), 95)
        n_samples = len(synthaud)
        lead_len = int(n_samples * lead_percent)
        trail_len = int(n_samples * trail_percent)

        # Leading artefact removal: find the first loud sample, silence what
        # precedes the fade window, then fade in over lead_len samples.
        left = CUT_SAMPLES + lead_len
        for j in range(left):
            if abs(synthaud[j]) > thresh:
                left = j
                break
        left = max(0, min(left - lead_len, CUT_SAMPLES))
        synthaud[:left] = 0
        for k in range(lead_len):
            synthaud[k + left] *= _s_curve(k / lead_len)

        # Trailing artefact removal: mirror image of the above, scanning
        # backwards from the end and fading out over trail_len samples.
        right = n_samples - CUT_SAMPLES - trail_len
        for j in range(n_samples - 1, right, -1):
            if abs(synthaud[j]) > thresh:
                right = j
                break
        right = min(n_samples, max(right + trail_len, n_samples - CUT_SAMPLES))
        synthaud[right:] = 0
        for k in range(trail_len):
            synthaud[right - trail_len + k] *= (1 - _s_curve(k / trail_len))

        audio = np.concatenate((audio, synthaud))

    # Peak-normalize to the int16 range; an all-zero signal is returned as
    # silence rather than dividing by zero.
    peak = np.max(np.abs(audio))
    if peak == 0:
        return 24000, np.zeros(len(audio), dtype=np.int16)
    scaled = np.int16(audio / peak * 32767)
    return 24000, scaled
# Build the StyleTTS2 model from the local checkpoint and config, then point
# it at CUDA when torch reports it as available and at the CPU otherwise.
other_tts = tts.StyleTTS2(
    model_checkpoint_path='./epoch_2nd_00012.pth',
    config_path="models/config_ft.yml",
)
other_tts.device = "cuda" if torch.cuda.is_available() else "cpu"
# Gradio UI: header, a row with the input controls on the left and the
# synthesize button / output player on the right, and the example gallery.
with gr.Blocks(theme=theme, js=js_func) as clone:
    gr.HTML(INTRO)
    with gr.Row():
        # Left column: text, reference voice and synthesis settings.
        with gr.Column(scale=1):
            inp = gr.Textbox(label="Text", info="What do you want Vokan to say? | Longform generation may produce artifacts in between sentences", interactive=True)
            voice = gr.Audio(label="Voice", interactive=True, type='filepath', max_length=1000,
                             waveform_options={'waveform_progress_color': '#FF593E'})
            steps = gr.Slider(minimum=3, maximum=500, value=20, step=1, label="Diffusion Steps",
                              info="Higher produces better results typically", interactive=True)
            embscale = gr.Slider(minimum=0.1, maximum=5, value=2, step=0.1, label="Embedding Scale",
                                 info="Defaults to 2 | high scales may produce unexpected results | Higher scales produce more emotion guided results", interactive=True)
            alpha = gr.Slider(minimum=0, maximum=1, value=0.3, step=0.1, label="Alpha", info="Defaults to 0.3 | Lower = More similar in sound to speaker",
                              interactive=True)
            beta = gr.Slider(minimum=0, maximum=1, value=0.7, step=0.1, label="Beta", info="Defaults to 0.7 | Lower = More similar prosody at cost of stability",
                             interactive=True)
            speed = gr.Slider(minimum=0.5, maximum=1.5, value=1, step=0.1, label="Speed of speech",
                              info="Defaults to 1", interactive=True)
        # Right column: trigger button and the synthesized result.
        with gr.Column(scale=1):
            clbtn = gr.Button("Synthesize", variant="primary")
            claudio = gr.Audio(interactive=False, label="Synthesized Audio",
                               waveform_options={'waveform_progress_color': '#FF593E'})
            # The input order must match generate()'s positional parameters.
            clbtn.click(generate, inputs=[voice, inp, speed, alpha, beta, embscale, steps], outputs=[claudio],
                        concurrency_limit=15)
    # NOTE(review): the gallery is assumed to sit below the row, directly
    # inside the Blocks context; confirm the intended layout.
    gr.Examples(examples=examples,
                inputs=[voice, inp, speed, alpha, beta, embscale, steps],
                outputs=[claudio],
                fn=generate,
                cache_examples=True,)
if __name__ == "__main__":
    # Configure the queue (api_open=False, max_size=15) and then launch the
    # app with show_api=False.
    queued_app = clone.queue(api_open=False, max_size=15)
    queued_app.launch(show_api=False)