emo-knob / app.py
tonychenxyz's picture
init
9e34a62
raw
history blame
12.3 kB
import gradio as gr
import os
# Production is the default; only PROD_MODE=local switches to the
# developer-machine configuration below.
is_prod = os.environ.get('PROD_MODE') != 'local'
import pickle

if not is_prod:
    # Local runs: point every Hugging Face cache variable at one shared
    # directory, in the same order the variables were originally assigned.
    os.environ.update(
        dict.fromkeys(
            (
                'HF_HOME',
                'TRANSFORMERS_CACHE',
                'HF_DATASETS_CACHE',
                'HF_METRICS_CACHE',
                'HF_MODULES_CACHE',
            ),
            '/proj/afosr/metavoice/cache',
        )
    )
    # Make the privately built ffmpeg binaries discoverable via PATH.
    ffmpeg_path = '/home/hc3295/ffmpeg_build/bin'
    os.environ['PATH'] = os.environ['PATH'] + os.pathsep + ffmpeg_path
import shutil
import tempfile
import time
from pathlib import Path
import librosa
import torch
from huggingface_hub import snapshot_download
from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
from fam.llm.decoders import EncodecDecoder
from fam.llm.fast_inference_utils import build_model, main
from fam.llm.inference import (
EncodecDecoder,
InferenceConfig,
Model,
TiltedEncodec,
TrainedBPETokeniser,
get_cached_embedding,
get_cached_file,
get_enhancer,
)
from fam.llm.utils import (
check_audio_file,
get_default_dtype,
get_device,
normalize_text,
)
# When `debug` is True none of the models below are loaded, so the UI can be
# built without a GPU; generate_sample() will then fail because the names
# defined in this block (model, smodel, llm_second_stage, ...) do not exist.
debug = False
if not debug:
    model_name = "metavoiceio/metavoice-1B-v0.1"
    seed = 1337
    output_dir = "outputs"
    _dtype = get_default_dtype()
    # NOTE(review): the device is hard-coded to the first CUDA GPU even though
    # get_device is imported above — confirm this is intentional.
    _device = 'cuda:0'
    # Download (or reuse the cached copy of) the checkpoint repository.
    _model_dir = snapshot_download(repo_id=model_name)
    first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
    os.makedirs(output_dir, exist_ok=True)

    # Second-stage model, configured from the second_stage.pt checkpoint.
    second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
    config_second_stage = InferenceConfig(
        ckpt_path=second_stage_ckpt_path,
        num_samples=1,
        seed=seed,
        device=_device,
        dtype=_dtype,
        compile=False,
        init_from="resume",
        output_dir=output_dir,
    )
    data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
    llm_second_stage = Model(
        config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
    )

    # Enhancer applied to the generated wav file in generate_sample().
    enhancer = get_enhancer("df")

    # Map the dtype name to the torch dtype; raises KeyError for any name
    # other than "float16" / "bfloat16".
    precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]

    # First-stage model, its tokenizer, and the speaker-embedding model
    # (smodel) used to embed reference audio.
    model, tokenizer, smodel, model_size = build_model(
        precision=precision,
        checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
        spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
        device=_device,
        compile=True,
        compile_prefill=True,
    )
def _load_speaker_embedding(audio_path):
    """Resolve, validate and embed one reference audio clip.

    The path (or URL, for preset voices) goes through get_cached_file and
    check_audio_file before being embedded with the speaker model, and the
    result is moved to the inference device and dtype.
    """
    local_path = get_cached_file(audio_path)
    check_audio_file(local_path)
    return get_cached_embedding(local_path, smodel).to(device=_device, dtype=precision)


def generate_sample(
    text,
    emo_dir=None,
    source_path=None,
    emo_path=None,
    neutral_path=None,
    strength=0.1,
    top_p=0.95,
    guidance_scale=3.0,
    preset_dropdown=None,
    toggle=None,
):
    """Synthesise `text` in the source voice, shifted along an emotion direction.

    Args:
        text: Text to speak.
        emo_dir: Emotion choice. EMO_NAMES[0] means "derive the direction from
            the uploaded emotional/neutral pair"; any other value must be a key
            of ALL_EMO_DIRS.
        source_path: Audio file of the voice to clone (used unless a preset
            voice is selected).
        emo_path: Emotional reference clip (custom-emotion mode only).
        neutral_path: Neutral reference clip (custom-emotion mode only).
        strength: Multiplier applied to the emotion direction before it is
            added to the source speaker embedding.
        top_p: Value forwarded as `top_p` to the first-stage model.
        guidance_scale: Value forwarded as `guidance_scale` to the first stage.
        preset_dropdown: Key into PRESET_VOICES, used when `toggle` selects
            preset voices.
        toggle: Voice-source choice; RADIO_CHOICES[0] selects a preset voice.

    Returns:
        Path (str) of the enhanced ``.wav`` file.

    Raises:
        gr.Error: If the text is empty, a required audio sample is missing,
            the emotion name is unknown, or the uploaded emotional and neutral
            samples yield no usable direction.
    """
    print('text', text)
    print('emo_dir', emo_dir)
    print('source_path', source_path)
    print('emo_path', emo_path)
    print('neutral_path', neutral_path)
    print('strength', strength)
    print('top_p', top_p)
    print('guidance_scale', guidance_scale)

    if not text:
        raise gr.Error("Please provide text to synthesise")

    # Voice to clone: preset URL or the user's upload.
    if toggle == RADIO_CHOICES[0]:
        source_path = PRESET_VOICES[preset_dropdown]
    if not source_path:
        # An empty upload arrives as None; fail with a readable message
        # instead of crashing inside get_cached_file.
        raise gr.Error("Please choose a preset voice or upload a voice sample to clone")
    source_emb = _load_speaker_embedding(source_path)

    # Emotion direction: computed from an uploaded pair or looked up by name.
    if emo_dir == EMO_NAMES[0]:
        if not emo_path or not neutral_path:
            raise gr.Error(
                "Please upload both an emotional and a neutral sample, or choose a preset emotion"
            )
        emo_emb = _load_speaker_embedding(emo_path)
        neutral_emb = _load_speaker_embedding(neutral_path)
        direction = emo_emb - neutral_emb
        direction_norm = torch.norm(direction, p=2)
        # Identical clips give a zero vector; dividing by its norm would fill
        # the speaker embedding with NaNs. `not (> 0)` also rejects NaN.
        if not float(direction_norm) > 0.0:
            raise gr.Error(
                "The emotional and neutral samples produce the same embedding; please upload two different samples"
            )
        direction = direction / direction_norm
    elif emo_dir in ALL_EMO_DIRS:
        direction = torch.tensor(ALL_EMO_DIRS[emo_dir], device=_device, dtype=precision)
    else:
        raise gr.Error(f"Unknown emotion: {emo_dir}")

    edited_emb = source_emb + strength * direction
    edited_emb = edited_emb.to(device=_device, dtype=precision)

    temperature = 1.0
    text = normalize_text(text)

    # first stage LLM
    tokens = main(
        model=model,
        tokenizer=tokenizer,
        model_size=model_size,
        prompt=text,
        spk_emb=edited_emb,
        top_p=torch.tensor(top_p, device=_device, dtype=precision),
        guidance_scale=torch.tensor(guidance_scale, device=_device, dtype=precision),
        temperature=torch.tensor(temperature, device=_device, dtype=precision),
    )
    _, extracted_audio_ids = first_stage_adapter.decode([tokens])

    b_speaker_embs = edited_emb.unsqueeze(0)

    # second stage LLM + multi-band diffusion model
    wav_files = llm_second_stage(
        texts=[text],
        encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=_device).unsqueeze(0)],
        speaker_embs=b_speaker_embs,
        batch_size=1,
        guidance_scale=None,
        top_p=None,
        top_k=200,
        temperature=1.0,
        max_new_tokens=None,
    )

    # Enhance into a temporary file, then overwrite the generated wav with it.
    wav_file = wav_files[0]
    output_path = str(wav_file) + ".wav"
    with tempfile.NamedTemporaryFile(suffix=".wav") as enhanced_tmp:
        enhancer(output_path, enhanced_tmp.name)
        shutil.copy2(enhanced_tmp.name, output_path)
        print(f"\nSaved audio to {wav_file}.wav")

    return output_path
# Preset emotion directions, keyed by emotion name.
# NOTE(security): unpickling executes arbitrary code, so 'all_emo_dirs.pkl'
# must only ever be a trusted file shipped with the app.
with open('all_emo_dirs.pkl', 'rb') as _emo_dirs_file:
    ALL_EMO_DIRS = pickle.load(_emo_dirs_file)

# The first entry selects "compute the direction from uploaded samples";
# the rest are the preset emotion names.
EMO_NAMES = ['Upload your own sample'] + list(ALL_EMO_DIRS.keys())
# The first entry selects a preset voice, the second a user upload.
RADIO_CHOICES = ["Preset voices", "Upload your voice"]
MAX_CHARS = 220
PRESET_VOICES = {
    # female
    "Bria": "https://cdn.themetavoice.xyz/speakers%2Fbria.mp3",
    # male
    "Alex": "https://cdn.themetavoice.xyz/speakers/alex.mp3",
    "Jacob": "https://cdn.themetavoice.xyz/speakers/jacob.wav",
}
def denormalise_top_p(top_p):
    """Shift a slider value onto the top_p scale.

    Inputs in [0, 10] map to [0.9, 1.0]; the result is rounded to two
    decimal places.
    """
    offset = top_p / 100
    return round(0.9 + offset, 2)
def denormalise_guidance(guidance):
    """Linearly rescale guidance from the [1, 5] scale to [1.0, 3.0]."""
    in_low, in_high = 1, 5
    out_low, out_high = 1, 3
    scaled = (guidance - in_low) * (out_high - out_low)
    return out_low + scaled / (in_high - in_low)
def _check_file_size(path):
if not path:
return
filesize = os.path.getsize(path)
filesize_mb = filesize / 1024 / 1024
if filesize_mb >= 50:
raise gr.Error(f"Please upload a sample less than 20MB for voice cloning. Provided: {round(filesize_mb)} MB")
def _handle_edge_cases(to_say, upload_target):
    """Validate the text and the optional uploaded voice sample.

    Raises gr.Error when the text is empty, shows a gr.Warning when it is
    longer than MAX_CHARS, and runs the audio and size checks on
    `upload_target` when one is given.
    """
    if not to_say:
        raise gr.Error("Please provide text to synthesise")

    n_chars = len(to_say)
    if n_chars > MAX_CHARS:
        # Warning only: this function does not truncate the text itself.
        gr.Warning(
            f"Max {MAX_CHARS} characters allowed. Provided: {n_chars} characters. Truncating and generating speech...Result at the end can be unstable as a result."
        )

    if upload_target:
        # Original note: checks the file duration to be at least 30s.
        check_audio_file(upload_target)
        _check_file_size(upload_target)
def tts(to_say, top_p, guidance, toggle, preset_dropdown, upload_target):
    """Synthesise `to_say` with TTS_MODEL using normalised slider values.

    `top_p` and `guidance` are converted with denormalise_top_p and
    denormalise_guidance, the inputs are validated, the text is cut to
    MAX_CHARS, and the speaker reference is the preset voice when `toggle`
    is RADIO_CHOICES[0], otherwise `upload_target`.

    NOTE(review): TTS_MODEL is not defined in the visible part of this file
    and the submit button is wired to generate_sample, so this looks like
    leftover code — confirm before relying on it.

    Raises:
        gr.Error: Validation errors are passed through unchanged; any other
            failure is wrapped with its reason and chained to the cause.
    """
    try:
        d_top_p = denormalise_top_p(top_p)
        d_guidance = denormalise_guidance(guidance)
        _handle_edge_cases(to_say, upload_target)

        # Slicing leaves shorter strings untouched, so no length test is needed.
        to_say = to_say[:MAX_CHARS]
        if toggle == RADIO_CHOICES[0]:
            spk_ref_path = PRESET_VOICES[preset_dropdown]
        else:
            spk_ref_path = upload_target

        return TTS_MODEL.synthesise(
            text=to_say,
            spk_ref_path=spk_ref_path,
            top_p=d_top_p,
            guidance_scale=d_guidance,
        )
    except gr.Error:
        # Already user-facing; re-wrapping would prefix it with
        # "Something went wrong. Reason: ".
        raise
    except Exception as e:
        raise gr.Error(f"Something went wrong. Reason: {str(e)}") from e
def change_voice_selection_layout(choice):
    """Return visibility updates for the preset-voice row and the upload row.

    Exactly one of the two is visible: the first when `choice` is
    RADIO_CHOICES[0], otherwise the second.
    """
    use_preset = choice == RADIO_CHOICES[0]
    preset_row = gr.update(visible=use_preset)
    upload_row = gr.update(visible=not use_preset)
    return [preset_row, upload_row]
def change_emotion_selection_layout(choice):
    """Return a one-element visibility update for the emotion-upload row.

    The row is visible only when `choice` is EMO_NAMES[0].
    """
    wants_upload = choice == EMO_NAMES[0]
    return [gr.update(visible=wants_upload)]
# Page heading, rendered through gr.Markdown. The stray closing style tag
# that had no matching opening tag has been removed.
title = """
<h1 style="margin-top: 10px;" class="page-title">Demo for <span style="margin-left: 10px;background-color: #E0FEE4;padding: 15px;border-radius: 10px;">🎛️ EmoKnob</span></h1>
"""

# Introductory text shown under the teaser image. The blank line before the
# last sentence keeps it from being absorbed into the final bullet.
description = """
- While existing TTS services do not allow fine-grained control over emotions, EmoKnob allows users to control emotion in speech with few-shot samples.
- In this demo, you can select from a few preset voices and upload your own emotional samples to clone.
- You can then use a preset emotion or upload your own emotional-neutral sample pair to control emotions.
- You can adjust the strength of the emotion by using the slider.

EmoKnob uses [MetaVoice](https://github.com/metavoiceio/metavoice-src) as its voice cloning backbone.
"""
# Gradio UI: text and voice/emotion controls in the left column, generated
# audio in the right column, and a button that runs generate_sample.
with gr.Blocks(title="EmoKnob Demo") as demo:
    gr.Markdown(title)
    gr.Image("emo-knob-teaser-1.svg", show_label=False, container=False)

    with gr.Row():
        gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            to_say = gr.TextArea(
                label=f"What should I say!? (max {MAX_CHARS} characters).",
                lines=4,
                value="To be or not to be, that is the question.",
            )
            with gr.Row(), gr.Column():
                # voice settings
                # This value is passed straight to generate_sample as the
                # nucleus-sampling top_p, so the slider spans the usable
                # range [0.9, 1.0]. The previous 0-10 range with step 1.0
                # only offered 0 or values >= 1.
                top_p = gr.Slider(
                    value=0.95,
                    minimum=0.9,
                    maximum=1.0,
                    step=0.01,
                    label="Speech Stability - improves text following for a challenging speaker",
                )
                guidance = gr.Slider(
                    value=3.0,
                    minimum=1.0,
                    maximum=5.0,
                    step=1.0,
                    label="Speaker similarity - How closely to match speaker identity and speech style.",
                )
                strength = gr.Slider(
                    value=0.1,
                    minimum=0.0,
                    maximum=5.0,
                    step=0.01,
                    label="Strength - how strong the emotion is. Setting it to too large a value may result in unstable output.",
                )

                # voice select
                toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])

            with gr.Row(visible=True) as row_1:
                preset_dropdown = gr.Dropdown(
                    list(PRESET_VOICES), label="Preset voices", value=next(iter(PRESET_VOICES))
                )
                with gr.Accordion("Preview: Preset voices", open=False):
                    for label, path in PRESET_VOICES.items():
                        gr.Audio(value=path, label=label)

            with gr.Row(visible=False) as row_2:
                upload_target = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload a clean sample to clone.",
                )

            with gr.Row():
                emotion_name = gr.Radio(choices=EMO_NAMES, label="Emotion", value=EMO_NAMES[0])

            with gr.Row(visible=True) as row_3:
                upload_neutral = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload a neutral sample to compute the emotion direction. Should be same speaker as the emotional sample.",
                )
                upload_emo = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload an emotional sample to compute the emotion direction. Should be same speaker as the neutral sample.",
                )

            # Swap between the preset-voice row and the upload row.
            toggle.change(
                change_voice_selection_layout,
                inputs=toggle,
                outputs=[row_1, row_2],
            )
            # Hiding the emotion-upload row for preset emotions is disabled:
            # emotion_name.change(
            #     change_emotion_selection_layout,
            #     inputs=emotion_name,
            #     outputs=[row_3],
            # )

        with gr.Column():
            speech = gr.Audio(
                type="filepath",
                label="Model says...",
            )

    submit = gr.Button("Generate Speech")
    # The input order must match generate_sample's positional parameters.
    submit.click(
        fn=generate_sample,
        inputs=[to_say, emotion_name, upload_target, upload_emo, upload_neutral, strength, top_p, guidance, preset_dropdown, toggle],
        outputs=speech,
    )

demo.launch()