import os
import subprocess
import sys
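# Helper to force-reinstall a pinned dependency at startup (uninstall first, then install).
# Only used via the commented-out calls below.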
def install(package):
    if '==' in package:
package_name, package_version = package.split('==')
else:
package_name = package
package_version = None
try:
subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "-y", package_name])
print(f"Successfully uninstalled {package}")
except subprocess.CalledProcessError:
print(f"Package {package} was not installed, proceeding with installation")
subprocess.check_call([sys.executable, "-m", "pip", "install", package])
# install('pydantic==2.0.0')
# install('gradio==4.44.0')
# install('spacy==3.7')
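# The Space runs in "prod" mode by default; setting PROD_MODE=local points the Hugging Face
# caches at a local cluster path and adds a local ffmpeg build to PATH (see below).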
is_prod = True
if os.environ.get('PROD_MODE') == 'local':
is_prod = False
import pickle
import gradio as gr
if not is_prod:
os.environ['HF_HOME'] = '/proj/afosr/metavoice/cache'
os.environ['TRANSFORMERS_CACHE'] = '/proj/afosr/metavoice/cache'
os.environ['HF_DATASETS_CACHE'] = '/proj/afosr/metavoice/cache'
os.environ['HF_METRICS_CACHE'] = '/proj/afosr/metavoice/cache'
os.environ['HF_MODULES_CACHE'] = '/proj/afosr/metavoice/cache'
ffmpeg_path = '/home/hc3295/ffmpegg_build/bin'
os.environ['PATH'] += os.pathsep + ffmpeg_path
import shutil
import tempfile
import time
from pathlib import Path
import librosa
import torch
from huggingface_hub import snapshot_download
from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
from fam.llm.fast_inference_utils import build_model, main
from fam.llm.inference import (
EncodecDecoder,
InferenceConfig,
Model,
TiltedEncodec,
TrainedBPETokeniser,
get_cached_embedding,
get_cached_file,
get_enhancer,
)
from fam.llm.utils import (
check_audio_file,
get_default_dtype,
get_device,
normalize_text,
)
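# Synthesis pipeline (from fam / MetaVoice): a first-stage LLM predicts EnCodec tokens
# conditioned on the text and a speaker embedding, a second-stage model plus multi-band
# diffusion decodes them to a waveform, and an enhancer post-processes the audio.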
debug = False
if not debug:
model_name = "metavoiceio/metavoice-1B-v0.1"
seed = 1337
output_dir = "outputs"
_dtype = get_default_dtype()
_device = 'cuda:0'
_model_dir = snapshot_download(repo_id=model_name)
first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=1024)
os.makedirs(output_dir, exist_ok=True)
second_stage_ckpt_path = f"{_model_dir}/second_stage.pt"
config_second_stage = InferenceConfig(
ckpt_path=second_stage_ckpt_path,
num_samples=1,
seed=seed,
device=_device,
dtype=_dtype,
compile=False,
init_from="resume",
output_dir=output_dir,
)
data_adapter_second_stage = TiltedEncodec(end_of_audio_token=1024)
llm_second_stage = Model(
config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
)
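    # "df" selects the DeepFilterNet-based speech enhancer in fam.llm.enhancers, used to clean up the generated audio.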
enhancer = get_enhancer("df")
precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[_dtype]
model, tokenizer, smodel, model_size = build_model(
precision=precision,
checkpoint_path=Path(f"{_model_dir}/first_stage.pt"),
spk_emb_ckpt_path=Path(f"{_model_dir}/speaker_encoder.pt"),
device=_device,
compile=True,
compile_prefill=True,
)
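# EmoKnob generation: embed the reference voice, build an emotion direction (from a precomputed
# direction or an uploaded emotional/neutral pair), shift the speaker embedding by `strength`
# along that direction, and synthesise with the shifted embedding.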
def generate_sample(text, emo_dir = None, source_path = None, emo_path = None, neutral_path = None, strength = 0.1, top_p = 0.95, guidance_scale = 3.0, preset_dropdown = None, toggle = None):
print('text', text)
print('emo_dir', emo_dir)
print('source_path', source_path)
print('emo_path', emo_path)
print('neutral_path', neutral_path)
print('strength', strength)
print('top_p', top_p)
print('guidance_scale', guidance_scale)
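    # Resolve the reference voice (preset URL or uploaded sample) and embed it with the speaker encoder.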
if toggle == RADIO_CHOICES[0]:
source_path = PRESET_VOICES[preset_dropdown]
source_path = get_cached_file(source_path)
check_audio_file(source_path)
source_emb = get_cached_embedding(source_path, smodel).to(device=_device, dtype=precision)
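    # Build the emotion direction: either the L2-normalized difference between the embeddings of an
    # uploaded emotional/neutral pair (same speaker), or a precomputed direction selected by name.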
if emo_dir == EMO_NAMES[0]:
emo_path = get_cached_file(emo_path)
check_audio_file(emo_path)
emo_emb = get_cached_embedding(emo_path, smodel).to(device=_device, dtype=precision)
neutral_path = get_cached_file(neutral_path)
check_audio_file(neutral_path)
neutral_emb = get_cached_embedding(neutral_path, smodel).to(device=_device, dtype=precision)
emo_dir = emo_emb - neutral_emb
emo_dir = emo_dir / torch.norm(emo_dir, p=2)
else:
emo_dir = torch.tensor(ALL_EMO_DIRS[emo_dir], device=_device, dtype=precision)
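    # Apply the knob: shift the source speaker embedding along the emotion direction by `strength`.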
edited_emb = source_emb + strength * emo_dir
edited_emb = edited_emb.to(device=_device, dtype=precision)
temperature=1.0
text = normalize_text(text)
start = time.time()
# first stage LLM
tokens = main(
model=model,
tokenizer=tokenizer,
model_size=model_size,
prompt=text,
spk_emb=edited_emb,
top_p=torch.tensor(top_p, device=_device, dtype=precision),
guidance_scale=torch.tensor(guidance_scale, device=_device, dtype=precision),
temperature=torch.tensor(temperature, device=_device, dtype=precision),
)
text_ids, extracted_audio_ids = first_stage_adapter.decode([tokens])
b_speaker_embs = edited_emb.unsqueeze(0)
# second stage LLM + multi-band diffusion model
wav_files = llm_second_stage(
texts=[text],
encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=_device).unsqueeze(0)],
speaker_embs=b_speaker_embs,
batch_size=1,
guidance_scale=None,
top_p=None,
top_k=200,
temperature=1.0,
max_new_tokens=None,
)
wav_file = wav_files[0]
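    # Post-process: run the enhancer on the second-stage output and overwrite the wav in place.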
with tempfile.NamedTemporaryFile(suffix=".wav") as enhanced_tmp:
enhancer(str(wav_file) + ".wav", enhanced_tmp.name)
shutil.copy2(enhanced_tmp.name, str(wav_file) + ".wav")
print(f"\nSaved audio to {wav_file}.wav")
output_path = str(wav_file) + ".wav"
return output_path
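# Precomputed emotion direction vectors keyed by emotion name (presumably computed the same way
# as the upload path above); their keys populate the "Emotion" radio below.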
with open('all_emo_dirs.pkl', 'rb') as f:
    ALL_EMO_DIRS = pickle.load(f)
EMO_NAMES = ['Upload your own sample'] + list(ALL_EMO_DIRS.keys())
RADIO_CHOICES = ["Preset voices", "Upload your voice"]
MAX_CHARS = 220
PRESET_VOICES = {
# female
"Bria": "https://cdn.themetavoice.xyz/speakers%2Fbria.mp3",
# male
"Alex": "https://cdn.themetavoice.xyz/speakers/alex.mp3",
"Jacob": "https://cdn.themetavoice.xyz/speakers/jacob.wav",
}
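# Map raw slider positions onto the ranges the underlying model expects.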
def denormalise_top_p(top_p):
# returns top_p in the range [0.9, 1.0]
return round(0.9 + top_p / 100, 2)
def denormalise_guidance(guidance):
# returns guidance in the range [1.0, 3.0]
return 1 + ((guidance - 1) * (3 - 1)) / (5 - 1)
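# For example: denormalise_top_p(5.0) -> 0.95, denormalise_guidance(3.0) -> 2.0.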
def _check_file_size(path):
if not path:
return
filesize = os.path.getsize(path)
filesize_mb = filesize / 1024 / 1024
    if filesize_mb >= 50:
        raise gr.Error(f"Please upload a sample smaller than 50 MB for voice cloning. Provided: {round(filesize_mb)} MB")
def _handle_edge_cases(to_say, upload_target):
if not to_say:
raise gr.Error("Please provide text to synthesise")
if len(to_say) > MAX_CHARS:
gr.Warning(
f"Max {MAX_CHARS} characters allowed. Provided: {len(to_say)} characters. Truncating and generating speech...Result at the end can be unstable as a result."
)
if not upload_target:
return
    check_audio_file(upload_target)  # checks the file duration is at least 30s
_check_file_size(upload_target)
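# Note: tts() references TTS_MODEL, which is not defined in this file; it appears to be a leftover
# from the original MetaVoice demo. The "Generate Speech" button below uses generate_sample instead.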
def tts(to_say, top_p, guidance, toggle, preset_dropdown, upload_target):
try:
d_top_p = denormalise_top_p(top_p)
d_guidance = denormalise_guidance(guidance)
_handle_edge_cases(to_say, upload_target)
to_say = to_say if len(to_say) < MAX_CHARS else to_say[:MAX_CHARS]
return TTS_MODEL.synthesise(
text=to_say,
spk_ref_path=PRESET_VOICES[preset_dropdown] if toggle == RADIO_CHOICES[0] else upload_target,
top_p=d_top_p,
guidance_scale=d_guidance,
)
except Exception as e:
raise gr.Error(f"Something went wrong. Reason: {str(e)}")
def change_voice_selection_layout(choice):
if choice == RADIO_CHOICES[0]:
return [gr.update(visible=True), gr.update(visible=False)]
return [gr.update(visible=False), gr.update(visible=True)]
def change_emotion_selection_layout(choice):
if choice == EMO_NAMES[0]:
return [gr.update(visible=True)]
return [gr.update(visible=False)]
title = """
</style>
<h1 style="margin-top: 10px;" class="page-title">Demo for <span style="margin-left: 10px;background-color: #E0FEE4;padding: 15px;border-radius: 10px;">🎛️ EmoKnob</span></h1>
"""
description = """
- While existing TTS services do not allow fine-grained control over emotions, EmoKnob allows users to control emotion in speech with few-shot samples.
- In this demo, you can select from a few preset voices and upload your own emotional samples to clone.
- You can then use preset emotion or upload your own emotional-neutral sample pair to control emotions.
- You can adjust the strength of the emotion by using the slider.
EmoKnob is uses [MetaVoice](https://github.com/metavoiceio/metavoice-src) as voice cloning backbone.
"""
with gr.Blocks(title="EmoKnob Demo") as demo:
gr.Markdown(title)
gr.Image("emo-knob-teaser-1.svg", show_label=False, container=False)
with gr.Row():
gr.Markdown(description)
with gr.Row():
with gr.Column():
to_say = gr.TextArea(
label=f"What should I say!? (max {MAX_CHARS} characters).",
lines=4,
value="To be or not to be, that is the question.",
)
with gr.Row(), gr.Column():
# voice settings
top_p = gr.Slider(
value=0.95,
minimum=0.0,
maximum=10.0,
step=1.0,
label="Speech Stability - improves text following for a challenging speaker",
)
guidance = gr.Slider(
value=3.0,
minimum=1.0,
maximum=5.0,
step=1.0,
label="Speaker similarity - How closely to match speaker identity and speech style.",
)
strength = gr.Slider(
value=0.1,
minimum=0.0,
maximum=5.0,
step=0.01,
label="Strength - how strong the emotion is. Setting it to too large a value may result in unstable output.",
)
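                # Note: generate_sample receives these slider values as-is; the denormalise_*
                # helpers above are only applied in the (unused) tts() path.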
# voice select
toggle = gr.Radio(choices=RADIO_CHOICES, label="Choose voice", value=RADIO_CHOICES[0])
with gr.Row(visible=True) as row_1:
preset_dropdown = gr.Dropdown(
PRESET_VOICES.keys(), label="Preset voices", value=list(PRESET_VOICES.keys())[0]
)
with gr.Accordion("Preview: Preset voices", open=False):
for label, path in PRESET_VOICES.items():
gr.Audio(value=path, label=label)
with gr.Row(visible=False) as row_2:
upload_target = gr.Audio(
sources=["upload"],
type="filepath",
label="Upload a clean sample to clone.",
)
with gr.Row():
emotion_name = gr.Radio(choices=EMO_NAMES, label="Emotion", value=EMO_NAMES[0])
with gr.Row(visible=True) as row_3:
upload_neutral = gr.Audio(
sources=["upload"],
type="filepath",
label="Upload a neutral sample to compute the emotion direction. Should be same speaker as the emotional sample.",
)
upload_emo = gr.Audio(
sources=["upload"],
type="filepath",
label="Upload an emotional sample to compute the emotion direction. Should be same speaker as the neutral sample.",
)
toggle.change(
change_voice_selection_layout,
inputs=toggle,
outputs=[row_1, row_2],
)
# emotion_name.change(
# change_emotion_selection_layout,
# inputs=emotion_name,
# outputs=[row_3],
# )
with gr.Column():
speech = gr.Audio(
type="filepath",
label="Model says...",
)
submit = gr.Button("Generate Speech")
submit.click(
fn=generate_sample,
inputs=[to_say, emotion_name, upload_target, upload_emo, upload_neutral, strength, top_p, guidance, preset_dropdown, toggle],
outputs=speech,
)
demo.launch()