import gradio as gr
# Import the alias module before outetts to set up the whisper redirection
import alias as _alias
import outetts
import json
import tempfile
import hashlib
import os
from typing import Optional
from outetts.models.info import MODEL_INFO
from outetts.utils import helpers
from huggingface_hub import hf_hub_download
import torch
from transformers import BitsAndBytesConfig
import spaces
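
# `spaces` is the Hugging Face ZeroGPU helper package: functions decorated with
# @spaces.GPU (see create_speaker_and_generate below) are allocated a GPU only
# for the duration of each call.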
# Available OuteTTS models, keyed by their string value
MODELS = {v.value: v for _, v in outetts.Models.__members__.items()}

# Older, smaller checkpoints are fetched as FP16 GGUF builds
MODEL_QUANTIZATION = {
    outetts.Models.VERSION_0_1_SIZE_350M: outetts.LlamaCppQuantization.FP16,
    outetts.Models.VERSION_0_2_SIZE_500M: outetts.LlamaCppQuantization.FP16,
    outetts.Models.VERSION_0_3_SIZE_500M: outetts.LlamaCppQuantization.FP16,
}
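
# Any model missing from this table falls back to Q8_0 (see get_interface below).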

# In-memory cache of speaker profiles, keyed by interface version + file hash,
# so the same reference audio is not re-transcribed on every request
speaker_cache = {}

def get_file_hash(file_path):
    """Calculate the MD5 hash of a file for caching purposes."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
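
# Hashing the file contents (rather than the path) means byte-identical uploads
# share one cache entry; e.g., for two copies of the same clip:
#   get_file_hash("clip.wav") == get_file_hash("copy_of_clip.wav")  # True (hypothetical paths)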

def try_ggml_model(model: outetts.Models, backend: outetts.Backend, quantization: outetts.LlamaCppQuantization):
    """Download the GGUF build of a model and build a llama.cpp ModelConfig for it."""
    model_config = MODEL_INFO[model]
    repo = f"OuteAI/{model.value}-GGUF"
    filename = f"{model.value}-{quantization.value}.gguf"
    model_path = hf_hub_download(
        repo_id=repo,
        filename=filename,
        local_dir=os.path.join(helpers.get_cache_dir(), "gguf"),
        local_files_only=False,
    )
    # V3 interfaces use word-guided generation; older versions fall back to chunked decoding
    generation_type = outetts.GenerationType.CHUNKED
    if model_config['interface_version'] == outetts.InterfaceVersion.V3:
        generation_type = outetts.GenerationType.GUIDED_WORDS
    return outetts.ModelConfig(
        model_path=model_path,
        tokenizer_path=f"OuteAI/{model.value}",
        backend=backend,
        n_gpu_layers=99,
        verbose=False,
        device=None,
        dtype=None,
        additional_model_config={},
        audio_codec_path=None,
        generation_type=generation_type,
        **model_config,
    )
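
# For example (assuming LlamaCppQuantization.FP16.value == "FP16"), OuteTTS-0.3-500M
# at FP16 resolves to the file "OuteTTS-0.3-500M-FP16.gguf" in the Hub repo
# "OuteAI/OuteTTS-0.3-500M-GGUF".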

def get_interface(model_name: str):
    """Build an interface for the model (not cached, to avoid CUDA memory issues).

    Tries the GGUF/llama.cpp backend first and falls back to the HF backend
    (4-bit quantized on CUDA) if the GGUF download or setup fails.
    """
    model = MODELS[model_name]
    try:
        quantization = MODEL_QUANTIZATION.get(model, outetts.LlamaCppQuantization.Q8_0)
        config = try_ggml_model(model, outetts.Backend.LLAMACPP, quantization)
    except Exception:
        has_cuda = torch.cuda.is_available()
        model_config = MODEL_INFO[model]
        config = outetts.ModelConfig(
            model_path=f"OuteAI/{model_name}",
            tokenizer_path=f"OuteAI/{model_name}",
            backend=outetts.Backend.HF,
            additional_model_config={
                "device_map": "auto" if has_cuda else "cpu",
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True,
                    llm_int8_enable_fp32_cpu_offload=True,
                ) if has_cuda else None,
            },
            **model_config,
        )
    # Initialize the interface
    interface = outetts.Interface(config=config)
    return interface
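
# Design note: rebuilding the interface per request trades load time for a clean
# GPU-memory slate between calls, which matters on shared ZeroGPU hardware.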

def get_or_create_speaker(interface, audio_file):
    """Return the speaker profile from the cache, or create and cache a new one."""
    # Key the cache on interface version + file content hash
    file_hash = get_file_hash(audio_file)
    cache_key = f"{interface.config.interface_version}_{file_hash}"
    if cache_key in speaker_cache:
        print(f"✅ Using cached speaker profile for {os.path.basename(audio_file)}")
        return speaker_cache[cache_key]
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Create a new speaker profile (transcribes the reference audio with Whisper)
    print(f"🔄 Creating new speaker profile for {os.path.basename(audio_file)}")
    try:
        speaker = interface.create_speaker(audio_file, whisper_model="large-v3-turbo", whisper_device=device)
        # Cache the speaker profile
        speaker_cache[cache_key] = speaker
        print(f"💾 Cached speaker profile ({len(speaker_cache)} total cached)")
        return speaker
    except Exception as e:
        return f"❌ Error creating speaker profile: {str(e)}"
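
# Note: speaker_cache grows without bound for the life of the process; if that
# becomes an issue, a capped LRU structure would be a drop-in replacement since
# entries are only read by exact key.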

@spaces.GPU  # Request a ZeroGPU slot for the duration of each call
def create_speaker_and_generate(model_name, audio_file, test_text: Optional[str] = None, temperature: float = 0.4):
    """Create a speaker profile from audio and optionally generate test audio."""
    if audio_file is None:
        # Return default values for startup/caching purposes
        return "Please upload an audio file to create a speaker profile.", None
    # Build the interface (not cached, to avoid CUDA memory issues)
    interface = get_interface(model_name)
    # Get or create the speaker profile (cached by file hash)
    speaker_result = get_or_create_speaker(interface, audio_file)
    # get_or_create_speaker returns an error string on failure
    if isinstance(speaker_result, str) and speaker_result.startswith("❌"):
        return speaker_result, None
    # Convert the speaker dict to formatted JSON
    speaker_json = json.dumps(speaker_result, indent=2, ensure_ascii=False)
    # Generate test audio if text was provided
    generated_audio = None
    if test_text and test_text.strip():
        output = interface.generate(
            config=outetts.GenerationConfig(
                text=test_text,
                speaker=speaker_result,
                sampler_config=outetts.SamplerConfig(
                    temperature=temperature
                ),
                max_length=MODEL_INFO[MODELS[model_name]]["max_seq_length"],
            )
        )
        # Save to a temporary file so Gradio can serve it
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output.save(f.name)
            generated_audio = f.name
    return speaker_json, generated_audio
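
# Programmatic use, mirroring the UI (hypothetical reference file):
#   profile_json, wav_path = create_speaker_and_generate(
#       "OuteTTS-0.3-500M", "reference.wav", "Hello there!", temperature=0.4)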

example_text = "Hello, this is a test of the OuteTTS speaker profile."

# Create the Gradio interface
demo = gr.Interface(
    fn=create_speaker_and_generate,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[-1],
            label="Select OuteTTS Model",
            info="Choose the model variant to use"
        ),
        gr.Audio(
            label="Upload Reference Audio (Max 20 seconds)",
            type="filepath",
            sources=["upload", "microphone"]
        ),
        gr.Textbox(
            label="Test Text (Optional)",
            placeholder="Enter text to generate speech (leave empty to only create a speaker profile)...",
            lines=3,
            value=None
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            step=0.1,
            value=0.4,
            label="Temperature",
            info="Controls randomness in generation"
        )
    ],
    outputs=[
        gr.Textbox(
            label="Speaker Profile (JSON)",
            lines=15,
            max_lines=20,
            show_copy_button=True
        ),
        gr.Audio(
            label="Generated Test Audio (if text provided)",
            type="filepath"
        )
    ],
    title="🎙️ OuteTTS Speaker Creator",
    description="Create and manage speaker profiles for OuteTTS text-to-speech synthesis. Upload audio to create a speaker profile, and optionally provide test text to generate sample audio.",
    theme=gr.themes.Soft(),
    examples=[
        ["OuteTTS-1.0-0.6B", None, example_text, 0.2],
        ["OuteTTS-0.3-500M", None, example_text, 0.2],
    ],
    cache_examples=False,
    flagging_mode="never"
)
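
# The bundled examples pass audio_file=None on purpose: selecting one returns the
# "please upload" prompt instead of running transcription on a canned file.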

if __name__ == "__main__":
    # Launch with a configuration suited to Hugging Face Spaces
    demo.launch(
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,
        share=False,            # Set to True for a public link
        show_api=True,          # Expose API documentation
        show_error=True         # Show detailed error messages
    )
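
# With show_api=True the same function is reachable over the Gradio client API,
# e.g. (hypothetical Space id):
#   from gradio_client import Client, handle_file
#   client = Client("your-username/outetts-speaker-creator")
#   profile_json, wav = client.predict(
#       "OuteTTS-0.3-500M", handle_file("reference.wav"), "Hello!", 0.4,
#       api_name="/predict")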