# hans00's picture
# Tip error when create speaker failed
# 216ef9a unverified
import gradio as gr
# Import alias module before outetts to setup whisper redirection
import alias as _alias
import outetts
import json
import tempfile
import hashlib
import os
from typing import Optional
from outetts.models.info import MODEL_INFO
from outetts.utils import helpers
from huggingface_hub import hf_hub_download
import torch
from transformers import BitsAndBytesConfig
import spaces
# Lookup table from each OuteTTS model identifier string (the enum value)
# to its ``outetts.Models`` member.
MODELS = {member.value: member for member in outetts.Models.__members__.values()}

# Models that are loaded with FP16 GGUF weights. ``get_interface`` falls back
# to Q8_0 for any model that is not listed here.
MODEL_QUANTIZATION = dict.fromkeys(
    (
        outetts.Models.VERSION_0_1_SIZE_350M,
        outetts.Models.VERSION_0_2_SIZE_500M,
        outetts.Models.VERSION_0_3_SIZE_500M,
    ),
    outetts.LlamaCppQuantization.FP16,
)

# Speaker profiles keyed by "<interface version>_<audio file MD5>", kept so the
# same reference audio is not transcribed again.
speaker_cache: dict = {}
def get_file_hash(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte pieces so the
    whole content never has to sit in memory at once. The digest is only used
    as a cache key for speaker profiles.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        # An empty bytes object marks end-of-file and ends the loop.
        while block := stream.read(4096):
            digest.update(block)
    return digest.hexdigest()
def try_ggml_model(model: outetts.Models, backend: outetts.Backend, quantization: outetts.LlamaCppQuantization):
    """Download the GGUF weights for ``model`` and build a ``ModelConfig`` for them.

    The weights file ``<model>-<quantization>.gguf`` is fetched from the
    ``OuteAI/<model>-GGUF`` repository into a ``gguf`` folder below the outetts
    cache directory. Any exception raised by the download or by the config
    construction propagates to the caller.

    Args:
        model: Model whose GGUF build should be used.
        backend: Backend to record in the returned configuration.
        quantization: Quantization variant that selects the GGUF file.

    Returns:
        An ``outetts.ModelConfig`` pointing at the downloaded file.
    """
    info = MODEL_INFO[model]
    model_id = model.value

    gguf_path = hf_hub_download(
        repo_id=f"OuteAI/{model_id}-GGUF",
        filename=f"{model_id}-{quantization.value}.gguf",
        local_dir=os.path.join(helpers.get_cache_dir(), "gguf"),
        local_files_only=False,
    )

    # V3 interfaces generate with guided words; every other version is chunked.
    uses_v3_interface = info['interface_version'] == outetts.InterfaceVersion.V3
    if uses_v3_interface:
        generation_type = outetts.GenerationType.GUIDED_WORDS
    else:
        generation_type = outetts.GenerationType.CHUNKED

    return outetts.ModelConfig(
        model_path=gguf_path,
        tokenizer_path=f"OuteAI/{model_id}",
        backend=backend,
        n_gpu_layers=99,
        verbose=False,
        device=None,
        dtype=None,
        additional_model_config={},
        audio_codec_path=None,
        generation_type=generation_type,
        **info,
    )
def get_interface(model_name: str):
    """Get interface instance for the model (no caching to avoid CUDA memory issues).

    The GGUF build of the model is tried first through the llama.cpp backend,
    using the quantization from ``MODEL_QUANTIZATION`` (Q8_0 when the model is
    not listed). If that fails for any ordinary error, the Hugging Face backend
    is used instead; on CUDA machines the weights are then loaded in 4-bit.

    Args:
        model_name: Key of ``MODELS`` identifying the model to load.

    Returns:
        A freshly constructed ``outetts.Interface``.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
    """
    model = MODELS[model_name]
    try:
        quantization = MODEL_QUANTIZATION.get(model, outetts.LlamaCppQuantization.Q8_0)
        config = try_ggml_model(model, outetts.Backend.LLAMACPP, quantization)
    except Exception as exc:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed, and report why the
        # GGUF path was abandoned instead of failing over silently.
        print(f"GGUF model unavailable for {model_name} ({exc}); falling back to the Hugging Face backend")
        has_cuda = torch.cuda.is_available()
        model_config = MODEL_INFO[model]
        config = outetts.ModelConfig(
            model_path=f"OuteAI/{model_name}",
            tokenizer_path=f"OuteAI/{model_name}",
            backend=outetts.Backend.HF,
            additional_model_config={
                "device_map": "auto" if has_cuda else "cpu",
                # 4-bit quantization is only requested when CUDA is available.
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True,
                    llm_int8_enable_fp32_cpu_offload=True
                ) if has_cuda else None,
            },
            **model_config
        )
    # Initialize the interface
    return outetts.Interface(config=config)
def get_or_create_speaker(interface, audio_file):
    """Return the speaker profile for ``audio_file``, building it on a cache miss.

    Profiles are stored in ``speaker_cache`` under a key made of the
    interface version and the MD5 digest of the audio file.

    Args:
        interface: Object providing ``config.interface_version`` and
            ``create_speaker``.
        audio_file: Path of the reference audio file.

    Returns:
        The cached or newly created speaker profile. If creating the profile
        raises, an error message string is returned instead of raising.
    """
    digest = get_file_hash(audio_file)
    cache_key = f"{interface.config.interface_version}_{digest}"
    file_label = os.path.basename(audio_file)

    # Fast path: this audio was already processed for this interface version.
    try:
        cached_profile = speaker_cache[cache_key]
    except KeyError:
        pass
    else:
        print(f"βœ… Using cached speaker profile for {file_label}")
        return cached_profile

    whisper_device = "cuda" if torch.cuda.is_available() else "cpu"

    print(f"πŸ”„ Creating new speaker profile for {file_label}")
    try:
        profile = interface.create_speaker(
            audio_file,
            whisper_model="large-v3-turbo",
            whisper_device=whisper_device,
        )
        # Remember the profile so the same audio is not processed again.
        speaker_cache[cache_key] = profile
        print(f"πŸ’Ύ Cached speaker profile ({len(speaker_cache)} total cached)")
    except Exception as err:
        # Failures are reported to the caller as a message, not an exception.
        return f"❌ Error creating speaker profile: {err!s}"
    return profile
@spaces.GPU
def create_speaker_and_generate(model_name, audio_file, test_text: Optional[str] = None, temperature: float = 0.4):
    """Create speaker from audio and optionally generate test audio.

    Args:
        model_name: Key of ``MODELS`` selecting the OuteTTS model.
        audio_file: Path of the reference audio, or ``None`` when nothing was
            uploaded.
        test_text: Optional text to synthesize with the new speaker profile.
            Empty or whitespace-only text skips generation.
        temperature: Sampling temperature passed to the generation sampler.

    Returns:
        A ``(text, audio_path)`` tuple. ``text`` is the speaker profile as
        formatted JSON, or a message when no audio was given or profile
        creation failed. ``audio_path`` is the generated WAV file path, or
        ``None`` when nothing was generated.
    """
    if audio_file is None:
        # Return default values for startup/caching purposes
        return "Please upload an audio file to create a speaker profile.", None

    # Get interface (no caching to avoid CUDA memory issues)
    interface = get_interface(model_name)

    # Get or create speaker profile (with caching)
    speaker_result = get_or_create_speaker(interface, audio_file)

    # get_or_create_speaker signals failure by returning a message string
    # instead of a profile. Any string is treated as that message, so the
    # check does not hinge on the exact prefix characters of the message.
    if isinstance(speaker_result, str):
        return speaker_result, None

    # Convert speaker dict to formatted JSON
    speaker_json = json.dumps(speaker_result, indent=2, ensure_ascii=False)

    # Only generate test audio when non-blank text was provided
    if not (test_text and test_text.strip()):
        return speaker_json, None

    output = interface.generate(
        config=outetts.GenerationConfig(
            text=test_text,
            speaker=speaker_result,
            sampler_config=outetts.SamplerConfig(
                temperature=temperature
            ),
            max_length=MODEL_INFO[MODELS[model_name]]["max_seq_length"]
        )
    )

    # Reserve a temporary file name, then close the handle before saving:
    # writing to a path that is still held open fails on some platforms.
    # delete=False keeps the file so the returned path stays valid.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        generated_audio = tmp.name
    output.save(generated_audio)

    return speaker_json, generated_audio
# Sample sentence used by the example rows below.
example_text = "Hello, this is a test of the OuteTTS speaker profile."

# Input components, in the positional order expected by
# create_speaker_and_generate: model name, audio file, test text, temperature.
_model_names = list(MODELS)
_model_dropdown = gr.Dropdown(
    choices=_model_names,
    value=_model_names[-1],  # the last registered model is preselected
    label="Select OuteTTS Model",
    info="Choose the model variant to use",
)
_reference_audio = gr.Audio(
    label="Upload Reference Audio (Max 20 seconds)",
    type="filepath",
    sources=["upload", "microphone"],
)
_test_text_box = gr.Textbox(
    label="Test Text (Optional)",
    placeholder="Enter text to generate speech (leave empty to only create speaker profile)...",
    lines=3,
    value=None,
)
_temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    step=0.1,
    value=0.4,
    label="Temperature",
    info="Controls randomness in generation",
)

# Output components: the speaker profile JSON text and the generated audio.
_profile_box = gr.Textbox(
    label="Speaker Profile (JSON)",
    lines=15,
    max_lines=20,
    show_copy_button=True,
)
_generated_audio = gr.Audio(
    label="Generated Test Audio (if text provided)",
    type="filepath",
)

# Create the Gradio interface
demo = gr.Interface(
    fn=create_speaker_and_generate,
    inputs=[_model_dropdown, _reference_audio, _test_text_box, _temperature_slider],
    outputs=[_profile_box, _generated_audio],
    title="πŸŽ™οΈ OuteTTS Speaker Creator",
    description="Create and manage speaker profiles for OuteTTS text-to-speech synthesis. Upload audio to create a speaker profile, and optionally provide test text to generate sample audio.",
    theme=gr.themes.Soft(),
    # Example rows carry no audio file; they only prefill model, text and temperature.
    examples=[
        ["OuteTTS-1.0-0.6B", None, example_text, 0.2],
        ["OuteTTS-0.3-500M", None, example_text, 0.2],
    ],
    cache_examples=False,
    flagging_mode="never",
)
if __name__ == "__main__":
    # Launch settings intended for HuggingFace Spaces.
    launch_options = {
        "server_name": "0.0.0.0",  # Allow external connections
        "server_port": 7860,
        "share": False,  # Set to True if you want a public link
        "show_api": True,  # Show API documentation
        "show_error": True,  # Show detailed error messages
    }
    demo.launch(**launch_options)