import gradio as gr
# Import the alias module before outetts to set up the whisper redirection
import alias as _alias
import outetts
import json
import tempfile
import hashlib
import os
from typing import Optional
from outetts.models.info import MODEL_INFO
from outetts.utils import helpers
from huggingface_hub import hf_hub_download
import torch
from transformers import BitsAndBytesConfig
import spaces
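
# `spaces` is the Hugging Face ZeroGPU helper package: functions decorated with
# @spaces.GPU (see create_speaker_and_generate below) are allocated a GPU only
# for the duration of each call.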
# Available OuteTTS models, keyed by their string value
MODELS = {v.value: v for _, v in outetts.Models.__members__.items()}

# Older, smaller checkpoints are fetched as FP16 GGUF builds
MODEL_QUANTIZATION = {
    outetts.Models.VERSION_0_1_SIZE_350M: outetts.LlamaCppQuantization.FP16,
    outetts.Models.VERSION_0_2_SIZE_500M: outetts.LlamaCppQuantization.FP16,
    outetts.Models.VERSION_0_3_SIZE_500M: outetts.LlamaCppQuantization.FP16,
}
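
# Any model missing from this table falls back to Q8_0 (see get_interface below).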

# In-memory cache of speaker profiles, keyed by interface version + file hash,
# so the same reference audio is not re-transcribed on every request
speaker_cache = {}

def get_file_hash(file_path):
    """Calculate the MD5 hash of a file for caching purposes."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
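
# Hashing the file contents (rather than the path) means byte-identical uploads
# share one cache entry; e.g., for two copies of the same clip:
#   get_file_hash("clip.wav") == get_file_hash("copy_of_clip.wav")  # True (hypothetical paths)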

def try_ggml_model(model: outetts.Models, backend: outetts.Backend, quantization: outetts.LlamaCppQuantization):
    """Download the GGUF build of a model and build a llama.cpp ModelConfig for it."""
    model_config = MODEL_INFO[model]
    repo = f"OuteAI/{model.value}-GGUF"
    filename = f"{model.value}-{quantization.value}.gguf"
    model_path = hf_hub_download(
        repo_id=repo,
        filename=filename,
        local_dir=os.path.join(helpers.get_cache_dir(), "gguf"),
        local_files_only=False,
    )
    # V3 interfaces use word-guided generation; older versions fall back to chunked decoding
    generation_type = outetts.GenerationType.CHUNKED
    if model_config['interface_version'] == outetts.InterfaceVersion.V3:
        generation_type = outetts.GenerationType.GUIDED_WORDS
    return outetts.ModelConfig(
        model_path=model_path,
        tokenizer_path=f"OuteAI/{model.value}",
        backend=backend,
        n_gpu_layers=99,
        verbose=False,
        device=None,
        dtype=None,
        additional_model_config={},
        audio_codec_path=None,
        generation_type=generation_type,
        **model_config,
    )
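
# For example (assuming LlamaCppQuantization.FP16.value == "FP16"), OuteTTS-0.3-500M
# at FP16 resolves to the file "OuteTTS-0.3-500M-FP16.gguf" in the Hub repo
# "OuteAI/OuteTTS-0.3-500M-GGUF".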

def get_interface(model_name: str):
    """Build an interface for the model (not cached, to avoid CUDA memory issues).

    Tries the GGUF/llama.cpp backend first and falls back to the HF backend
    (4-bit quantized on CUDA) if the GGUF download or setup fails.
    """
    model = MODELS[model_name]
    try:
        quantization = MODEL_QUANTIZATION.get(model, outetts.LlamaCppQuantization.Q8_0)
        config = try_ggml_model(model, outetts.Backend.LLAMACPP, quantization)
    except Exception:
        has_cuda = torch.cuda.is_available()
        model_config = MODEL_INFO[model]
        config = outetts.ModelConfig(
            model_path=f"OuteAI/{model_name}",
            tokenizer_path=f"OuteAI/{model_name}",
            backend=outetts.Backend.HF,
            additional_model_config={
                "device_map": "auto" if has_cuda else "cpu",
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True,
                    llm_int8_enable_fp32_cpu_offload=True,
                ) if has_cuda else None,
            },
            **model_config,
        )
    # Initialize the interface
    interface = outetts.Interface(config=config)
    return interface
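
# Design note: rebuilding the interface per request trades load time for a clean
# GPU-memory slate between calls, which matters on shared ZeroGPU hardware.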

def get_or_create_speaker(interface, audio_file):
    """Return the speaker profile from the cache, or create and cache a new one."""
    # Key the cache on interface version + file content hash
    file_hash = get_file_hash(audio_file)
    cache_key = f"{interface.config.interface_version}_{file_hash}"
    if cache_key in speaker_cache:
        print(f"✅ Using cached speaker profile for {os.path.basename(audio_file)}")
        return speaker_cache[cache_key]
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Create a new speaker profile (transcribes the reference audio with Whisper)
    print(f"🔄 Creating new speaker profile for {os.path.basename(audio_file)}")
    try:
        speaker = interface.create_speaker(audio_file, whisper_model="large-v3-turbo", whisper_device=device)
        # Cache the speaker profile
        speaker_cache[cache_key] = speaker
        print(f"💾 Cached speaker profile ({len(speaker_cache)} total cached)")
        return speaker
    except Exception as e:
        return f"❌ Error creating speaker profile: {str(e)}"
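
# Note: speaker_cache grows without bound for the life of the process; if that
# becomes an issue, a capped LRU structure would be a drop-in replacement since
# entries are only read by exact key.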

@spaces.GPU  # Request a ZeroGPU slot for the duration of each call
def create_speaker_and_generate(model_name, audio_file, test_text: Optional[str] = None, temperature: float = 0.4):
    """Create a speaker profile from audio and optionally generate test audio."""
    if audio_file is None:
        # Return default values for startup/caching purposes
        return "Please upload an audio file to create a speaker profile.", None
    # Build the interface (not cached, to avoid CUDA memory issues)
    interface = get_interface(model_name)
    # Get or create the speaker profile (cached by file hash)
    speaker_result = get_or_create_speaker(interface, audio_file)
    # get_or_create_speaker returns an error string on failure
    if isinstance(speaker_result, str) and speaker_result.startswith("❌"):
        return speaker_result, None
    # Convert the speaker dict to formatted JSON
    speaker_json = json.dumps(speaker_result, indent=2, ensure_ascii=False)
    # Generate test audio if text was provided
    generated_audio = None
    if test_text and test_text.strip():
        output = interface.generate(
            config=outetts.GenerationConfig(
                text=test_text,
                speaker=speaker_result,
                sampler_config=outetts.SamplerConfig(
                    temperature=temperature
                ),
                max_length=MODEL_INFO[MODELS[model_name]]["max_seq_length"],
            )
        )
        # Save to a temporary file so Gradio can serve it
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output.save(f.name)
            generated_audio = f.name
    return speaker_json, generated_audio
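
# Programmatic use, mirroring the UI (hypothetical reference file):
#   profile_json, wav_path = create_speaker_and_generate(
#       "OuteTTS-0.3-500M", "reference.wav", "Hello there!", temperature=0.4)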

example_text = "Hello, this is a test of the OuteTTS speaker profile."

# Create the Gradio interface
demo = gr.Interface(
    fn=create_speaker_and_generate,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[-1],
            label="Select OuteTTS Model",
            info="Choose the model variant to use"
        ),
        gr.Audio(
            label="Upload Reference Audio (Max 20 seconds)",
            type="filepath",
            sources=["upload", "microphone"]
        ),
        gr.Textbox(
            label="Test Text (Optional)",
            placeholder="Enter text to generate speech (leave empty to only create a speaker profile)...",
            lines=3,
            value=None
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            step=0.1,
            value=0.4,
            label="Temperature",
            info="Controls randomness in generation"
        )
    ],
    outputs=[
        gr.Textbox(
            label="Speaker Profile (JSON)",
            lines=15,
            max_lines=20,
            show_copy_button=True
        ),
        gr.Audio(
            label="Generated Test Audio (if text provided)",
            type="filepath"
        )
    ],
    title="🎙️ OuteTTS Speaker Creator",
    description="Create and manage speaker profiles for OuteTTS text-to-speech synthesis. Upload audio to create a speaker profile, and optionally provide test text to generate sample audio.",
    theme=gr.themes.Soft(),
    examples=[
        ["OuteTTS-1.0-0.6B", None, example_text, 0.2],
        ["OuteTTS-0.3-500M", None, example_text, 0.2],
    ],
    cache_examples=False,
    flagging_mode="never"
)
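
# The bundled examples pass audio_file=None on purpose: selecting one returns the
# "please upload" prompt instead of running transcription on a canned file.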

if __name__ == "__main__":
    # Launch with a configuration suited to Hugging Face Spaces
    demo.launch(
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,
        share=False,            # Set to True for a public link
        show_api=True,          # Expose API documentation
        show_error=True         # Show detailed error messages
    )
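
# With show_api=True the same function is reachable over the Gradio client API,
# e.g. (hypothetical Space id):
#   from gradio_client import Client, handle_file
#   client = Client("your-username/outetts-speaker-creator")
#   profile_json, wav = client.predict(
#       "OuteTTS-0.3-500M", handle_file("reference.wav"), "Hello!", 0.4,
#       api_name="/predict")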