import gradio as gr
# Import the alias module before outetts to set up the whisper redirection
import alias as _alias
import outetts
import json
import tempfile
import hashlib
import os
from typing import Optional
from outetts.models.info import MODEL_INFO
from outetts.utils import helpers
from huggingface_hub import hf_hub_download
import torch
from transformers import BitsAndBytesConfig
import spaces

# All available OuteTTS models, keyed by model name (built from the outetts.Models enum)
MODELS = {v.value: v for _, v in outetts.Models.__members__.items()}

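# Models with known FP16 GGUF builds; everything else falls back to Q8_0 (see get_interface)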
MODEL_QUANTIZATION = {
    outetts.Models.VERSION_0_1_SIZE_350M: outetts.LlamaCppQuantization.FP16,
    outetts.Models.VERSION_0_2_SIZE_500M: outetts.LlamaCppQuantization.FP16,
    outetts.Models.VERSION_0_3_SIZE_500M: outetts.LlamaCppQuantization.FP16,
}

# Cache for speaker profiles to avoid re-transcribing the same audio
speaker_cache = {}

def get_file_hash(file_path):
    """Calculate MD5 hash of a file for caching purposes."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

def try_ggml_model(model: outetts.Models, backend: outetts.Backend, quantization: outetts.LlamaCppQuantization):
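    """Download the requested GGUF quantization from the Hugging Face Hub and
    build a llama.cpp-backed ModelConfig for it."""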
    model_config = MODEL_INFO[model]
    repo = f"OuteAI/{model.value}-GGUF"
    filename = f"{model.value}-{quantization.value}.gguf"
    model_path = hf_hub_download(
        repo_id=repo,
        filename=filename,
        local_dir=os.path.join(helpers.get_cache_dir(), "gguf"),
        local_files_only=False
    )
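    # V3 interfaces use word-level guided generation; earlier versions fall back to chunked generation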
    generation_type = outetts.GenerationType.CHUNKED
    if model_config['interface_version'] == outetts.InterfaceVersion.V3:
        generation_type = outetts.GenerationType.GUIDED_WORDS
    return outetts.ModelConfig(
        model_path=model_path,
        tokenizer_path=f"OuteAI/{model.value}",
        backend=backend,
        n_gpu_layers=99,  # offload effectively all layers to the GPU when one is available
        verbose=False,
        device=None,
        dtype=None,
        additional_model_config={},
        audio_codec_path=None,
        generation_type=generation_type,
        **model_config
    )

def get_interface(model_name: str):
    """Get interface instance for the model (no caching to avoid CUDA memory issues)."""
    model = MODELS[model_name]

    try:
        quantization = MODEL_QUANTIZATION.get(model, outetts.LlamaCppQuantization.Q8_0)
        config = try_ggml_model(model, outetts.Backend.LLAMACPP, quantization)
    except Exception:
        # GGUF setup failed (e.g. download error or llama.cpp unavailable);
        # fall back to the HF transformers backend.
        has_cuda = torch.cuda.is_available()
        model_config = MODEL_INFO[model]
        config = outetts.ModelConfig(
            model_path=f"OuteAI/{model_name}",
            tokenizer_path=f"OuteAI/{model_name}",
            backend=outetts.Backend.HF,
            additional_model_config={
                "device_map": "auto" if has_cuda else "cpu",
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True,
                    llm_int8_enable_fp32_cpu_offload=True
                ) if has_cuda else None,
            },
            **model_config
        )
    
    # Initialize the interface
    interface = outetts.Interface(config=config)
    return interface

def get_or_create_speaker(interface, audio_file):
    """Get speaker from cache or create new one if not cached."""
    # Calculate file hash for caching
    file_hash = get_file_hash(audio_file)
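    # Key on interface version + file hash so a profile is never reused across interface versions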
    cache_key = f"{interface.config.interface_version}_{file_hash}"
    
    # Check if speaker profile is already cached
    if cache_key in speaker_cache:
        print(f"βœ… Using cached speaker profile for {os.path.basename(audio_file)}")
        return speaker_cache[cache_key]
    
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Create new speaker profile
    print(f"πŸ”„ Creating new speaker profile for {os.path.basename(audio_file)}")
    try:
        speaker = interface.create_speaker(audio_file, whisper_model="large-v3-turbo", whisper_device=device)
        
        # Cache the speaker profile
        speaker_cache[cache_key] = speaker
        print(f"πŸ’Ύ Cached speaker profile ({len(speaker_cache)} total cached)")
        
        return speaker
    except Exception as e:
        return f"❌ Error creating speaker profile: {str(e)}"

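# spaces.GPU requests a GPU for the duration of the decorated call on HF ZeroGPU Spaces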
@spaces.GPU
def create_speaker_and_generate(model_name, audio_file, test_text: Optional[str] = None, temperature: float = 0.4):
    """Create speaker from audio and optionally generate test audio."""
    if audio_file is None:
        # Return default values for startup/caching purposes
        return "Please upload an audio file to create a speaker profile.", None
    
    # Get interface (no caching to avoid CUDA memory issues)
    interface = get_interface(model_name)
    
    # Get or create speaker profile (with caching)
    speaker_result = get_or_create_speaker(interface, audio_file)
    
    # Check if speaker_result is an error message
    if isinstance(speaker_result, str) and speaker_result.startswith("❌"):
        return speaker_result, None
    
    # Convert speaker dict to formatted JSON
    speaker_json = json.dumps(speaker_result, indent=2, ensure_ascii=False)
    
    # Generate test audio if text is provided
    generated_audio = None
    if test_text and test_text.strip():
        output = interface.generate(
            config=outetts.GenerationConfig(
                text=test_text,
                speaker=speaker_result,
                sampler_config=outetts.SamplerConfig(
                    temperature=temperature
                ),
                max_length=MODEL_INFO[MODELS[model_name]]["max_seq_length"]
            )
        )
        
        # Save to temporary file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output.save(f.name)
            generated_audio = f.name
    
    return speaker_json, generated_audio
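
# A minimal sketch of driving this pipeline directly (outside Gradio), assuming a
# local reference clip at "reference.wav" (hypothetical path):
#
#     speaker_json, wav_path = create_speaker_and_generate(
#         list(MODELS.keys())[-1], "reference.wav", "Hello there!", temperature=0.4
#     )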

example_text = "Hello, this is a test of the OuteTTS speaker profile."

# Create the Gradio interface
demo = gr.Interface(
    fn=create_speaker_and_generate,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[-1],
            label="Select OuteTTS Model",
            info="Choose the model variant to use"
        ),
        gr.Audio(
            label="Upload Reference Audio (Max 20 seconds)",
            type="filepath",
            sources=["upload", "microphone"]
        ),
        gr.Textbox(
            label="Test Text (Optional)",
            placeholder="Enter text to generate speech (leave empty to only create speaker profile)...",
            lines=3,
            value=None
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            step=0.1,
            value=0.4,
            label="Temperature",
            info="Controls randomness in generation"
        )
    ],
    outputs=[
        gr.Textbox(
            label="Speaker Profile (JSON)",
            lines=15,
            max_lines=20,
            show_copy_button=True
        ),
        gr.Audio(
            label="Generated Test Audio (if text provided)",
            type="filepath"
        )
    ],
    title="πŸŽ™οΈ OuteTTS Speaker Creator",
    description="Create and manage speaker profiles for OuteTTS text-to-speech synthesis. Upload audio to create a speaker profile, and optionally provide test text to generate sample audio.",
    theme=gr.themes.Soft(),
    examples=[
        ["OuteTTS-1.0-0.6B", None, example_text, 0.2],
        ["OuteTTS-0.3-500M", None, example_text, 0.2],
    ],
    cache_examples=False,
    flagging_mode="never"
)

if __name__ == "__main__":
    # Launch with optimized configuration for HuggingFace Spaces
    demo.launch(
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,
        share=False,           # Set to True if you want a public link
        show_api=True,         # Show API documentation
        show_error=True        # Show detailed error messages
    )