# OuteTTS Speaker Creator (Hugging Face Space, running on ZeroGPU)
import gradio as gr
# Import alias module before outetts to set up whisper redirection
import alias as _alias
import outetts
import json
import tempfile
import hashlib
import os
from typing import Optional
from outetts.models.info import MODEL_INFO
from outetts.utils import helpers
from huggingface_hub import hf_hub_download
import torch
from transformers import BitsAndBytesConfig
import spaces
# Available OuteTTS models based on the documentation
MODELS = {v.value: v for _, v in outetts.Models.__members__.items()}
MODEL_QUANTIZATION = {
    outetts.Models.VERSION_0_1_SIZE_350M: outetts.LlamaCppQuantization.FP16,
    outetts.Models.VERSION_0_2_SIZE_500M: outetts.LlamaCppQuantization.FP16,
    outetts.Models.VERSION_0_3_SIZE_500M: outetts.LlamaCppQuantization.FP16,
}
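# Models without an entry here fall back to Q8_0 quantization (see get_interface below).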
# Cache for speaker profiles to avoid re-transcribing the same audio
speaker_cache = {}
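# The cache is in-process only, so it is cleared whenever the Space restarts.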
def get_file_hash(file_path):
    """Calculate MD5 hash of a file for caching purposes."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
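# Note: MD5 is used here purely as a fast cache key, not for any security purpose.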
def try_ggml_model(model: outetts.Models, backend: outetts.Backend, quantization: outetts.LlamaCppQuantization):
    """Download the GGUF build of the model and build a llama.cpp ModelConfig."""
    model_config = MODEL_INFO[model]
    repo = f"OuteAI/{model.value}-GGUF"
    filename = f"{model.value}-{quantization.value}.gguf"
    model_path = hf_hub_download(
        repo_id=repo,
        filename=filename,
        local_dir=os.path.join(helpers.get_cache_dir(), "gguf"),
        local_files_only=False
    )
    # V3 interfaces use guided word-level generation; older ones use chunked generation
    generation_type = outetts.GenerationType.CHUNKED
    if model_config['interface_version'] == outetts.InterfaceVersion.V3:
        generation_type = outetts.GenerationType.GUIDED_WORDS
    return outetts.ModelConfig(
        model_path=model_path,
        tokenizer_path=f"OuteAI/{model.value}",
        backend=backend,
        n_gpu_layers=99,
        verbose=False,
        device=None,
        dtype=None,
        additional_model_config={},
        audio_codec_path=None,
        generation_type=generation_type,
        **model_config
    )
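# Only the quantized .gguf weights come from the "-GGUF" repo; tokenizer_path
# above still points at the original OuteAI/<model> repo on the Hub.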
def get_interface(model_name: str):
    """Get interface instance for the model (no caching to avoid CUDA memory issues)."""
    model = MODELS[model_name]
    try:
        # Prefer the quantized GGUF build via llama.cpp; default to Q8_0 for
        # models without an explicit entry in MODEL_QUANTIZATION
        quantization = MODEL_QUANTIZATION.get(model, outetts.LlamaCppQuantization.Q8_0)
        config = try_ggml_model(model, outetts.Backend.LLAMACPP, quantization)
    except Exception:
        # Fall back to the Transformers backend if the GGUF build is unavailable
        has_cuda = torch.cuda.is_available()
        model_config = MODEL_INFO[model]
        config = outetts.ModelConfig(
            model_path=f"OuteAI/{model_name}",
            tokenizer_path=f"OuteAI/{model_name}",
            backend=outetts.Backend.HF,
            additional_model_config={
                "device_map": "auto" if has_cuda else "cpu",
                # 4-bit quantization is only applied when a GPU is available
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True,
                    llm_int8_enable_fp32_cpu_offload=True
                ) if has_cuda else None,
            },
            **model_config
        )
    # Initialize the interface
    interface = outetts.Interface(config=config)
    return interface
def get_or_create_speaker(interface, audio_file):
    """Get speaker from cache or create new one if not cached."""
    # Calculate file hash for caching
    file_hash = get_file_hash(audio_file)
    cache_key = f"{interface.config.interface_version}_{file_hash}"
    # Check if speaker profile is already cached
    if cache_key in speaker_cache:
        print(f"✅ Using cached speaker profile for {os.path.basename(audio_file)}")
        return speaker_cache[cache_key]
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Create new speaker profile
    print(f"🔄 Creating new speaker profile for {os.path.basename(audio_file)}")
    try:
        speaker = interface.create_speaker(audio_file, whisper_model="large-v3-turbo", whisper_device=device)
        # Cache the speaker profile
        speaker_cache[cache_key] = speaker
        print(f"💾 Cached speaker profile ({len(speaker_cache)} total cached)")
        return speaker
    except Exception as e:
        return f"❌ Error creating speaker profile: {str(e)}"
@spaces.GPU
def create_speaker_and_generate(model_name, audio_file, test_text: Optional[str] = None, temperature: float = 0.4):
    """Create speaker from audio and optionally generate test audio."""
    if audio_file is None:
        # Return default values for startup/caching purposes
        return "Please upload an audio file to create a speaker profile.", None
    # Get interface (no caching to avoid CUDA memory issues)
    interface = get_interface(model_name)
    # Get or create speaker profile (with caching)
    speaker_result = get_or_create_speaker(interface, audio_file)
    # Check if speaker_result is an error message
    if isinstance(speaker_result, str) and speaker_result.startswith("❌"):
        return speaker_result, None
    # Convert speaker dict to formatted JSON
    speaker_json = json.dumps(speaker_result, indent=2, ensure_ascii=False)
    # Generate test audio if text is provided
    generated_audio = None
    if test_text and test_text.strip():
        output = interface.generate(
            config=outetts.GenerationConfig(
                text=test_text,
                speaker=speaker_result,
                sampler_config=outetts.SamplerConfig(
                    temperature=temperature
                ),
                max_length=MODEL_INFO[MODELS[model_name]]["max_seq_length"]
            )
        )
        # Save to temporary file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output.save(f.name)
            generated_audio = f.name
    return speaker_json, generated_audio
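# Hypothetical direct call for local testing (the "reference.wav" path is
# illustrative, not part of the app):
#   profile_json, wav_path = create_speaker_and_generate(
#       list(MODELS.keys())[-1], "reference.wav", "Hello, this is a test.", 0.4)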
example_text = "Hello, this is a test of the OuteTTS speaker profile."
# Create the Gradio interface
demo = gr.Interface(
    fn=create_speaker_and_generate,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[-1],
            label="Select OuteTTS Model",
            info="Choose the model variant to use"
        ),
        gr.Audio(
            label="Upload Reference Audio (Max 20 seconds)",
            type="filepath",
            sources=["upload", "microphone"]
        ),
        gr.Textbox(
            label="Test Text (Optional)",
            placeholder="Enter text to generate speech (leave empty to only create speaker profile)...",
            lines=3,
            value=None
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            step=0.1,
            value=0.4,
            label="Temperature",
            info="Controls randomness in generation"
        )
    ],
    outputs=[
        gr.Textbox(
            label="Speaker Profile (JSON)",
            lines=15,
            max_lines=20,
            show_copy_button=True
        ),
        gr.Audio(
            label="Generated Test Audio (if text provided)",
            type="filepath"
        )
    ],
    title="🎙️ OuteTTS Speaker Creator",
    description="Create and manage speaker profiles for OuteTTS text-to-speech synthesis. Upload audio to create a speaker profile, and optionally provide test text to generate sample audio.",
    theme=gr.themes.Soft(),
    examples=[
        ["OuteTTS-1.0-0.6B", None, example_text, 0.2],
        ["OuteTTS-0.3-500M", None, example_text, 0.2],
    ],
    cache_examples=False,
    flagging_mode="never"
)
if __name__ == "__main__":
    # Launch with optimized configuration for HuggingFace Spaces
    demo.launch(
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,
        share=False,            # Set to True if you want a public link
        show_api=True,          # Show API documentation
        show_error=True         # Show detailed error messages
    )