# NOTE: the lines "Spaces:" / "Runtime error" captured here were Hugging Face
# Spaces page artifacts, not part of the source file.
""" | |
Optimized SpeechT5 Armenian TTS Application | |
========================================== | |
High-performance Gradio application with advanced optimization features. | |
""" | |
import gradio as gr | |
import numpy as np | |
import logging | |
import time | |
from typing import Tuple, Optional | |
import os | |
import sys | |
# Add src to path for imports so the pipeline package resolves regardless of
# the working directory the app is launched from.
current_dir = os.path.dirname(os.path.abspath(__file__))
src_path = os.path.join(current_dir, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)
try:
    from src.pipeline import TTSPipeline
except ImportError as e:
    logging.error(f"Failed to import pipeline: {e}")
    # Fallback import attempt
    # NOTE(review): this appends essentially the same `src` path and retries
    # the identical import. `from src.pipeline import ...` needs the *parent*
    # of `src` on sys.path (which is usually already there), not `src` itself
    # — confirm whether this fallback can ever succeed when the first fails.
    sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
    from src.pipeline import TTSPipeline
# Configure logging: timestamped records at INFO level for the whole app.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Global pipeline instance; stays None until initialize_pipeline() succeeds,
# and every request handler checks for None before using it.
tts_pipeline: Optional[TTSPipeline] = None
def initialize_pipeline():
    """Build and warm up the global TTS pipeline.

    Returns:
        bool: True when the pipeline is ready to serve, False on any failure
        (the error is logged, never raised, so the caller can exit cleanly).
    """
    global tts_pipeline

    try:
        logger.info("Initializing TTS Pipeline...")
        tts_pipeline = TTSPipeline(
            model_checkpoint="Edmon02/TTS_NB_2",
            max_chunk_length=200,       # matches the 5-20s training clips
            crossfade_duration=0.1,
            use_mixed_precision=True,
        )
        # Apply production-time tuning once the model is loaded.
        tts_pipeline.optimize_for_production()
        logger.info("TTS Pipeline initialized successfully")
        return True
    except Exception as e:
        logger.error(f"Failed to initialize TTS pipeline: {e}")
        return False
def _empty_result() -> Tuple[int, np.ndarray]:
    """Return the canonical 'no audio' payload: 16 kHz rate, empty int16 buffer.

    Centralized so every failure path in predict() returns the exact same
    shape (previously the sentinel was duplicated three times).
    """
    return 16000, np.zeros(0, dtype=np.int16)


def predict(text: str, speaker: str,
            enable_chunking: bool = True,
            apply_processing: bool = True) -> Tuple[int, np.ndarray]:
    """
    Main prediction function with optimization and error handling.

    Args:
        text: Input text to synthesize
        speaker: Speaker selection, e.g. "BDL (male)"; the code before "(" is used
        enable_chunking: Whether to enable intelligent chunking
        apply_processing: Whether to apply audio post-processing

    Returns:
        Tuple of (sample_rate, audio_array); empty audio on any failure so the
        Gradio UI never crashes.
    """
    start_time = time.time()

    try:
        # Validate inputs: empty text is a warning, not an error.
        if not text or not text.strip():
            logger.warning("Empty text provided")
            return _empty_result()

        if tts_pipeline is None:
            logger.error("TTS pipeline not initialized")
            return _empty_result()

        # Extract speaker code from selection: "BDL (male)" -> "BDL"
        speaker_code = speaker.split("(")[0].strip()

        # Log request
        logger.info(f"Processing request: {len(text)} chars, speaker: {speaker_code}")

        # Synthesize speech
        sample_rate, audio = tts_pipeline.synthesize(
            text=text,
            speaker=speaker_code,
            enable_chunking=enable_chunking,
            apply_audio_processing=apply_processing
        )

        # Log performance; RTF (real-time factor) = processing time / audio duration.
        total_time = time.time() - start_time
        audio_duration = len(audio) / sample_rate if len(audio) > 0 else 0
        rtf = total_time / audio_duration if audio_duration > 0 else float('inf')
        logger.info(f"Request completed in {total_time:.3f}s (RTF: {rtf:.2f})")

        return sample_rate, audio

    except Exception as e:
        # Boundary handler: log and return silence rather than crash the UI.
        logger.error(f"Prediction failed: {e}")
        return _empty_result()
def get_performance_info() -> str:
    """Render the pipeline's performance counters as markdown for the UI panel."""
    if tts_pipeline is None:
        return "Pipeline not initialized"

    try:
        stats = tts_pipeline.get_performance_stats()
        pipeline_stats = stats['pipeline_stats']
        model_stats = stats['model_stats']
        info = f"""
**Performance Statistics:**
- Total Inferences: {pipeline_stats['total_inferences']}
- Average Processing Time: {pipeline_stats['avg_processing_time']:.3f}s
- Translation Cache Size: {stats['text_processor_stats']['translation_cache_size']}
- Model Inferences: {model_stats['total_inferences']}
- Average Model Time: {model_stats.get('avg_inference_time', 0):.3f}s
"""
        return info.strip()
    except Exception as e:
        # Stats shape comes from the pipeline; never let a missing key crash the UI.
        return f"Error getting performance info: {e}"
def health_check() -> str:
    """Summarize pipeline health as a short emoji-prefixed status line."""
    if tts_pipeline is None:
        return "❌ Pipeline not initialized"

    try:
        # Keep the dict access inside try: a malformed report counts as failure.
        report = tts_pipeline.health_check()
        status = report["status"]
        if status == "healthy":
            return "✅ All systems operational"
        if status == "degraded":
            return "⚠️ Some components have issues"
        return f"❌ System error: {report.get('error', 'Unknown error')}"
    except Exception as e:
        return f"❌ Health check failed: {e}"
# Application metadata shown at the top of the Gradio page.
TITLE = "🎤 SpeechT5 Armenian TTS - Optimized"

# Markdown body rendered under the title; describes features and usage tips.
DESCRIPTION = """
# High-Performance Armenian Text-to-Speech
This is an **optimized version** of SpeechT5 for Armenian language synthesis, featuring:
### 🚀 **Performance Optimizations**
- **Intelligent Text Chunking**: Handles long texts by splitting them intelligently at sentence boundaries
- **Caching**: Translation and embedding caching for faster repeated requests
- **Mixed Precision**: GPU optimization with FP16 inference when available
- **Crossfading**: Smooth audio transitions between chunks for natural-sounding longer texts
### 🎯 **Advanced Features**
- **Smart Text Processing**: Automatic number-to-word conversion with Armenian translation
- **Audio Post-Processing**: Noise gating, normalization, and dynamic range optimization
- **Robust Error Handling**: Graceful fallbacks and comprehensive logging
- **Real-time Performance Monitoring**: Track processing times and system health
### 📝 **Usage Tips**
- **Short texts** (< 200 chars): Processed directly for maximum speed
- **Long texts**: Automatically chunked with overlap for seamless audio
- **Numbers**: Automatically converted to Armenian words
- **Performance**: Enable chunking for texts longer than a few sentences
### 🎵 **Audio Quality**
- Sample Rate: 16 kHz
- Optimized for natural prosody and clear pronunciation
- Cross-fade transitions for multi-chunk synthesis
The model was trained on short clips (5-20s) but uses advanced algorithms to handle longer texts effectively.
"""

# Clickable examples: each row is [text, speaker, enable_chunking, apply_processing],
# matching the positional order of predict()'s parameters.
EXAMPLES = [
    # Short examples for quick testing
    ["Բարև ձեզ, ինչպե՞ս եք:", "BDL (male)", True, True],
    ["Այսօր գեղեցիկ օր է:", "BDL (male)", False, True],
    # Medium examples demonstrating chunking
    ["Հայաստանն ունի հարուստ պատմություն և մշակույթ: Երևանը մայրաքաղաքն է, որն ունի 2800 տարվա պատմություն:", "BDL (male)", True, True],
    # Long example with numbers
    ["Արարատ լեռը բարձրությունը 5165 մետր է: Այն Հայաստանի խորհրդանիշն է և գտնվում է Թուրքիայի տարածքում: Լեռան վրա ըստ Աստվածաշնչի՝ կանգնել է Նոյի տապանը 40 օրվա ջրհեղեղից հետո:", "BDL (male)", True, True],
    # Technical example
    ["Մեքենայի շարժիչը 150 ձիուժ է և 2.0 լիտր ծավալ ունի: Այն կարող է արագացնել 0-ից 100 կմ/ժ 8.5 վայրկյանում:", "BDL (male)", True, True],
]
# Custom CSS injected into the Gradio page: centers the layout and styles the
# performance/health panels (status classes map to healthy/warning/error).
CUSTOM_CSS = """
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
}
.performance-info {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 15px;
    border-radius: 10px;
    color: white;
    margin: 10px 0;
}
.health-status {
    padding: 10px;
    border-radius: 8px;
    margin: 10px 0;
    font-weight: bold;
}
.status-healthy { background-color: #d4edda; color: #155724; }
.status-warning { background-color: #fff3cd; color: #856404; }
.status-error { background-color: #f8d7da; color: #721c24; }
"""
def create_interface():
    """Build the Gradio Blocks UI and wire up its event handlers.

    Returns:
        gr.Blocks: the assembled (not yet launched) interface.
    """

    def _refresh():
        # Shared refresher for the status panel (used by button click and page load).
        return health_check(), get_performance_info()

    with gr.Blocks(
        theme=gr.themes.Soft(),
        css=CUSTOM_CSS,
        title="SpeechT5 Armenian TTS"
    ) as demo:
        # Page header
        gr.Markdown(f"# {TITLE}")
        gr.Markdown(DESCRIPTION)

        with gr.Row():
            with gr.Column(scale=2):
                # Primary synthesis controls
                txt_in = gr.Textbox(
                    label="📝 Input Text (Armenian)",
                    placeholder="Մուտքագրեք ձեր տեքստը այստեղ...",
                    lines=3,
                    max_lines=10
                )
                with gr.Row():
                    spk_in = gr.Radio(
                        label="🎭 Speaker",
                        choices=["BDL (male)"],
                        value="BDL (male)"
                    )
                with gr.Row():
                    chunk_in = gr.Checkbox(
                        label="🧩 Enable Intelligent Chunking",
                        value=True,
                        info="Automatically split long texts for better quality"
                    )
                    proc_in = gr.Checkbox(
                        label="🎚️ Apply Audio Processing",
                        value=True,
                        info="Apply noise gating, normalization, and crossfading"
                    )
                synth_btn = gr.Button(
                    "🎤 Generate Speech",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                # Health / performance side panel
                gr.Markdown("### 📊 System Status")
                health_box = gr.Textbox(
                    label="Health Status",
                    value="Initializing...",
                    interactive=False,
                    max_lines=1
                )
                perf_box = gr.Textbox(
                    label="Performance Stats",
                    value="No data yet",
                    interactive=False,
                    max_lines=8
                )
                refresh_btn = gr.Button("🔄 Refresh Stats", size="sm")

        # Synthesis output
        audio_out = gr.Audio(
            label="🔊 Generated Speech",
            type="numpy",
            interactive=False
        )

        # Examples section
        gr.Markdown("### 💡 Example Texts")
        # Use simpler Examples component to avoid schema issues
        gr.Examples(
            examples=EXAMPLES,
            inputs=[txt_in, spk_in, chunk_in, proc_in],
            outputs=audio_out,
            fn=predict,
            cache_examples=False,  # Disable caching to avoid schema issues
            label="Click any example to try it:"
        )

        # Event wiring
        synth_btn.click(
            fn=predict,
            inputs=[txt_in, spk_in, chunk_in, proc_in],
            outputs=[audio_out],
            show_progress="full"
        )
        refresh_btn.click(
            fn=_refresh,
            outputs=[health_box, perf_box],
            show_progress="minimal"
        )
        # Populate the status panel as soon as the page loads.
        demo.load(
            fn=_refresh,
            outputs=[health_box, perf_box]
        )

    return demo
def main():
    """Application entry point: bring up the pipeline, then serve the UI."""
    logger.info("Starting SpeechT5 Armenian TTS Application")

    # The app is useless without a working pipeline; exit non-zero so the
    # hosting platform surfaces the failure.
    if not initialize_pipeline():
        logger.error("Failed to initialize TTS pipeline - exiting")
        sys.exit(1)

    app = create_interface()

    # Launch settings tuned for Hugging Face Spaces hosting.
    app.launch(
        share=False,            # Disable share for HF Spaces
        inbrowser=False,
        show_error=True,
        quiet=False,
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,       # Standard Gradio port
        max_threads=4,          # Limit concurrent requests
    )


if __name__ == "__main__":
    main()