""" Optimized SpeechT5 Armenian TTS Application ========================================== High-performance Gradio application with advanced optimization features. """ import gradio as gr import numpy as np import logging import time from typing import Tuple, Optional import os import sys # Add src to path for imports current_dir = os.path.dirname(os.path.abspath(__file__)) src_path = os.path.join(current_dir, 'src') if src_path not in sys.path: sys.path.insert(0, src_path) try: from src.pipeline import TTSPipeline except ImportError as e: logging.error(f"Failed to import pipeline: {e}") # Fallback import attempt sys.path.append(os.path.join(os.path.dirname(__file__), 'src')) from src.pipeline import TTSPipeline # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # Global pipeline instance tts_pipeline: Optional[TTSPipeline] = None def initialize_pipeline(): """Initialize the TTS pipeline with error handling.""" global tts_pipeline try: logger.info("Initializing TTS Pipeline...") tts_pipeline = TTSPipeline( model_checkpoint="Edmon02/TTS_NB_2", max_chunk_length=200, # Optimal for 5-20s clips crossfade_duration=0.1, use_mixed_precision=True ) # Apply production optimizations tts_pipeline.optimize_for_production() logger.info("TTS Pipeline initialized successfully") return True except Exception as e: logger.error(f"Failed to initialize TTS pipeline: {e}") return False def predict(text: str, speaker: str, enable_chunking: bool = True, apply_processing: bool = True) -> Tuple[int, np.ndarray]: """ Main prediction function with optimization and error handling. Args: text: Input text to synthesize speaker: Speaker selection enable_chunking: Whether to enable intelligent chunking apply_processing: Whether to apply audio post-processing Returns: Tuple of (sample_rate, audio_array) """ global tts_pipeline start_time = time.time() try: # Validate inputs if not text or not text.strip(): logger.warning("Empty text provided") return 16000, np.zeros(0, dtype=np.int16) if tts_pipeline is None: logger.error("TTS pipeline not initialized") return 16000, np.zeros(0, dtype=np.int16) # Extract speaker code from selection speaker_code = speaker.split("(")[0].strip() # Log request logger.info(f"Processing request: {len(text)} chars, speaker: {speaker_code}") # Synthesize speech sample_rate, audio = tts_pipeline.synthesize( text=text, speaker=speaker_code, enable_chunking=enable_chunking, apply_audio_processing=apply_processing ) # Log performance total_time = time.time() - start_time audio_duration = len(audio) / sample_rate if len(audio) > 0 else 0 rtf = total_time / audio_duration if audio_duration > 0 else float('inf') logger.info(f"Request completed in {total_time:.3f}s (RTF: {rtf:.2f})") return sample_rate, audio except Exception as e: logger.error(f"Prediction failed: {e}") return 16000, np.zeros(0, dtype=np.int16) def get_performance_info() -> str: """Get performance statistics as formatted string.""" global tts_pipeline if tts_pipeline is None: return "Pipeline not initialized" try: stats = tts_pipeline.get_performance_stats() info = f""" **Performance Statistics:** - Total Inferences: {stats['pipeline_stats']['total_inferences']} - Average Processing Time: {stats['pipeline_stats']['avg_processing_time']:.3f}s - Translation Cache Size: {stats['text_processor_stats']['translation_cache_size']} - Model Inferences: {stats['model_stats']['total_inferences']} - Average Model Time: {stats['model_stats'].get('avg_inference_time', 0):.3f}s """ return info.strip() except Exception as e: return f"Error getting performance info: {e}" def health_check() -> str: """Perform system health check.""" global tts_pipeline if tts_pipeline is None: return "❌ Pipeline not initialized" try: health = tts_pipeline.health_check() if health["status"] == "healthy": return "✅ All systems operational" elif health["status"] == "degraded": return "⚠️ Some components have issues" else: return f"❌ System error: {health.get('error', 'Unknown error')}" except Exception as e: return f"❌ Health check failed: {e}" # Application metadata TITLE = "🎤 SpeechT5 Armenian TTS - Optimized" DESCRIPTION = """ # High-Performance Armenian Text-to-Speech This is an **optimized version** of SpeechT5 for Armenian language synthesis, featuring: ### 🚀 **Performance Optimizations** - **Intelligent Text Chunking**: Handles long texts by splitting them intelligently at sentence boundaries - **Caching**: Translation and embedding caching for faster repeated requests - **Mixed Precision**: GPU optimization with FP16 inference when available - **Crossfading**: Smooth audio transitions between chunks for natural-sounding longer texts ### 🎯 **Advanced Features** - **Smart Text Processing**: Automatic number-to-word conversion with Armenian translation - **Audio Post-Processing**: Noise gating, normalization, and dynamic range optimization - **Robust Error Handling**: Graceful fallbacks and comprehensive logging - **Real-time Performance Monitoring**: Track processing times and system health ### 📝 **Usage Tips** - **Short texts** (< 200 chars): Processed directly for maximum speed - **Long texts**: Automatically chunked with overlap for seamless audio - **Numbers**: Automatically converted to Armenian words - **Performance**: Enable chunking for texts longer than a few sentences ### 🎵 **Audio Quality** - Sample Rate: 16 kHz - Optimized for natural prosody and clear pronunciation - Cross-fade transitions for multi-chunk synthesis The model was trained on short clips (5-20s) but uses advanced algorithms to handle longer texts effectively. """ EXAMPLES = [ # Short examples for quick testing ["Բարև ձեզ, ինչպե՞ս եք:", "BDL (male)", True, True], ["Այսօր գեղեցիկ օր է:", "BDL (male)", False, True], # Medium examples demonstrating chunking ["Հայաստանն ունի հարուստ պատմություն և մշակույթ: Երևանը մայրաքաղաքն է, որն ունի 2800 տարվա պատմություն:", "BDL (male)", True, True], # Long example with numbers ["Արարատ լեռը բարձրությունը 5165 մետր է: Այն Հայաստանի խորհրդանիշն է և գտնվում է Թուրքիայի տարածքում: Լեռան վրա ըստ Աստվածաշնչի՝ կանգնել է Նոյի տապանը 40 օրվա ջրհեղեղից հետո:", "BDL (male)", True, True], # Technical example ["Մեքենայի շարժիչը 150 ձիուժ է և 2.0 լիտր ծավալ ունի: Այն կարող է արագացնել 0-ից 100 կմ/ժ 8.5 վայրկյանում:", "BDL (male)", True, True], ] # Custom CSS for better styling CUSTOM_CSS = """ .gradio-container { max-width: 1200px !important; margin: auto !important; } .performance-info { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 15px; border-radius: 10px; color: white; margin: 10px 0; } .health-status { padding: 10px; border-radius: 8px; margin: 10px 0; font-weight: bold; } .status-healthy { background-color: #d4edda; color: #155724; } .status-warning { background-color: #fff3cd; color: #856404; } .status-error { background-color: #f8d7da; color: #721c24; } """ def create_interface(): """Create and configure the Gradio interface.""" with gr.Blocks( theme=gr.themes.Soft(), css=CUSTOM_CSS, title="SpeechT5 Armenian TTS" ) as interface: # Header gr.Markdown(f"# {TITLE}") gr.Markdown(DESCRIPTION) with gr.Row(): with gr.Column(scale=2): # Main input controls text_input = gr.Textbox( label="📝 Input Text (Armenian)", placeholder="Մուտքագրեք ձեր տեքստը այստեղ...", lines=3, max_lines=10 ) with gr.Row(): speaker_input = gr.Radio( label="🎭 Speaker", choices=["BDL (male)"], value="BDL (male)" ) with gr.Row(): chunking_checkbox = gr.Checkbox( label="🧩 Enable Intelligent Chunking", value=True, info="Automatically split long texts for better quality" ) processing_checkbox = gr.Checkbox( label="🎚️ Apply Audio Processing", value=True, info="Apply noise gating, normalization, and crossfading" ) # Generate button generate_btn = gr.Button( "🎤 Generate Speech", variant="primary", size="lg" ) with gr.Column(scale=1): # System information panel gr.Markdown("### 📊 System Status") health_display = gr.Textbox( label="Health Status", value="Initializing...", interactive=False, max_lines=1 ) performance_display = gr.Textbox( label="Performance Stats", value="No data yet", interactive=False, max_lines=8 ) refresh_btn = gr.Button("🔄 Refresh Stats", size="sm") # Output audio_output = gr.Audio( label="🔊 Generated Speech", type="numpy", interactive=False ) # Examples section gr.Markdown("### 💡 Example Texts") # Use simpler Examples component to avoid schema issues examples = gr.Examples( examples=EXAMPLES, inputs=[text_input, speaker_input, chunking_checkbox, processing_checkbox], outputs=audio_output, fn=predict, cache_examples=False, # Disable caching to avoid schema issues label="Click any example to try it:" ) # Event handlers generate_btn.click( fn=predict, inputs=[text_input, speaker_input, chunking_checkbox, processing_checkbox], outputs=[audio_output], show_progress="full" ) refresh_btn.click( fn=lambda: (health_check(), get_performance_info()), outputs=[health_display, performance_display], show_progress="minimal" ) # Auto-refresh health status on load interface.load( fn=lambda: (health_check(), get_performance_info()), outputs=[health_display, performance_display] ) return interface def main(): """Main application entry point.""" logger.info("Starting SpeechT5 Armenian TTS Application") # Initialize pipeline if not initialize_pipeline(): logger.error("Failed to initialize TTS pipeline - exiting") sys.exit(1) # Create and launch interface interface = create_interface() # Launch with optimized settings interface.launch( share=False, # Disable share for HF Spaces inbrowser=False, show_error=True, quiet=False, server_name="0.0.0.0", # Allow external connections server_port=7860, # Standard Gradio port max_threads=4, # Limit concurrent requests ) if __name__ == "__main__": main()