# SpeechT5_hy / archive / app_optimized.py
# Author: Edmon02 — commit 3f1840e:
#   "feat: Implement project organization plan and optimize TTS deployment"
"""
Optimized SpeechT5 Armenian TTS Application
==========================================
High-performance Gradio application with advanced optimization features.
"""
import gradio as gr
import numpy as np
import logging
import time
from typing import Tuple, Optional
import os
import sys
# Add src to path for imports: make the local ``src`` directory importable
# regardless of the working directory the app is launched from.
current_dir = os.path.dirname(os.path.abspath(__file__))
src_path = os.path.join(current_dir, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

try:
    from src.pipeline import TTSPipeline
except ImportError as e:
    logging.error(f"Failed to import pipeline: {e}")
    # Fallback import attempt.
    # NOTE(review): this appends what looks like the same ``src`` path and
    # retries the identical import, so if the first attempt failed this one
    # probably fails too — verify whether a different path/module was intended.
    sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
    from src.pipeline import TTSPipeline

# Configure logging for the whole application (root logger).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Global pipeline instance, populated by initialize_pipeline() at startup;
# None until initialization succeeds.
tts_pipeline: Optional[TTSPipeline] = None
def initialize_pipeline():
    """Build the global TTSPipeline and prepare it for serving.

    Stores the pipeline in the module-level ``tts_pipeline`` so the Gradio
    handlers can reach it.

    Returns:
        bool: True if construction and production tuning succeeded,
        False on any error (which is logged, not raised).
    """
    global tts_pipeline
    try:
        logger.info("Initializing TTS Pipeline...")
        tts_pipeline = TTSPipeline(
            model_checkpoint="Edmon02/TTS_NB_2",
            max_chunk_length=200,  # Optimal for 5-20s clips
            crossfade_duration=0.1,
            use_mixed_precision=True,
        )
        # Apply production optimizations (e.g. warm-up / precision tuning).
        tts_pipeline.optimize_for_production()
    except Exception as e:
        logger.error(f"Failed to initialize TTS pipeline: {e}")
        return False
    logger.info("TTS Pipeline initialized successfully")
    return True
def predict(text: str, speaker: str,
            enable_chunking: bool = True,
            apply_processing: bool = True) -> Tuple[int, np.ndarray]:
    """
    Main prediction function with optimization and error handling.

    Args:
        text: Input text to synthesize.
        speaker: Speaker selection string, e.g. "BDL (male)".
        enable_chunking: Whether to enable intelligent chunking.
        apply_processing: Whether to apply audio post-processing.

    Returns:
        Tuple of (sample_rate, audio_array). On invalid input, an
        uninitialized pipeline, or any synthesis failure, an empty int16
        array at 16 kHz is returned so the Gradio audio widget stays usable.
    """
    start_time = time.time()
    # Single fallback so every error path yields a valid (rate, audio) pair.
    empty_result = (16000, np.zeros(0, dtype=np.int16))
    try:
        # Validate inputs before touching the pipeline.
        if not text or not text.strip():
            logger.warning("Empty text provided")
            return empty_result
        if tts_pipeline is None:
            logger.error("TTS pipeline not initialized")
            return empty_result

        # "BDL (male)" -> "BDL": keep only the code before the parenthesis.
        speaker_code = speaker.split("(")[0].strip()

        # Lazy %-args avoid building the message when INFO is disabled.
        logger.info("Processing request: %d chars, speaker: %s",
                    len(text), speaker_code)

        sample_rate, audio = tts_pipeline.synthesize(
            text=text,
            speaker=speaker_code,
            enable_chunking=enable_chunking,
            apply_audio_processing=apply_processing,
        )

        # Report the real-time factor: processing time / audio duration.
        total_time = time.time() - start_time
        audio_duration = len(audio) / sample_rate if len(audio) > 0 else 0
        rtf = total_time / audio_duration if audio_duration > 0 else float('inf')
        logger.info("Request completed in %.3fs (RTF: %.2f)", total_time, rtf)

        return sample_rate, audio
    except Exception:
        # logger.exception preserves the traceback; a plain error log with
        # only str(e) (as before) hid where the failure occurred.
        logger.exception("Prediction failed")
        return empty_result
def get_performance_info() -> str:
    """Return the pipeline's performance counters as a Markdown string.

    Falls back to a plain error message when the pipeline is missing or the
    stats dictionary cannot be read.
    """
    if tts_pipeline is None:
        return "Pipeline not initialized"
    try:
        # All dict accesses stay inside the try: a missing key degrades to
        # the error message instead of crashing the UI callback.
        stats = tts_pipeline.get_performance_stats()
        pipeline_stats = stats['pipeline_stats']
        model_stats = stats['model_stats']
        info = f"""
**Performance Statistics:**
- Total Inferences: {pipeline_stats['total_inferences']}
- Average Processing Time: {pipeline_stats['avg_processing_time']:.3f}s
- Translation Cache Size: {stats['text_processor_stats']['translation_cache_size']}
- Model Inferences: {model_stats['total_inferences']}
- Average Model Time: {model_stats.get('avg_inference_time', 0):.3f}s
"""
        return info.strip()
    except Exception as e:
        return f"Error getting performance info: {e}"
def health_check() -> str:
    """Summarize pipeline health as a short, emoji-prefixed status line."""
    if tts_pipeline is None:
        return "❌ Pipeline not initialized"
    try:
        # Keep the dict accesses inside the try so a malformed health payload
        # is reported as a failed check rather than raising into Gradio.
        health = tts_pipeline.health_check()
        status = health["status"]
        if status == "healthy":
            return "✅ All systems operational"
        if status == "degraded":
            return "⚠️ Some components have issues"
        return f"❌ System error: {health.get('error', 'Unknown error')}"
    except Exception as e:
        return f"❌ Health check failed: {e}"
# Application metadata -------------------------------------------------------

# Page title rendered as a Markdown H1 in create_interface().
TITLE = "🎤 SpeechT5 Armenian TTS - Optimized"

# Long-form description shown under the title (Markdown).
DESCRIPTION = """
# High-Performance Armenian Text-to-Speech
This is an **optimized version** of SpeechT5 for Armenian language synthesis, featuring:
### 🚀 **Performance Optimizations**
- **Intelligent Text Chunking**: Handles long texts by splitting them intelligently at sentence boundaries
- **Caching**: Translation and embedding caching for faster repeated requests
- **Mixed Precision**: GPU optimization with FP16 inference when available
- **Crossfading**: Smooth audio transitions between chunks for natural-sounding longer texts
### 🎯 **Advanced Features**
- **Smart Text Processing**: Automatic number-to-word conversion with Armenian translation
- **Audio Post-Processing**: Noise gating, normalization, and dynamic range optimization
- **Robust Error Handling**: Graceful fallbacks and comprehensive logging
- **Real-time Performance Monitoring**: Track processing times and system health
### 📝 **Usage Tips**
- **Short texts** (< 200 chars): Processed directly for maximum speed
- **Long texts**: Automatically chunked with overlap for seamless audio
- **Numbers**: Automatically converted to Armenian words
- **Performance**: Enable chunking for texts longer than a few sentences
### 🎵 **Audio Quality**
- Sample Rate: 16 kHz
- Optimized for natural prosody and clear pronunciation
- Cross-fade transitions for multi-chunk synthesis
The model was trained on short clips (5-20s) but uses advanced algorithms to handle longer texts effectively.
"""

# Example rows for gr.Examples; column order must match the `inputs` list in
# create_interface(): [text, speaker, enable_chunking, apply_processing].
EXAMPLES = [
    # Short examples for quick testing
    ["Բարև ձեզ, ինչպե՞ս եք:", "BDL (male)", True, True],
    ["Այսօր գեղեցիկ օր է:", "BDL (male)", False, True],
    # Medium examples demonstrating chunking
    ["Հայաստանն ունի հարուստ պատմություն և մշակույթ: Երևանը մայրաքաղաքն է, որն ունի 2800 տարվա պատմություն:", "BDL (male)", True, True],
    # Long example with numbers
    ["Արարատ լեռը բարձրությունը 5165 մետր է: Այն Հայաստանի խորհրդանիշն է և գտնվում է Թուրքիայի տարածքում: Լեռան վրա ըստ Աստվածաշնչի՝ կանգնել է Նոյի տապանը 40 օրվա ջրհեղեղից հետո:", "BDL (male)", True, True],
    # Technical example
    ["Մեքենայի շարժիչը 150 ձիուժ է և 2.0 լիտր ծավալ ունի: Այն կարող է արագացնել 0-ից 100 կմ/ժ 8.5 վայրկյանում:", "BDL (male)", True, True],
]

# Custom CSS injected into gr.Blocks for better styling.
# NOTE(review): only .gradio-container is clearly referenced by Gradio itself;
# the .performance-info / .health-status classes are not attached to any
# component in create_interface() — presumably reserved for future use.
CUSTOM_CSS = """
.gradio-container {
max-width: 1200px !important;
margin: auto !important;
}
.performance-info {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 15px;
border-radius: 10px;
color: white;
margin: 10px 0;
}
.health-status {
padding: 10px;
border-radius: 8px;
margin: 10px 0;
font-weight: bold;
}
.status-healthy { background-color: #d4edda; color: #155724; }
.status-warning { background-color: #fff3cd; color: #856404; }
.status-error { background-color: #f8d7da; color: #721c24; }
"""
def create_interface():
    """Create and configure the Gradio interface.

    Builds the full Blocks layout (inputs, status panel, audio output,
    examples) and wires the event handlers. All components and handlers must
    be created inside the ``gr.Blocks`` context manager.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(
        theme=gr.themes.Soft(),
        css=CUSTOM_CSS,
        title="SpeechT5 Armenian TTS"
    ) as interface:
        # Header
        gr.Markdown(f"# {TITLE}")
        gr.Markdown(DESCRIPTION)

        with gr.Row():
            # Left column (wider): synthesis inputs.
            with gr.Column(scale=2):
                # Main input controls
                text_input = gr.Textbox(
                    label="📝 Input Text (Armenian)",
                    placeholder="Մուտքագրեք ձեր տեքստը այստեղ...",
                    lines=3,
                    max_lines=10
                )
                with gr.Row():
                    # Only one speaker is currently available.
                    speaker_input = gr.Radio(
                        label="🎭 Speaker",
                        choices=["BDL (male)"],
                        value="BDL (male)"
                    )
                with gr.Row():
                    chunking_checkbox = gr.Checkbox(
                        label="🧩 Enable Intelligent Chunking",
                        value=True,
                        info="Automatically split long texts for better quality"
                    )
                    processing_checkbox = gr.Checkbox(
                        label="🎚️ Apply Audio Processing",
                        value=True,
                        info="Apply noise gating, normalization, and crossfading"
                    )
                # Generate button
                generate_btn = gr.Button(
                    "🎤 Generate Speech",
                    variant="primary",
                    size="lg"
                )
            # Right column (narrower): health and performance readouts.
            with gr.Column(scale=1):
                # System information panel
                gr.Markdown("### 📊 System Status")
                health_display = gr.Textbox(
                    label="Health Status",
                    value="Initializing...",
                    interactive=False,
                    max_lines=1
                )
                performance_display = gr.Textbox(
                    label="Performance Stats",
                    value="No data yet",
                    interactive=False,
                    max_lines=8
                )
                refresh_btn = gr.Button("🔄 Refresh Stats", size="sm")

        # Output: predict() returns a (sample_rate, np.ndarray) tuple,
        # hence type="numpy".
        audio_output = gr.Audio(
            label="🔊 Generated Speech",
            type="numpy",
            interactive=False
        )

        # Examples section
        gr.Markdown("### 💡 Example Texts")
        # Use simpler Examples component to avoid schema issues.
        # NOTE(review): `outputs`/`fn` are passed with cache_examples=False —
        # confirm against the installed Gradio version that clicking an
        # example actually runs predict() rather than only filling inputs.
        examples = gr.Examples(
            examples=EXAMPLES,
            inputs=[text_input, speaker_input, chunking_checkbox, processing_checkbox],
            outputs=audio_output,
            fn=predict,
            cache_examples=False,  # Disable caching to avoid schema issues
            label="Click any example to try it:"
        )

        # Event handlers
        generate_btn.click(
            fn=predict,
            inputs=[text_input, speaker_input, chunking_checkbox, processing_checkbox],
            outputs=[audio_output],
            show_progress="full"
        )
        # Refresh both status boxes from a single lambda returning a 2-tuple.
        refresh_btn.click(
            fn=lambda: (health_check(), get_performance_info()),
            outputs=[health_display, performance_display],
            show_progress="minimal"
        )
        # Auto-refresh health status on load
        interface.load(
            fn=lambda: (health_check(), get_performance_info()),
            outputs=[health_display, performance_display]
        )

    return interface
def main():
    """Main application entry point: initialize the pipeline, then serve."""
    logger.info("Starting SpeechT5 Armenian TTS Application")

    # A broken pipeline makes the UI useless — bail out early.
    if not initialize_pipeline():
        logger.error("Failed to initialize TTS pipeline - exiting")
        sys.exit(1)

    demo = create_interface()

    # Launch settings tuned for a Hugging Face Spaces deployment.
    launch_options = {
        "share": False,            # Spaces provides its own public URL
        "inbrowser": False,
        "show_error": True,
        "quiet": False,
        "server_name": "0.0.0.0",  # allow external connections
        "server_port": 7860,       # standard Gradio port
        "max_threads": 4,          # limit concurrent requests
    }
    demo.launch(**launch_options)


if __name__ == "__main__":
    main()