Spaces:

Edmon02
/

SpeechT5_hy

Runtime error

File size: 5,700 Bytes

3f1840e

"""
Armenian TTS - HuggingFace Spaces Compatible
===========================================

Final version optimized for HF Spaces with Gradio 3.x compatibility.
"""

import gradio as gr
import numpy as np
import logging
import os
import sys

# Configure the root logger at INFO, then obtain this module's own logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Module-wide state: the pipeline object (None until built) and its readiness flag
tts_pipeline, pipeline_ready = None, False

def initialize_tts():
    """Build the global TTS pipeline and report whether it succeeded.

    Makes this file's directory and its ``src`` subdirectory importable,
    imports ``TTSPipeline`` from ``src.pipeline``, constructs it for the
    ``Edmon02/TTS_NB_2`` checkpoint and calls ``optimize_for_production()``
    on it.

    Side effects:
        Sets the module globals ``tts_pipeline`` and ``pipeline_ready``.
        They are only switched to a usable pipeline / ``True`` once every
        step has succeeded; on any failure they are reset to ``None`` /
        ``False`` so a half-initialized pipeline is never kept around.
        May prepend entries to ``sys.path``.

    Returns:
        bool: True when the pipeline was created and optimized, False if
        any step raised (the exception is logged with its traceback).
    """
    global tts_pipeline, pipeline_ready

    try:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        src_path = os.path.join(current_dir, 'src')
        # `from src.pipeline import ...` resolves the `src` package through
        # its parent directory, so that directory must be importable too.
        # `src` itself stays first on the path, exactly as before.
        for entry in (current_dir, src_path):
            if entry not in sys.path:
                sys.path.insert(0, entry)

        # Imported lazily so a missing/broken package degrades to test mode
        # instead of crashing the whole app at import time.
        from src.pipeline import TTSPipeline

        logger.info("Initializing TTS pipeline...")
        pipeline = TTSPipeline(
            model_checkpoint="Edmon02/TTS_NB_2",
            max_chunk_length=200,
            crossfade_duration=0.1,
            use_mixed_precision=True
        )

        # Apply optimizations before publishing the pipeline globally
        pipeline.optimize_for_production()

    except Exception as e:
        # Top-level boundary: keep the app alive, but record the traceback
        logger.exception("Failed to initialize TTS pipeline: %s", e)
        tts_pipeline = None
        pipeline_ready = False
        return False

    tts_pipeline = pipeline
    pipeline_ready = True
    logger.info("TTS pipeline ready!")
    return True

def synthesize_speech(text):
    """
    Turn text into audio, degrading gracefully when synthesis is unavailable.

    Blank or non-string input yields one second of silence. If the global
    pipeline is missing or not flagged ready, or if synthesis raises, a
    placeholder tone from ``create_fallback_audio`` is returned instead.

    Args:
        text (str): Armenian text to synthesize

    Returns:
        tuple: (sample_rate, audio_array)
    """
    # Guard: only a non-blank string is worth synthesizing
    if not (text and isinstance(text, str) and text.strip()):
        return create_silence(1.0)

    # Guard: no usable pipeline -> placeholder audio
    if tts_pipeline is None or not pipeline_ready:
        logger.warning("Pipeline not ready, generating fallback audio")
        return create_fallback_audio(text)

    try:
        logger.info(f"Synthesizing: {text[:50]}...")

        # Hand the trimmed text to the pipeline with the fixed speaker preset
        cleaned = text.strip()
        result = tts_pipeline.synthesize(
            text=cleaned,
            speaker="BDL",
            enable_chunking=True,
            apply_audio_processing=True
        )
        sample_rate, audio = result

        logger.info(f"Generated {len(audio)} samples at {sample_rate}Hz")
        return sample_rate, audio

    except Exception as e:
        # Any synthesis failure falls back to the placeholder tone
        logger.error(f"Synthesis error: {e}")
        return create_fallback_audio(text)

def create_silence(duration_seconds=1.0):
    """Return ``(sample_rate, samples)`` for an all-zero int16 buffer at 16 kHz.

    Args:
        duration_seconds: Length of the silence in seconds; the sample count
            is the truncated product of this value and the sample rate.

    Returns:
        tuple: ``(16000, numpy.ndarray of int16 zeros)``
    """
    rate = 16000
    n_samples = int(duration_seconds * rate)
    silent_buffer = np.zeros(n_samples, dtype=np.int16)
    return rate, silent_buffer

def create_fallback_audio(text):
    """Create a simple placeholder tone whose length scales with the text.

    The duration is 0.1 s per character, clamped to the range 0.5-5.0 s.
    The signal is a 440 Hz sine at amplitude 0.3; texts longer than 20
    characters additionally get a 660 Hz sine at amplitude 0.2. A linear
    fade-in and fade-out (a tenth of the signal, at most 1000 samples each)
    avoids clicks at the edges.

    Args:
        text: The input text; only its length is used.

    Returns:
        tuple: ``(16000, numpy.ndarray of int16 samples)``
    """
    sample_rate = 16000
    # Calculate duration based on text length
    duration = min(max(len(text) * 0.1, 0.5), 5.0)
    samples = int(duration * sample_rate)

    # Sample instants spaced exactly 1/sample_rate apart. An endpoint-
    # inclusive linspace over [0, duration] would stretch the spacing to
    # duration/(samples-1) and play the tone slightly off its nominal pitch.
    t = np.arange(samples) / sample_rate

    base_freq = 440  # A4
    audio = np.sin(2 * np.pi * base_freq * t) * 0.3

    # Add a second partial (a fifth above) for longer texts
    if len(text) > 20:
        audio += np.sin(2 * np.pi * (base_freq * 1.5) * t) * 0.2

    # Apply fade in/out
    fade_samples = min(samples // 10, 1000)
    if fade_samples > 0:
        audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
        audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)

    # Peak amplitude is at most 0.5, so scaling to int16 cannot overflow
    return sample_rate, (audio * 32767).astype(np.int16)

# Bring up the TTS backend at import time; the UI text below reflects the outcome
logger.info("Starting Armenian TTS application...")
init_success = initialize_tts()

# The descriptions are spelled out line by line so that the whitespace-only
# lines and the trailing indent they contain stay visible in the source.
if not init_success:
    # Initialization failed: advertise the placeholder-tone test mode
    app_status = "🟡 Test Mode (Limited Functionality)"
    app_description = (
        "\n"
        "    🎤 **Armenian Text-to-Speech System - Test Mode**\n"
        "    \n"
        "    The TTS system is running in test mode with limited functionality.\n"
        "    Text input will generate simple audio tones as placeholders.\n"
        "    "
    )
else:
    # Pipeline is ready: show the full usage instructions
    app_status = "🟢 TTS System Ready"
    app_description = (
        "\n"
        "    🎤 **Armenian Text-to-Speech System**\n"
        "    \n"
        "    Convert Armenian text to natural speech using SpeechT5.\n"
        "    \n"
        "    **How to use:**\n"
        "    1. Enter Armenian text in the box below\n"
        "    2. Click Submit to generate speech\n"
        "    3. Play the generated audio\n"
        "    \n"
        "    **Tips for best results:**\n"
        "    - Use standard Armenian script (Unicode)\n"
        "    - Include punctuation for natural pauses\n"
        "    - Shorter sentences work better for quality\n"
        "    "
    )

# Create the Gradio interface.
# Components come from the top-level `gr.Textbox` / `gr.Audio` classes: the
# legacy `gr.inputs` / `gr.outputs` namespaces are deprecated in Gradio 3.x
# and absent from later releases, where touching them fails at import time.
# The `allow_screenshot` argument is no longer passed; it is a deprecated
# option in Gradio 3.x (NOTE(review): confirm against the pinned version).
demo = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(
        lines=3,
        placeholder="Մուտքագրեք ձեր հայերեն տեքստը այստեղ...",
        label="Armenian Text Input"
    ),
    # synthesize_speech returns (sample_rate, numpy_array), which gr.Audio
    # accepts directly as an output value.
    outputs=gr.Audio(
        label="Generated Speech"
    ),
    title=f"🇦🇲 Armenian Text-to-Speech {app_status}",
    description=app_description,
    examples=[
        "Բարև ձեզ, ինչպե՞ս եք:",
        "Այսօր գեղեցիկ օր է:",
        "Շնորհակալություն:",
        "Հայաստան իմ սիրելի երկիրն է:",
        "Երևանը Հայաստանի մայրաքաղաքն է:"
    ],
    theme="default",
    allow_flagging="never"
)

# Start the web server only when this file is executed as a script
if __name__ == "__main__":
    # Listen on all interfaces at port 7860; no share link, no debug mode
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "debug": False,
        "quiet": False,
    }
    demo.launch(**launch_options)