import os
import tempfile

import gradio as gr
from langdetect import LangDetectException, detect
from pydub import AudioSegment
from pydub.playback import play
from ttsmms import download, TTS

# Ensure ffmpeg works inside Hugging Face Spaces by pointing pydub's
# AudioSegment at an explicit ffmpeg binary path.
# NOTE(review): the path is hard-coded; confirm it exists on other hosts.
AudioSegment.converter = "/usr/bin/ffmpeg"

# Download and load TTS models. These calls run at import time, so the module
# does not finish loading until both download() calls have returned. The value
# each download() returns is passed straight to TTS() below.
swahili_dir = download("swh", "./data/swahili")
english_dir = download("eng", "./data/english")  # Ensure an English TTS model is available

# Module-level TTS instances, shared by every call to text_to_speech().
swahili_tts = TTS(swahili_dir)
english_tts = TTS(english_dir)

# Function to process mixed-language text
def text_to_speech(text):
    """Synthesize mixed Swahili/English text into a single WAV file.

    The text is split on whitespace and every word is synthesized on its
    own: words that ``detect`` labels ``"sw"`` go to the Swahili model and
    everything else (including words whose language cannot be detected)
    goes to the English model. The per-word clips are concatenated in their
    original order and exported as one file.

    Args:
        text: The text to speak. May be empty or ``None``.

    Returns:
        The path of the exported WAV file (``"./output.wav"``), or ``None``
        when ``text`` contains no words and there is nothing to synthesize.
    """
    words = text.split() if text else []
    if not words:
        # With no clips, sum([]) is the int 0, which has no export() method;
        # return early instead of crashing on empty input.
        return None

    audio_clips = []
    # Per-word files live in a private scratch directory and are named by
    # position, never by the word itself: user text may contain "/" or other
    # characters that are invalid or unsafe in a file name. The context
    # manager removes the directory even if synthesis or decoding raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for index, word in enumerate(words):
            try:
                lang = detect(word)  # Detect language of each word
            except LangDetectException:
                # Tokens such as digits or punctuation have no detectable
                # language; fall through to the English (default) branch.
                lang = None

            tts = swahili_tts if lang == "sw" else english_tts
            wav_path = os.path.join(scratch_dir, f"{index}.wav")
            tts.synthesis(word, wav_path=wav_path)
            audio_clips.append(AudioSegment.from_wav(wav_path))

    # Combine all audio clips (the list is non-empty here, so sum() yields an
    # AudioSegment rather than its integer start value).
    final_audio = sum(audio_clips)
    output_path = "./output.wav"
    final_audio.export(output_path, format="wav")

    return output_path

# Gradio UI: one text box in, one audio player out, wired to text_to_speech.
text_input = gr.Textbox(label="Enter Text")
speech_output = gr.Audio(label="Generated Speech")

demo = gr.Interface(
    fn=text_to_speech,
    inputs=text_input,
    outputs=speech_output,
    title="Swahili & English Text-to-Speech",
    description="Type text in Swahili and English, and listen to the mixed-language speech.",
)

# Start the web server for the interface.
demo.launch()