"""Gradio app that reads mixed Swahili/English text aloud with MMS TTS models."""

import os
import tempfile

import gradio as gr
from ttsmms import download, TTS
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from pydub import AudioSegment
from pydub.playback import play  # noqa: F401  (unused here; import kept as in the original file)

# langdetect is randomized unless a seed is set; fix it so the same word is
# always routed to the same voice.
DetectorFactory.seed = 0

# Ensure ffmpeg works inside Hugging Face Spaces
AudioSegment.converter = "/usr/bin/ffmpeg"

# Download and load TTS models
swahili_dir = download("swh", "./data/swahili")
english_dir = download("eng", "./data/english")  # Ensure an English TTS model is available
swahili_tts = TTS(swahili_dir)
english_tts = TTS(english_dir)


def _select_tts(word):
    """Return the TTS engine for *word*: Swahili if detected as "sw", else English."""
    try:
        lang = detect(word)
    except LangDetectException:
        # Raised for tokens without usable features (digits, punctuation, ...).
        # Treat them like any other non-Swahili token instead of failing the
        # whole request.
        lang = None
    return swahili_tts if lang == "sw" else english_tts


def text_to_speech(text):
    """Synthesize mixed Swahili/English text into a single WAV file.

    The text is split on whitespace; each word is language-detected on its
    own, synthesized with the matching model, and the clips are concatenated
    in order.

    Args:
        text: The text entered by the user.

    Returns:
        Path to a newly created WAV file containing the combined speech.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    words = text.split() if text else []
    if not words:
        # sum([]) would be the int 0, which has no .export(); report the
        # problem to the user instead.
        raise gr.Error("Please enter some text to convert to speech.")

    audio_clips = []
    # Per-request scratch directory: file names are index-based rather than
    # built from user input (a word may contain "/" or ".."), and everything
    # is removed even when synthesis fails part-way through.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for index, word in enumerate(words):
            wav_path = os.path.join(scratch_dir, f"{index}.wav")
            _select_tts(word).synthesis(word, wav_path=wav_path)
            audio_clips.append(AudioSegment.from_wav(wav_path))

    # Combine all audio clips
    final_audio = sum(audio_clips)

    # Unique output file so overlapping requests cannot overwrite each other.
    # NOTE: the file lives in the system temp directory and is not deleted by
    # this app after Gradio has read it.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    final_audio.export(output_path, format="wav")
    return output_path


# Gradio UI
demo = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=gr.Audio(label="Generated Speech"),
    title="Swahili & English Text-to-Speech",
    description="Type text in Swahili and English, and listen to the mixed-language speech.",
)
demo.launch()