Futuresony's picture
Create app.py
3bb7afd verified
raw
history blame
1.49 kB
import os
import tempfile

import gradio as gr
from langdetect import LangDetectException, detect
from pydub import AudioSegment
from pydub.playback import play
from ttsmms import TTS, download
# Point pydub at an explicit ffmpeg binary rather than relying on its own
# lookup. The path is hard-coded for the Hugging Face Spaces image; it will
# need adjusting on hosts where ffmpeg lives elsewhere.
AudioSegment.converter = "/usr/bin/ffmpeg"

# Fetch the "swh" and "eng" TTS model data into ./data and build one
# synthesizer per language. This runs at import time, so the first start-up
# presumably blocks on the downloads -- TODO confirm whether `download`
# reuses files that already exist on disk.
swahili_dir = download("swh", "./data/swahili")
english_dir = download("eng", "./data/english")  # English voice is used for every word not detected as Swahili
swahili_tts = TTS(swahili_dir)
english_tts = TTS(english_dir)
def text_to_speech(text):
    """Synthesize mixed Swahili/English text into a single WAV file.

    The text is split on whitespace and each word is synthesized on its own:
    words that ``langdetect`` labels ``"sw"`` go to the Swahili voice, and
    everything else (including words whose language cannot be detected) goes
    to the English voice. The per-word clips are concatenated in order.

    Args:
        text: The text to speak. May be empty or ``None``.

    Returns:
        The path of the generated WAV file (``"./output.wav"``), or ``None``
        when ``text`` contains no words, so the caller gets "no audio" instead
        of a crash.
    """
    words = text.split() if text else []
    if not words:
        # Without this guard ``sum([])`` yields the int 0, which has no
        # ``export`` method.
        return None

    audio_clips = []
    # A private scratch directory keeps the per-word files out of the working
    # directory, avoids building file names from user input (a word may
    # contain "/" or other path-unsafe characters), and is removed even when
    # synthesis fails part-way through.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for index, word in enumerate(words):
            try:
                lang = detect(word)
            except LangDetectException:
                # Raised for tokens with no detectable features (digits,
                # punctuation, ...); fall back to the English voice.
                lang = None

            tts = swahili_tts if lang == "sw" else english_tts
            wav_path = os.path.join(tmp_dir, f"word_{index}.wav")
            tts.synthesis(word, wav_path=wav_path)
            # The clip is loaded before the scratch directory is deleted,
            # mirroring the original load-then-remove order.
            audio_clips.append(AudioSegment.from_wav(wav_path))

    # AudioSegment supports ``sum`` (0 + segment), giving plain concatenation.
    final_audio = sum(audio_clips)
    output_path = "./output.wav"
    final_audio.export(output_path, format="wav")
    return output_path
# Web front end: a single text box in, a single audio player out, both wired
# to text_to_speech. The interface is built first and then served.
demo = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=gr.Audio(label="Generated Speech"),
    title="Swahili & English Text-to-Speech",
    description="Type text in Swahili and English, and listen to the mixed-language speech.",
)
demo.launch()