|
import gradio as gr |
|
from ttsmms import download, TTS |
|
from langdetect import detect |
|
import os |
|
from pydub import AudioSegment |
|
from pydub.playback import play |
|
|
|
|
|
# Tell pydub which converter binary to use (fixed system path).
AudioSegment.converter = "/usr/bin/ffmpeg"

# Fetch the Swahili ("swh") and English ("eng") models into ./data; the
# values returned by download() are what TTS() is constructed from below.
# The tuple is evaluated left to right, so Swahili is fetched first.
swahili_dir, english_dir = (
    download("swh", "./data/swahili"),
    download("eng", "./data/english"),
)

# One synthesizer per language, built once at import time and shared by
# every request.
swahili_tts, english_tts = TTS(swahili_dir), TTS(english_dir)
|
|
|
|
|
def text_to_speech(text):
    """Synthesize mixed Swahili/English text into a single WAV file.

    The text is split on whitespace and each word is language-detected on
    its own. Words detected as ``"sw"`` are spoken with the Swahili voice;
    everything else (including words whose language cannot be detected)
    is spoken with the English voice. The per-word clips are concatenated
    in input order and exported to ``./output.wav``.

    Args:
        text: The text to speak. May be empty or ``None``.

    Returns:
        The path of the exported WAV file, or ``None`` when ``text``
        contains no words (so there is nothing to synthesize).
    """
    words = text.split() if text else []
    if not words:
        # sum([]) would be the int 0, which has no .export(); bail out early.
        return None

    # Local imports keep this block self-contained; both are only needed
    # once there is something to synthesize.
    import tempfile
    from langdetect import LangDetectException

    audio_clips = []
    # A private scratch directory guarantees cleanup even if synthesis or
    # loading raises, and avoids building file names from user input
    # (a word such as "a/b" is not a valid file name component).
    with tempfile.TemporaryDirectory() as scratch_dir:
        for index, word in enumerate(words):
            try:
                lang = detect(word)
            except LangDetectException:
                # Raised for tokens with no usable features (digits,
                # punctuation); fall through to the English voice.
                lang = None

            tts = swahili_tts if lang == "sw" else english_tts
            wav_path = os.path.join(scratch_dir, f"{index}.wav")
            tts.synthesis(word, wav_path=wav_path)
            # The clip is loaded into memory here, so the file can be
            # discarded together with the scratch directory.
            audio_clips.append(AudioSegment.from_wav(wav_path))

    final_audio = sum(audio_clips)
    output_path = "./output.wav"
    final_audio.export(output_path, format="wav")

    return output_path
|
|
|
|
|
# Build the web UI: one text box in, one audio player out, backed by
# text_to_speech. The interface is launched as soon as this module runs.
demo = gr.Interface(
    fn=text_to_speech,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=gr.Audio(label="Generated Speech"),
    title="Swahili & English Text-to-Speech",
    description="Type text in Swahili and English, and listen to the mixed-language speech.",
)
demo.launch()
|
|