Spaces:

Futuresony
/

Automatic-speech-recognition

Sleeping

Create app.py

3bb7afd verified 5 months ago

1.49 kB

	import os
	import tempfile

	import gradio as gr
	from langdetect import LangDetectException, detect
	from pydub import AudioSegment
	from pydub.playback import play
	from ttsmms import download, TTS

	# Point pydub at the system ffmpeg binary so audio handling works inside
	# Hugging Face Spaces.
	AudioSegment.converter = "/usr/bin/ffmpeg"

	# Fetch the model files for both languages first (Swahili, then English);
	# the English model voices every word that is not detected as Swahili.
	swahili_dir, english_dir = (
	    download("swh", "./data/swahili"),
	    download("eng", "./data/english"),
	)

	# Build one synthesizer per language from the downloaded locations.
	swahili_tts, english_tts = TTS(swahili_dir), TTS(english_dir)

	# Function to process mixed-language text
	def text_to_speech(text):
	    """Synthesize mixed Swahili/English text into a single WAV file.

	    The text is split on whitespace and each word is language-detected on
	    its own. Words detected as Swahili ("sw") are voiced by ``swahili_tts``;
	    every other word (including words whose language cannot be detected) is
	    voiced by ``english_tts``. The per-word clips are concatenated in order.

	    Args:
	        text: The text to speak. May be empty or ``None``.

	    Returns:
	        The path of the exported WAV file (``"./output.wav"``), or ``None``
	        when the text contains no words.
	    """
	    words = (text or "").split()
	    if not words:
	        # sum() over zero clips would be the int 0, which has no .export().
	        return None

	    audio_clips = []
	    # Scratch files live in a private directory and are named by position,
	    # never by the word itself: the word is user input and may contain
	    # path separators or other characters that are unsafe in a filename.
	    # The directory (and anything left in it after an error) is removed
	    # when the with-block exits.
	    with tempfile.TemporaryDirectory() as scratch_dir:
	        for index, word in enumerate(words):
	            try:
	                lang = detect(word)
	            except LangDetectException:
	                # Tokens without usable features (digits, punctuation)
	                # cannot be classified; fall through to the English voice.
	                lang = None

	            tts = swahili_tts if lang == "sw" else english_tts
	            wav_path = os.path.join(scratch_dir, f"word_{index}.wav")
	            tts.synthesis(word, wav_path=wav_path)

	            # The clip is read into memory here, so the file is no longer
	            # needed once the scratch directory is cleaned up.
	            audio_clips.append(AudioSegment.from_wav(wav_path))

	    # Combine all audio clips (the list is non-empty at this point).
	    final_audio = sum(audio_clips)
	    output_path = "./output.wav"
	    final_audio.export(output_path, format="wav")

	    return output_path

	# Gradio UI
	# Gradio UI: a single text box in, the generated audio file out.
	demo = gr.Interface(
	    fn=text_to_speech,
	    title="Swahili & English Text-to-Speech",
	    description="Type text in Swahili and English, and listen to the mixed-language speech.",
	    inputs=gr.Textbox(label="Enter Text"),
	    outputs=gr.Audio(label="Generated Speech"),
	)
	demo.launch()