"""Gradio demo: voice cloning with Coqui XTTS-v2 running on the CPU."""

import os

# These settings are placed in the environment before the heavy libraries are
# imported: OpenMP-backed libraries read OMP_NUM_THREADS when they initialise,
# so assigning it after the imports may have no effect.
os.environ["COQUI_TOS_AGREED"] = "1"  # Agree to Coqui's terms
os.environ["OMP_NUM_THREADS"] = "2"  # CPU threads (adjust based on your CPU cores)

import tempfile  # noqa: E402
from concurrent.futures import ThreadPoolExecutor  # noqa: E402,F401

import gradio as gr  # noqa: E402
import sounddevice as sd  # noqa: E402
from scipy.io.wavfile import write  # noqa: E402
from TTS.api import TTS  # noqa: E402

# Load the model once at start-up and keep it on the CPU.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
tts.to("cpu")

# Languages offered in the UI: display name -> language code passed to the model.
LANGUAGES = {
    "English": "en",
    "Spanish": "es",
    "German": "de",
    "French": "fr",
    "Italian": "it",
    "Hindi": "hi",
    "Russian": "ru",
    "Turkish": "tr",
    "Japanese": "ja",
    "Korean": "ko",
    "Hungarian": "hu",
}


def generate_voice(text, speaker_audio, language):
    """Synthesize ``text`` using ``speaker_audio`` as the reference voice.

    Args:
        text: The text to speak.
        speaker_audio: Path to the reference speaker recording.
        language: A display name from ``LANGUAGES``; names that are not in
            the table fall back to ``"en"``.

    Returns:
        Path of the generated WAV file. The file is created in the system
        temp directory and is not removed by this function.

    Raises:
        gr.Error: If ``text`` is empty/blank or no speaker audio was given.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not speaker_audio:
        raise gr.Error("Please upload or record a speaker audio sample.")

    # NamedTemporaryFile actually creates the file, unlike the deprecated
    # tempfile.mktemp, which only returns a name that another process could
    # claim before we write to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    tts.tts_to_file(
        text=text,
        speaker_wav=speaker_audio,
        file_path=output_path,
        language=LANGUAGES.get(language, "en"),
        # NOTE(review): sample_rate is forwarded to the model as an extra
        # keyword argument; confirm XTTS-v2 accepts it, otherwise drop it.
        sample_rate=44100,
    )
    return output_path


def record_audio(duration=10, filename="mic_input.wav", fs=44100):
    """Record mono audio through ``sounddevice`` and save it as a WAV file.

    NOTE(review): ``sounddevice`` captures from an audio device on the
    machine running this process, not from a remote visitor's browser.

    Args:
        duration: Recording length in seconds.
        filename: Destination path of the WAV file (overwritten if present).
        fs: Sample rate in Hz.

    Returns:
        ``filename``, so the result can be fed straight into an audio input.
    """
    print("Recording...")
    audio_data = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()  # Block until the recording is finished
    write(filename, fs, audio_data)
    print(f"Recording saved as {filename}")
    return filename


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Voice Cloning with Coqui XTTS-v2")

    with gr.Row():
        text_input = gr.Textbox(
            label="Enter Text",
            placeholder="Type the text you want to synthesize...",
        )
        speaker_audio_input = gr.Audio(
            label="Upload Speaker Audio (WAV)",
            type="filepath",
        )
        language_dropdown = gr.Dropdown(
            label="Select Output Language",
            choices=list(LANGUAGES.keys()),
            value="English",
        )

    mic_button = gr.Button("Record from Mic")
    output_audio = gr.Audio(label="Generated Voice", type="filepath")
    generate_button = gr.Button("Generate Voice")

    # The recorded file path is pushed into the speaker-audio input.
    mic_button.click(
        fn=lambda: record_audio(duration=10),
        inputs=[],
        outputs=speaker_audio_input,
    )
    generate_button.click(
        fn=generate_voice,
        inputs=[text_input, speaker_audio_input, language_dropdown],
        outputs=output_audio,
    )


# Launch the app only when executed as a script, so importing this module
# (e.g. to reuse generate_voice) does not start a server.
if __name__ == "__main__":
    # Listens on all interfaces; share=True additionally requests a public
    # Gradio share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)