import gradio as gr import os import shutil #from huggingface_hub import snapshot_download import numpy as np from scipy.io import wavfile """ model_ids = [ 'suno/bark', ] for model_id in model_ids: model_name = model_id.split('/')[-1] snapshot_download(model_id, local_dir=f'checkpoints/{model_name}') from TTS.tts.configs.bark_config import BarkConfig from TTS.tts.models.bark import Bark #os.environ['CUDA_VISIBLE_DEVICES'] = '1' config = BarkConfig() model = Bark.init_from_config(config) model.load_checkpoint(config, checkpoint_dir="checkpoints/bark", eval=True) """ from TTS.api import TTS tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True) def infer(prompt, input_wav_file): print("SAVING THE AUDIO FILE TO WHERE IT BELONGS") # Path to your WAV file source_path = input_wav_file # Destination directory destination_directory = "bark_voices" # Extract the file name without the extension file_name = os.path.splitext(os.path.basename(source_path))[0] # Construct the full destination directory path destination_path = os.path.join(destination_directory, file_name) # Create the new directory os.makedirs(destination_path, exist_ok=True) # Move the WAV file to the new directory shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav")) """ text = prompt print("SYNTHETIZING...") # with random speaker #output_dict = model.synthesize(text, config, speaker_id="random", voice_dirs=None) # cloning a speaker. 
# It assumes that you have a speaker file in `bark_voices/speaker_n/speaker.wav` or `bark_voices/speaker_n/speaker.npz` output_dict = model.synthesize( text, config, speaker_id=f"{file_name}", voice_dirs="bark_voices/", gpu=True ) print(output_dict) sample_rate = 24000 # Replace with the actual sample rate print("WRITING WAVE FILE") wavfile.write( 'output.wav', sample_rate, output_dict['wav'] ) """ tts.tts_to_file(text=prompt, file_path="output.wav", voice_dir="bark_voices/", speaker=f"{file_name}") # List all the files and subdirectories in the given directory contents = os.listdir(f"bark_voices/{file_name}") # Print the contents for item in contents: print(item) tts_video = gr.make_waveform(audio="output.wav") return "output.wav", tts_video, gr.update(value=f"bark_voices/{file_name}/{contents[1]}", visible=True) css = """ #col-container {max-width: 780px; margin-left: auto; margin-right: auto;} img[src*='#center'] { display: block; margin: auto; } .footer { margin-bottom: 45px; margin-top: 10px; text-align: center; border-bottom: 1px solid #e5e5e5; } .footer>p { font-size: .8rem; display: inline-block; padding: 0 10px; transform: translateY(10px); background: white; } .dark .footer { border-color: #303030; } .dark .footer>p { background: #0b0f19; } """ with gr.Blocks(css=css) as demo: with gr.Column(elem_id="col-container"): gr.Markdown("""
Clone any voice in less than 2 minutes with this Coqui TTS + Bark demo!
Upload a clean 20-second WAV file of the voice you want to clone,
type your text-to-speech prompt and hit submit!