import gradio as gr
import os
import shutil
from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile

model_ids = [
    'suno/bark',
]

# Download the Bark checkpoints from the Hugging Face Hub
for model_id in model_ids:
    model_name = model_id.split('/')[-1]
    snapshot_download(model_id, local_dir=f'checkpoints/{model_name}')

from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark

# Load the Bark model from the downloaded checkpoint
config = BarkConfig()
model = Bark.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="checkpoints/bark", eval=True)


def infer(prompt, input_wav_file):
    # Path to the uploaded WAV file
    source_path = input_wav_file

    # Destination directory for speaker voices
    destination_directory = "bark_voices"

    # Extract the file name without the extension
    file_name = os.path.splitext(os.path.basename(source_path))[0]

    # Construct the full destination directory path
    destination_path = os.path.join(destination_directory, file_name)

    # Create the new directory
    os.makedirs(destination_path, exist_ok=True)

    # Move the WAV file to the new directory
    shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))

    text = prompt

    # With a random speaker:
    # output_dict = model.synthesize(text, config, speaker_id="random", voice_dirs=None)

    # Cloning a speaker: this assumes a speaker file exists at
    # `bark_voices/speaker_n/speaker.wav` or `bark_voices/speaker_n/speaker.npz`
    output_dict = model.synthesize(text, config, speaker_id=f"{file_name}", voice_dirs="bark_voices/")

    print(output_dict)

    sample_rate = 24000  # Bark generates audio at 24 kHz
    wavfile.write('output.wav', sample_rate, output_dict['wav'])

    return "output.wav"


gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(label="Text to speech prompt"),
        gr.Audio(label="WAV voice to clone", type="filepath", source="upload")
    ],
    outputs=[gr.Audio()],
    title="Instant Voice Cloning"
).launch()