# NOTE: the three lines below are file-viewer artifacts (file size, commit
# hashes, line-number gutter) captured when this file was scraped; they are
# kept as comments so the module parses as valid Python.
# File size: 1,218 Bytes
# 9659078 5a84593 f523090 5a84593 9659078 5a84593 9659078 a4b64d4 ac4deb7 8e00ffa 9659078 f523090
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
import gradio as gr
from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile
# Fetch the Bark checkpoint(s) from the Hugging Face Hub into ./checkpoints/<name>.
# snapshot_download is idempotent: already-downloaded files are reused.
model_ids = ['suno/bark']
for repo_id in model_ids:
    # The local directory is named after the repo's final path component.
    local_name = repo_id.rsplit('/', 1)[-1]
    snapshot_download(repo_id, local_dir=f'checkpoints/{local_name}')
# Coqui-TTS Bark wrapper. These imports sit mid-file so the checkpoint
# download above has already completed when the model code is first touched.
from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark
# Module-level singletons: `config` and `model` are read by infer() below.
config = BarkConfig()
model = Bark.init_from_config(config)
# eval=True puts the model in inference mode (no gradients / dropout).
model.load_checkpoint(config, checkpoint_dir="checkpoints/bark", eval=True)
def infer(prompt):
    """Synthesize speech for *prompt* using the cloned "speaker" voice.

    Expects a reference clip at ``bark_voices/speaker/speaker.wav`` (or
    ``speaker.npz``). Returns the path of the generated WAV file, which is
    the form ``gr.Audio`` accepts as an output value.
    """
    # Bug fix: the Gradio textbox input was previously ignored in favour of a
    # hard-coded sentence. Use the user's text, falling back to the original
    # demo sentence only when the box is empty.
    text = prompt or "Hello, my name is Manmay , how are you?"
    # For a random (non-cloned) speaker instead:
    # output_dict = model.synthesize(text, config, speaker_id="random", voice_dirs=None)
    output_dict = model.synthesize(text, config, speaker_id="speaker", voice_dirs="bark_voices/")
    # Bug fix: Bark generates audio at config.sample_rate (24 kHz), not
    # 44.1 kHz — writing the WAV with a hard-coded 44100 made playback
    # ~1.8x too fast. Fall back to Bark's documented 24000 if the attribute
    # is ever absent.
    sample_rate = getattr(config, "sample_rate", 24000)
    # np.asarray: scipy.io.wavfile.write requires an ndarray, and the
    # synthesize output may be a plain list.
    wavfile.write('output.wav', sample_rate, np.asarray(output_dict['wav']))
    return "output.wav"
# Minimal Gradio UI: one textbox in, one audio player out.
# Bug fix: removed the stray trailing "|" (a scrape artifact) that made this
# line a SyntaxError.
gr.Interface(fn=infer, inputs=[gr.Textbox()], outputs=[gr.Audio()]).launch()