File size: 2,077 Bytes
9659078
5f924a4
 
9659078
5a84593
f523090
 
 
5a84593
 
 
 
 
 
 
 
9659078
 
 
 
 
5a84593
9659078
5f924a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9659078
e236784
9659078
 
a4b64d4
 
 
 
5f924a4
8e00ffa
9659078
3c31edb
f523090
 
 
 
 
 
e236784
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import gradio as gr
import os 
import shutil

from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile


# Hugging Face Hub repositories whose snapshots are fetched at start-up.
model_ids = ["suno/bark"]

# Each snapshot is stored under checkpoints/<last path segment of the repo id>.
for model_id in model_ids:
    model_name = model_id.rsplit("/", 1)[-1]
    snapshot_download(repo_id=model_id, local_dir="checkpoints/" + model_name)

from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark

# Build a Bark model from its default configuration and load the weights
# from the local checkpoint directory. `config` and `model` are module-level
# because `infer` reads both of them.
config = BarkConfig()
model = Bark.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_dir="checkpoints/bark",
    eval=True,  # presumably switches the model to inference mode -- TODO confirm
)

def infer(prompt, input_wav_file):
    """Synthesize `prompt` in a voice cloned from the uploaded recording.

    The reference recording is moved to ``bark_voices/<name>/<name>.wav``,
    where ``<name>`` is the upload's base name without its extension, and
    ``<name>`` is then passed to the model as the speaker id.

    Args:
        prompt: Text to be spoken.
        input_wav_file: Filesystem path of the reference WAV recording.

    Returns:
        Path of the generated WAV file.

    Raises:
        ValueError: If `prompt` is empty or blank, or if no reference
            recording was supplied.
    """
    # Function-scope import: keeps the block self-contained.
    import tempfile

    # Fail early with a clear message instead of an opaque TypeError from
    # os.path.basename(None) or a synthesis run on empty text.
    if not prompt or not prompt.strip():
        raise ValueError("Please enter a text prompt to synthesize.")
    if not input_wav_file:
        raise ValueError("Please upload a WAV file of the voice to clone.")

    # The speaker id is the upload's base name without extension.
    speaker_id = os.path.splitext(os.path.basename(input_wav_file))[0]

    # Voice layout expected for cloning:
    # `bark_voices/speaker_n/speaker.wav` or `bark_voices/speaker_n/speaker.npz`.
    # NOTE(review): an earlier upload with the same base name reuses this
    # directory -- verify whether voice data already stored there should be
    # invalidated when a new recording arrives.
    speaker_dir = os.path.join("bark_voices", speaker_id)
    os.makedirs(speaker_dir, exist_ok=True)
    shutil.move(input_wav_file, os.path.join(speaker_dir, f"{speaker_id}.wav"))

    # To use a random speaker instead of cloning, call synthesize with
    # speaker_id="random" and voice_dirs=None.
    output_dict = model.synthesize(
        prompt,
        config,
        speaker_id=speaker_id,
        voice_dirs="bark_voices/",
    )
    print(output_dict)

    # Hard-coded rate written into the WAV header.
    # TODO confirm it matches the model's actual output sample rate.
    sample_rate = 24000

    # Write each result to its own file: a single shared "output.wav" would
    # be overwritten by any request that runs at the same time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    wavfile.write(output_path, sample_rate, output_dict["wav"])

    return output_path

# Web UI: a text prompt plus a reference recording go in, one audio clip
# comes out. Components are created in the same order as before.
prompt_box = gr.Textbox(label="Text to speech prompt")
voice_upload = gr.Audio(
    label="WAV voice to clone",
    type="filepath",  # `infer` treats this input as a filesystem path
    source="upload",
)
demo = gr.Interface(
    fn=infer,
    inputs=[prompt_box, voice_upload],
    outputs=[gr.Audio()],
    title="Instant Voice Cloning",
)
demo.launch()