File size: 1,921 Bytes
2707e70
f7f6e43
 
 
 
2707e70
ba00a43
 
 
 
f7f6e43
ba00a43
 
 
 
f5c319c
 
f7f6e43
 
 
aa1b2f5
467f7e2
 
 
 
 
 
 
 
 
f7f6e43
 
 
467f7e2
 
 
 
 
 
f7f6e43
 
 
 
 
 
 
 
 
 
 
 
2707e70
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import gradio as gr
from transformers import BarkModel, AutoProcessor
import torch
from scipy.io.wavfile import write as write_wav
import os

# Pick the device automatically. Half precision and CPU offloading are GPU-side
# memory optimisations, so they are applied only when CUDA is available; the
# CPU path loads the model with its default dtype and no offloading.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    # Load in fp16 to reduce GPU memory use.
    model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
    # Offloading targets a CUDA device, so it must not be requested on CPU-only hosts.
    model.enable_cpu_offload()
else:
    model = BarkModel.from_pretrained("suno/bark-small").to(device)

processor = AutoProcessor.from_pretrained("suno/bark")

# Default voice preset.
voice_preset = "v2/en_speaker_3"

# Earlier version of generate_audio, kept for reference (it does not move the inputs to `device`):
# def generate_audio(text, preset, output_file_name="bark_generation"):
#     file_name = output_file_name + ".wav"
#     inputs = processor(text, voice_preset=preset)
#     audio_array = model.generate(**inputs)
#     audio_array = audio_array.cpu().numpy().squeeze()   
#     sample_rate = model.generation_config.sample_rate
#     write_wav(file_name, sample_rate, audio_array)
#     return file_name

def generate_audio(text, preset, output_file_name="bark_generation"):
    """Synthesize speech for ``text`` with Bark and save it as a WAV file.

    Args:
        text: The text to speak.
        preset: Voice preset forwarded to the processor as ``voice_preset``
            (for example ``"v2/en_speaker_3"``).
        output_file_name: Base name of the output file, without the ``.wav``
            extension. Blank values fall back to ``"bark_generation"``, and
            any directory component is dropped so the file is always written
            to the current working directory.

    Returns:
        The name of the WAV file that was written.
    """
    # The name is typed into a web form: an empty box arrives as "" (not as a
    # missing argument), and the text must not be able to pick another directory.
    base_name = os.path.basename((output_file_name or "").strip()) or "bark_generation"
    file_name = base_name + ".wav"

    inputs = processor(text, voice_preset=preset)

    # Ensure the top-level tensors are on the right device. Iterate over a
    # snapshot because entries are reassigned inside the loop.
    for key, value in list(inputs.items()):
        if isinstance(value, torch.Tensor):
            inputs[key] = value.to(device)

    audio_array = model.generate(**inputs)
    # A model loaded in float16 yields a float16 waveform, which scipy's WAV
    # writer does not support; cast to float32 before converting to NumPy.
    audio_array = audio_array.cpu().float().numpy().squeeze()
    sample_rate = model.generation_config.sample_rate
    write_wav(file_name, sample_rate, audio_array)
    return file_name


# Bark voice presets offered in the UI.
presets = [
    "v2/en_speaker_0",
    "v2/en_speaker_1",
    "v2/en_speaker_2",
    "v2/en_speaker_3",
    "v2/en_speaker_4",
    "v2/en_speaker_5",
    "v2/en_speaker_6",
]

# Gradio interface: (text, voice preset, output file base name) -> audio file.
# The dropdown starts on the module's default preset so generate_audio never
# receives None for the voice, and the file-name box starts with the same
# default the function itself uses.
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        "text",
        gr.components.Dropdown(choices=presets, value=voice_preset),
        gr.components.Textbox(value="bark_generation"),
    ],
    outputs="audio",
)
iface.launch()