import os

import gradio as gr
import torch
from transformers import AutoProcessor, set_seed
from vocos import Vocos

from vocos_bark import BarkModel

os.environ["GRADIO_TEMP_DIR"] = "/home/yoach/spaces/tmp"

set_seed(0)


def _grab_best_device(use_gpu=True):
    # Prefer CUDA when a GPU is visible, otherwise fall back to CPU.
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return device


device = _grab_best_device()

HUB_PATH = "suno/bark"

processor = AutoProcessor.from_pretrained(HUB_PATH)
speaker_embeddings = sorted(
    [key for key in processor.speaker_embeddings.keys() if "speaker" in key]
)

SAMPLE_RATE = 24_000

# Vocos vocoder trained to decode 24 kHz EnCodec tokens.
vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2").to(device)

# Load the Bark model; on GPU, move it to the device and convert to BetterTransformer.
if device == "cpu":
    bark = BarkModel.from_pretrained(HUB_PATH)
else:
    bark = BarkModel.from_pretrained(HUB_PATH).to(device)
    bark = bark.to_bettertransformer()


# Inference: generate Bark's fine tokens once, then decode them with both EnCodec and Vocos.
def generate_audio(text, voice_preset=None, lag=0):
    if voice_preset not in speaker_embeddings:
        voice_preset = None

    sentences = [text]
    inputs = processor(sentences, voice_preset=voice_preset).to(device)

    # Generate the fine (EnCodec) tokens; they are decoded twice below.
    fine_output = bark.generate(
        **inputs, coarse_temperature=0.8, temperature=0.5, do_sample=True
    )
    print("Fine tokens generated")

    with torch.no_grad():
        # Decode with Bark's built-in EnCodec decoder.
        encodec_waveform = bark.codec_decode(fine_output)

        # Decode the same tokens with Vocos.
        features = vocos.codes_to_features(fine_output.transpose(0, 1))
        vocos_waveform = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))

    return (
        (SAMPLE_RATE, encodec_waveform.cpu().squeeze().numpy()),
        (SAMPLE_RATE, vocos_waveform.cpu().squeeze().numpy()),
    )


# Gradio Blocks demo
with gr.Blocks() as demo_blocks:
    gr.Markdown("# 🐶BARK with Vocos")
    gr.HTML("<h3>📢Vocos-enhanced TTS 🦾!</h3>")

    with gr.Group():
        with gr.Row():
            inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
            dd = gr.Dropdown(
                speaker_embeddings,
                value=None,
                label="Available voice presets",
                info="Defaults to no speaker embeddings!",
            )
        with gr.Row():
            btn = gr.Button("Bark with Vocos TTS")
        with gr.Row():
            out_audio_encodec = gr.Audio(type="numpy", autoplay=False, label="original output", show_label=True)
            out_audio_vocos = gr.Audio(type="numpy", autoplay=False, label="vocos enhanced output", show_label=True)

    btn.click(generate_audio, [inp_text, dd], [out_audio_encodec, out_audio_vocos])

demo_blocks.queue().launch(debug=True)