Spaces:

MBZUAI
/

artst-tts-demo

Runtime error

File size: 2,831 Bytes

import os
import torch
import gradio as gr
import os.path as op
import pyarabic.araby as araby

from artst.tasks.artst import ArTSTTask
from transformers import SpeechT5HifiGan
from artst.models.artst import ArTSTTransformerModel
from fairseq.tasks.hubert_pretraining import LabelEncoder
from fairseq.data.audio.speech_to_text_dataset import get_features_or_waveform 

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

WORK_DIR = os.getcwd()

# Deserialize onto the CPU explicitly. Without map_location, torch.load tries
# to restore tensors on the device they were saved from, which raises on a
# CPU-only host if the checkpoint was written from CUDA. The weights are
# copied into the freshly built model by load_state_dict() below.
checkpoint = torch.load('ckpts/clartts_tts.pt', map_location='cpu')

# Override the task settings stored in the checkpoint so they point at the
# resources shipped next to this script.
# NOTE(review): 't2s' presumably selects the text-to-speech task - confirm.
checkpoint['cfg']['task'].t5_task = 't2s'
checkpoint['cfg']['task'].hubert_label_dir = "utils/"
checkpoint['cfg']['task'].bpe_tokenizer = "utils/arabic.model"
checkpoint['cfg']['task'].data = "utils/"
task = ArTSTTask.setup_task(checkpoint['cfg']['task'])

# Default speaker-embedding file used by inference().
emb_path='embs/clartts.npy'
model = ArTSTTransformerModel.build_model(checkpoint['cfg']['model'], task)
model.load_state_dict(checkpoint['model'])
# NOTE(review): the model is neither moved to `device` nor switched to
# eval() here - confirm whether that is intended.

# The tokenizer built by the task is stored back on the task config
# (replacing the path set above) and then read out as `tokenizer`.
checkpoint['cfg']['task'].bpe_tokenizer = task.build_bpe(checkpoint['cfg']['model'])
tokenizer = checkpoint['cfg']['task'].bpe_tokenizer

# Encoder built from the task's text dictionary; applied to tokenizer output
# in process_text().
processor = LabelEncoder(task.dicts['text'])

# Pretrained HiFi-GAN vocoder, placed on the selected device.
vocoder = SpeechT5HifiGan.from_pretrained('microsoft/speecht5_hifigan').to(device)

def get_embs(emb_path):
    """Load a speaker embedding from ``emb_path`` as a float tensor.

    The array returned by ``get_features_or_waveform`` is converted to a
    float tensor and given a new leading dimension of size 1.
    """
    raw = get_features_or_waveform(emb_path)
    as_float = torch.from_numpy(raw).float()
    return as_float.unsqueeze(0)

def process_text(text):
    """Convert raw text into a single-row encoded input for the model.

    Diacritics are stripped, the text is run through the module-level
    ``tokenizer`` and ``processor``, and the result is reshaped to one row.
    """
    undiacritized = araby.strip_diacritics(text)
    pieces = tokenizer.encode(undiacritized)
    encoded = processor(pieces)
    return encoded.reshape(1, -1)

# Kept only so the module-level name still exists; inference() builds a fresh
# input dict per call instead of mutating this shared one.
net_input = {}

def inference(text, spkr=emb_path):
    """Synthesize speech for ``text`` and return it in Gradio's numpy format.

    Args:
        text: Text to synthesize; it is preprocessed by ``process_text``.
        spkr: Path to a speaker-embedding file, loaded with ``get_embs``.
            Defaults to the module-level ``emb_path``.

    Returns:
        A ``(16000, waveform)`` tuple: the hard-coded sample rate and the
        vocoder output as a numpy array.

    Raises:
        gr.Error: If ``text`` is empty or whitespace only.
    """
    # Reject blank input up front so the user gets a readable message rather
    # than a failure from deep inside the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some Arabic text to synthesize.")

    # Per-call dict: a shared module-level dict would be overwritten by
    # overlapping requests and would carry state from one call to the next.
    batch = {
        'src_tokens': process_text(text),
        'spkembs': get_embs(spkr),
    }

    # Nothing here needs gradients, so run generation and vocoding together
    # under no_grad.
    with torch.no_grad():
        outs, _, _ = task.generate_speech([model], batch)
        gen_audio = vocoder(outs.to(device))
    return (16000, gen_audio.cpu().numpy())

# Input/output widgets for the demo page.
text_box = gr.Textbox(max_lines=2, label="Arabic Text", rtl=True)
out = gr.Audio(label="Synthesized Audio", type="numpy")

title = "ArTST: Arabic Speech Synthesis"

# Built from adjacent string literals rather than backslash continuations, so
# source indentation does not leak into the text shown to users.
description = (
    "ArTST: Arabic text and speech transformer based on the T5 transformer. "
    "This space demonstrates the TTS checkpoint finetuned on the CLARTTS dataset. "
    "The model is pre-trained on the MGB-2 dataset. "
    "Check the <a href='https://github.com/mbzuai-nlp/ArTST'>ArTST repo</a> "
    "for implementation code and read our "
    "<a href='https://arxiv.org/abs/2310.16621'>paper</a> for more details."
)

# Sample sentences offered as one-click examples in the UI.
examples = [
    "لأن فراق المألوف في العادة ومجانبة ما صار متفقا عليه بالمواضعة",
    "ومن لطيف حكمته أن جعل لكل عبادة حالتين",
    "فمن لهم عدل الإنسان مع من فوقه",
]

demo = gr.Interface(
    fn=inference,
    inputs=text_box,
    outputs=out,
    title=title,
    description=description,
    examples=examples,
)

if __name__ == "__main__":
    # share=True asks Gradio to create a public share link.
    demo.launch(share=True)