"""Gradio playground for several Persian VITS text-to-speech checkpoints.

At import time every checkpoint listed in ``MODEL_INFO`` is downloaded from
the Hugging Face Hub and wrapped in a CPU ``Synthesizer``; the Gradio
interface is then built and launched.
"""

import json
import os
import tempfile

import gradio as gr
from huggingface_hub import hf_hub_download
from TTS.api import TTS
from TTS.utils.synthesizer import Synthesizer

# Each entry: [display name, checkpoint filename, config filename, Hub repo id].
MODEL_INFO = [
    ["VITS Grapheme Multispeaker CV15(90K)", "best_model_56960.pth", "config.json", "saillab/multi_speaker"],
    ["VITS Grapheme Azure (61000)", "checkpoint_61000.pth", "config.json", "saillab/persian-tts-azure-grapheme-60K"],
    ["VITS Grapheme ARM24 Fine-Tuned on 1 (66651)", "best_model_66651.pth", "config.json", "saillab/persian-tts-grapheme-arm24-finetuned-on1"],
    ["VITS Grapheme ARM24 Fine-Tuned on 1 (120000)", "checkpoint_120000.pth", "config.json", "saillab/persian-tts-grapheme-arm24-finetuned-on1"],
]

# Display names, in the order they are offered in the UI.
MODEL_NAMES = [info[0] for info in MODEL_INFO]

# Inputs longer than this many characters are truncated before synthesis.
MAX_TXT_LEN = 400

# Hub access token read from the environment; None when the variable is unset.
TOKEN = os.getenv('HUGGING_FACE_HUB_TOKEN')

# Local paths of the downloaded checkpoint / config files, keyed by display name.
model_files = {}
config_files = {}

# One ready-to-use Synthesizer per model, keyed by display name.
synthesizers = {}

# Download every model up front and build its synthesizer.
for info in MODEL_INFO:
    model_name, model_file, config_file, repo_name = info[:4]

    print(f"|> Downloading: {model_name}")

    # NOTE(review): newer huggingface_hub releases may prefer `token=` over
    # `use_auth_token=` — confirm against the pinned version before changing.
    model_files[model_name] = hf_hub_download(repo_id=repo_name, filename=model_file, use_auth_token=TOKEN)
    config_files[model_name] = hf_hub_download(repo_id=repo_name, filename=config_file, use_auth_token=TOKEN)

    synthesizers[model_name] = Synthesizer(
        tts_checkpoint=model_files[model_name],
        tts_config_path=config_files[model_name],
        use_cuda=False,
    )


def synthesize(text: str, model_name: str) -> str:
    """Synthesize *text* with the chosen model and return the WAV file path.

    Args:
        text: Text to speak. Anything beyond ``MAX_TXT_LEN`` characters is
            dropped, and a notice is printed.
        model_name: Display name of the model; must be a key of
            ``synthesizers``.

    Returns:
        Path of a temporary ``.wav`` file holding the audio. The file is
        created with ``delete=False`` and is not removed by this function.

    Raises:
        NameError: If no synthesizer is registered under *model_name*.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cut off as it exceeded the {MAX_TXT_LEN} character limit.")

    # Use .get() so an unknown name reaches the NameError below; plain
    # subscripting would raise KeyError first and make the check unreachable.
    synthesizer = synthesizers.get(model_name)
    if synthesizer is None:
        raise NameError("Model not found")

    wavs = synthesizer.tts(text)

    # delete=False keeps the file on disk after the handle is closed, so the
    # returned path is still readable by the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        synthesizer.save_wav(wavs, fp)
    return fp.name


iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Textbox(label="Enter Text to Synthesize:", value="زین همرهان سست عناصر، دلم گرفت."),
        gr.Radio(label="Pick a Model", choices=MODEL_NAMES, value=MODEL_NAMES[0], type="value"),
    ],
    outputs=gr.Audio(label="Output", type='filepath'),
    examples=[["زین همرهان سست عناصر، دلم گرفت.", MODEL_NAMES[0]]],
    title='Persian TTS Playground',
    description=""" ### Persian text to speech model demo. """,
    article="",
    live=False
)

iface.launch()