import os
import sys
import re
import json
import torch
import inflect
import random
import uroman as ur
import numpy as np
import torchaudio
import gradio as gr
import subprocess
from transformers import AutoModelForCausalLM, AutoTokenizer
from outetts.wav_tokenizer.decoder import WavTokenizer

# Check if yarngpt is installed, if not install it manually
try:
    from yarngpt.audiotokenizer import AudioTokenizerV2
except ImportError:
    print("YarnGPT not found, attempting to install...")
    subprocess.run(["chmod", "+x", "install.sh"], check=True)
    subprocess.run(["./install.sh"], check=True)
    # Add the yarngpt directory to the Python path
    sys.path.append(os.path.join(os.getcwd(), "yarngpt"))
    # Try importing again
    from yarngpt.audiotokenizer import AudioTokenizerV2

# Check if model files exist
wav_tokenizer_config_path = "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
wav_tokenizer_model_path = "wavtokenizer_large_speech_320_24k.ckpt"

if not os.path.exists(wav_tokenizer_config_path) or not os.path.exists(wav_tokenizer_model_path):
    print("Model files not found, downloading...")
    if not os.path.exists(wav_tokenizer_config_path):
        subprocess.run([
            "wget",
            "https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
        ], check=True)
    if not os.path.exists(wav_tokenizer_model_path):
        subprocess.run([
            "wget",
            "https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt"
        ], check=True)

# Initialize paths and models
tokenizer_path = "saheedniyi/YarnGPT2"

# Add debug info
print(f"Current directory: {os.getcwd()}")
print(f"Files in directory: {os.listdir('.')}")
print(f"Config exists: {os.path.exists(wav_tokenizer_config_path)}")
print(f"Model exists: {os.path.exists(wav_tokenizer_model_path)}")

# Initialize the audio tokenizer
try:
    print("Initializing audio tokenizer...")
    audio_tokenizer = AudioTokenizerV2(
        tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path
    )
    print("Audio tokenizer initialized")
except Exception as e:
    print(f"Error initializing audio tokenizer: {str(e)}")
    raise

# Load the model
try:
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        tokenizer_path, torch_dtype="auto"
    ).to(audio_tokenizer.device)
    print("Model loaded")
except Exception as e:
    print(f"Error loading model: {str(e)}")
    raise

# Function to generate speech
def generate_speech(text, language, speaker_name, temperature=0.1, repetition_penalty=1.1):
    # Create prompt
    prompt = audio_tokenizer.create_prompt(text, lang=language, speaker_name=speaker_name)

    # Tokenize prompt
    input_ids = audio_tokenizer.tokenize_prompt(prompt)

    # Generate output
    output = model.generate(
        input_ids=input_ids,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        max_length=4000,
    )

    # Get audio codes and convert to audio
    codes = audio_tokenizer.get_codes(output)
    audio = audio_tokenizer.get_audio(codes)

    # Save audio to file
    output_path = "output.wav"
    torchaudio.save(output_path, audio, sample_rate=24000)

    return output_path

# Create Gradio interface
def tts_interface(text, language, speaker_name, temperature, repetition_penalty):
    try:
        print(f"Generating speech for: {text[:30]}...")
        audio_path = generate_speech(
            text, language, speaker_name, temperature, repetition_penalty
        )
        print("Speech generated successfully")
        return audio_path
    except Exception as e:
        print(f"Error in tts_interface: {str(e)}")
        return f"Error: {str(e)}"

# Define available languages and speakers
languages = ["english", "igbo", "yoruba", "hausa", "pidgin"]
speakers = ["idera", "enitan", "abeo", "eniola", "kachi", "aisha", "amara", "bello", "chidi"]

# Create the Gradio interface
demo = gr.Interface(
    fn=tts_interface,
    inputs=[
        gr.Textbox(
            label="Text to convert to speech",
            lines=5,
            value="Welcome to YarnGPT text-to-speech model for African languages.",
        ),
        gr.Dropdown(languages, label="Language", value="english"),
        gr.Dropdown(speakers, label="Speaker", value="idera"),
        gr.Slider(0.1, 1.0, value=0.1, label="Temperature"),
        gr.Slider(1.0, 2.0, value=1.1, label="Repetition Penalty"),
    ],
    outputs=gr.Audio(type="filepath"),
    title="YarnGPT Text-to-Speech",
    description="Convert text to speech using YarnGPT model for various African languages",
)

# Launch the app
if __name__ == "__main__":
    print("Starting Gradio interface...")
    demo.launch()