Spaces:

okewunmi
/

tts

Running

App Files Files Community

tts / app.py

okewunmi

Update app.py

6c2dbc0 verified 23 days ago

raw

history blame contribute delete

9.13 kB

	import os
	import sys
	import gradio as gr
	import torch
	import torchaudio
	import uroman
	import numpy as np
	import requests
	import hashlib
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from outetts.wav_tokenizer.decoder import WavTokenizer

	# Set up logging
	# Configure the root logger at INFO level; every record carries a timestamp,
	# the logger name, the level and the message.
	import logging
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
	# Module-level logger used by the functions defined below.
	logger = logging.getLogger(__name__)

	# Clone YarnGPT at startup
	import subprocess

	if not os.path.exists("yarngpt"):
	    logger.info("Cloning YarnGPT repository...")
	    # check=True makes a failed clone (no git, no network) stop startup here
	    # with a CalledProcessError instead of surfacing later as an ImportError.
	    # The argument list form avoids going through a shell.
	    subprocess.run(
	        ["git", "clone", "https://github.com/saheedniyi02/yarngpt.git"],
	        check=True,
	    )
	# Add the repository to Python path (needed whether or not we just cloned it)
	sys.path.append("yarngpt")

	# Import the YarnGPT AudioTokenizer
	from yarngpt.audiotokenizer import AudioTokenizerV2

	# Constants and paths
	# Hugging Face model id passed to both AudioTokenizerV2 and AutoModelForCausalLM.
	MODEL_PATH = "saheedniyi/YarnGPT2b"
	# NOTE(review): the config is fetched from the "medium-speech" repository while
	# the checkpoint comes from the "large-speech" repository — presumably the
	# pairing YarnGPT expects; confirm against the YarnGPT instructions.
	WAV_TOKENIZER_CONFIG_URL = "https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
	WAV_TOKENIZER_MODEL_URL = "https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt"
	# Local file names (relative to the working directory) the downloads are saved under.
	WAV_TOKENIZER_CONFIG_PATH = "wavtokenizer_config.yaml"
	WAV_TOKENIZER_MODEL_PATH = "wavtokenizer_model.ckpt"

	# Function to download files with verification
	def download_file(url, output_path):
	    """Stream ``url`` to ``output_path`` and report whether data was written.

	    The payload is first written to ``<output_path>.part`` and only moved
	    into place once the transfer has finished, so an interrupted download
	    never leaves a truncated file at ``output_path``.

	    Args:
	        url: HTTP(S) address to fetch.
	        output_path: Destination file path.

	    Returns:
	        True if a non-empty file was downloaded and moved to
	        ``output_path``; False if the response body was empty.

	    Raises:
	        requests.RequestException: On connection errors, timeouts or an
	            error status (raised by ``raise_for_status``).
	        OSError: If the file cannot be written or moved into place.
	    """
	    logger.info(f"Downloading {url} to {output_path}")

	    partial_path = f"{output_path}.part"
	    downloaded = 0
	    try:
	        # The timeout bounds the connect phase and each individual read, not
	        # the whole transfer, so large files still download but a stalled
	        # connection no longer hangs startup forever.
	        with requests.get(url, stream=True, timeout=30) as response:
	            response.raise_for_status()
	            total_size = int(response.headers.get('content-length', 0))

	            next_report = 10  # next percentage threshold worth logging
	            with open(partial_path, 'wb') as f:
	                for chunk in response.iter_content(chunk_size=1024 * 1024):
	                    if not chunk:
	                        continue
	                    f.write(chunk)
	                    downloaded += len(chunk)
	                    # Progress is only meaningful when the server announced
	                    # a size; log once per 10% step crossed rather than on
	                    # every chunk.
	                    if total_size > 0:
	                        percent = 100 * downloaded // total_size
	                        if percent >= next_report:
	                            logger.info(f"Download progress: {percent}%")
	                            next_report = (percent // 10 + 1) * 10

	        # Verify that something was actually received before publishing it
	        if downloaded == 0:
	            logger.error(f"Failed to download {output_path}")
	            return False
	        os.replace(partial_path, output_path)
	    finally:
	        # Remove the partial file on every failure path; after a successful
	        # os.replace it no longer exists.
	        try:
	            os.remove(partial_path)
	        except FileNotFoundError:
	            pass

	    logger.info(f"Successfully downloaded {output_path}")
	    return True

	# Download the required files
	def download_required_files():
	    """Make sure the WavTokenizer config and checkpoint are present locally.

	    Each file is downloaded only when it is missing or zero bytes long.
	    Afterwards both files are re-checked for existence and content.

	    Raises:
	        RuntimeError: If a download reports failure, or if either file is
	            missing or empty after the download step.
	    """
	    wanted = (
	        (WAV_TOKENIZER_CONFIG_URL, WAV_TOKENIZER_CONFIG_PATH, "config"),
	        (WAV_TOKENIZER_MODEL_URL, WAV_TOKENIZER_MODEL_PATH, "model"),
	    )

	    for source_url, local_path, label in wanted:
	        # Skip anything that is already on disk with content.
	        if os.path.exists(local_path) and os.path.getsize(local_path) != 0:
	            continue
	        logger.info(f"Downloading WavTokenizer {label}...")
	        fetched = download_file(source_url, local_path)
	        if not fetched:
	            raise RuntimeError(f"Failed to download WavTokenizer {label}")

	    local_paths = [local_path for _, local_path, _ in wanted]

	    # Verify files exist
	    if not all(os.path.exists(p) for p in local_paths):
	        raise RuntimeError("Required files not found")

	    # Verify files have content
	    if any(os.path.getsize(p) == 0 for p in local_paths):
	        raise RuntimeError("Downloaded files are empty")

	    logger.info("All required files are downloaded and verified")

	# Initialize the model and tokenizer
	def initialize_model():
	    """Fetch the tokenizer assets, then build the audio tokenizer and model.

	    Returns:
	        A ``(model, audio_tokenizer)`` tuple; the model has been moved to
	        the device reported by the audio tokenizer.

	    Raises:
	        Exception: Any failure is logged and re-raised unchanged.
	    """
	    try:
	        # Download required files
	        download_required_files()

	        logger.info("Initializing AudioTokenizer...")
	        tokenizer = AudioTokenizerV2(
	            MODEL_PATH, WAV_TOKENIZER_MODEL_PATH, WAV_TOKENIZER_CONFIG_PATH
	        )

	        logger.info("Loading YarnGPT model...")
	        language_model = AutoModelForCausalLM.from_pretrained(
	            MODEL_PATH, torch_dtype="auto"
	        )
	        language_model = language_model.to(tokenizer.device)

	        logger.info("Model initialization complete!")
	    except Exception as err:
	        logger.error(f"Failed to initialize model: {str(err)}")
	        raise

	    return language_model, tokenizer

	# Initialize the model and tokenizer
	logger.info("Starting model initialization...")
	try:
	    model, audio_tokenizer = initialize_model()
	except Exception as e:
	    logger.error(f"Error initializing model: {str(e)}")

	    # Provide a basic interface to show the error: whatever the visitor
	    # types, the reply is the initialization failure message.
	    show_failure = lambda x: f"Model initialization failed: {str(e)}. Please check the space logs for more details."
	    failure_input = gr.Textbox(label="Error occurred during initialization")
	    failure_output = gr.Textbox()
	    demo = gr.Interface(
	        fn=show_failure,
	        inputs=failure_input,
	        outputs=failure_output,
	        title="YarnGPT - Initialization Error",
	    )
	    demo.launch()

	    # Exit the script with a non-zero status once the fallback UI returns
	    sys.exit(1)

	# Available voices and languages
	# These lists populate the "Voice" and "Language" dropdowns; the selected values
	# are passed to audio_tokenizer.create_prompt as speaker_name and lang.
	# NOTE(review): confirm every name here is accepted by AudioTokenizerV2 for the
	# YarnGPT2b model — the tokenizer's supported speakers are not visible in this file.
	VOICES = ["idera", "jude", "kemi", "tunde", "funmi"]
	LANGUAGES = ["english", "yoruba", "igbo", "hausa", "pidgin"]

	# Function to generate speech
	def generate_speech(text, language, voice, temperature=0.1, rep_penalty=1.1):
	    """Synthesize ``text`` to a WAV file and return its path with a status line.

	    Args:
	        text: Text to speak. Empty, ``None`` or whitespace-only input is
	            rejected without touching the model.
	        language: Language name forwarded to the tokenizer as ``lang``.
	        voice: Speaker name forwarded to the tokenizer as ``speaker_name``.
	        temperature: Forwarded to ``model.generate``.
	        rep_penalty: Forwarded to ``model.generate`` as ``repetition_penalty``.

	    Returns:
	        ``(audio_path, status)``. ``audio_path`` is the path of the written
	        24 kHz WAV file, or ``None`` when the input was rejected or
	        generation failed; ``status`` is a human-readable message.
	    """
	    import tempfile

	    if not text or not text.strip():
	        return None, "Please enter some text to convert to speech."

	    try:
	        logger.info(f"Generating speech for text: {text[:50]}...")

	        # Create prompt
	        prompt = audio_tokenizer.create_prompt(text, lang=language, speaker_name=voice)

	        # Tokenize prompt
	        input_ids = audio_tokenizer.tokenize_prompt(prompt)

	        # Generate output
	        output = model.generate(
	            input_ids=input_ids,
	            temperature=temperature,
	            repetition_penalty=rep_penalty,
	            max_length=4000,
	        )

	        # Convert to audio
	        codes = audio_tokenizer.get_codes(output)
	        audio = audio_tokenizer.get_audio(codes)

	        # Save audio to a unique file per request so that concurrent requests
	        # cannot overwrite each other's output. The file is deliberately not
	        # deleted here because Gradio still has to read it after we return.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            temp_audio_path = tmp.name
	        torchaudio.save(temp_audio_path, audio, sample_rate=24000)

	        logger.info("Speech generation complete")
	        return temp_audio_path, f"Successfully generated speech for: {text[:50]}..."

	    except Exception as e:
	        # logger.exception keeps the traceback in the space logs; the UI only
	        # gets the short message.
	        logger.exception(f"Error generating speech: {str(e)}")
	        return None, f"Error generating speech: {str(e)}"

	# Example text for demonstration
	# Each row is [text, language, voice], in the same order as the inputs that
	# gr.Examples is wired to (text_input, language, voice).
	examples = [
	["Hello, my name is Claude. I am an AI assistant created by Anthropic.", "english", "idera"],
	["Báwo ni o ṣe wà? Mo ń gbádùn ọjọ́ mi.", "yoruba", "kemi"],
	["I don dey come house now, make you prepare food.", "pidgin", "jude"]
	]

	# Create the Gradio interface
	# NOTE(review): the nesting of the `with` blocks below cannot be recovered from
	# this flattened listing — confirm the Tab/Row/Column layout against the
	# original file before re-indenting.
	with gr.Blocks(title="YarnGPT - Nigerian Accented Text-to-Speech") as demo:
	gr.Markdown("# YarnGPT - Nigerian Accented Text-to-Speech")
	gr.Markdown("Generate speech with Nigerian accents using YarnGPT model.")

	with gr.Tab("Basic TTS"):
	with gr.Row():
	with gr.Column():
	# Input controls; their values are passed to generate_speech by the
	# button click handler further down.
	text_input = gr.Textbox(
	label="Text to convert to speech",
	placeholder="Enter text here...",
	lines=5
	)
	language = gr.Dropdown(
	label="Language",
	choices=LANGUAGES,
	value="english"
	)
	voice = gr.Dropdown(
	label="Voice",
	choices=VOICES,
	value="idera"
	)
	# Slider starting values mirror generate_speech's defaults (0.1 and 1.1).
	temperature = gr.Slider(
	label="Temperature",
	minimum=0.1,
	maximum=1.0,
	value=0.1,
	step=0.1
	)
	rep_penalty = gr.Slider(
	label="Repetition Penalty",
	minimum=1.0,
	maximum=2.0,
	value=1.1,
	step=0.1
	)
	generate_btn = gr.Button("Generate Speech")

	with gr.Column():
	# Output widgets receiving generate_speech's (audio path, status) pair.
	audio_output = gr.Audio(label="Generated Speech")
	status_output = gr.Textbox(label="Status")

	# Examples supply only text, language and voice, so generate_speech runs
	# with its default temperature and rep_penalty for them.
	gr.Examples(
	examples=examples,
	inputs=[text_input, language, voice],
	outputs=[audio_output, status_output],
	fn=generate_speech,
	cache_examples=False
	)

	# Inputs are listed in the order of generate_speech's parameters.
	generate_btn.click(
	generate_speech,
	inputs=[text_input, language, voice, temperature, rep_penalty],
	outputs=[audio_output, status_output]
	)

	gr.Markdown("""
	## About YarnGPT
	YarnGPT is a text-to-speech model with Nigerian accents. It supports multiple languages and voices.

	### Credits
	- Model by [saheedniyi](https://huggingface.co/saheedniyi/YarnGPT2b)
	- [Original Repository](https://github.com/saheedniyi02/yarngpt)
	""")

	# Launch the app
	# Start the Gradio server only when this file is executed as a script, not
	# when it is imported as a module.
	if __name__ == "__main__":
	demo.launch()