Update app.py
app.py CHANGED
@@ -1,151 +1,137 @@
 import os
-import
-import re
-import json
+import gradio as gr
 import torch
-import inflect
-import random
-import uroman as ur
-import numpy as np
 import torchaudio
-import
-import
+import uroman
+import numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from outetts.wav_tokenizer.decoder import WavTokenizer

-#
-
-print("Cloning YarnGPT repository...")
-subprocess.run(["git", "clone", "https://github.com/saheedniyi02/yarngpt.git"], check=True)
-
-# Add the yarngpt directory to the Python path
-yarngpt_path = os.path.abspath("yarngpt")
-if yarngpt_path not in sys.path:
-    sys.path.append(yarngpt_path)
-print(f"Added {yarngpt_path} to Python path")
-
-# Now try importing from yarngpt
+# Import the YarnGPT AudioTokenizer
+# Assuming the git repository is cloned in the same directory
 from yarngpt.audiotokenizer import AudioTokenizerV2

-#
-
-if not os.path.exists(wav_tokenizer_config_path):
-    print(f"Downloading {wav_tokenizer_config_path}...")
-    subprocess.run([
-        "wget",
-        "https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
-    ], check=True)
-
-if not os.path.exists(wav_tokenizer_model_path):
-    print(f"Downloading {wav_tokenizer_model_path}...")
-    subprocess.run([
-        "wget",
-        "https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt"
-    ], check=True)
+# Constants and paths
+MODEL_PATH = "saheedniyi/YarnGPT2b"
+WAV_TOKENIZER_CONFIG_PATH = "wavtokenizer_config.yaml"
+WAV_TOKENIZER_MODEL_PATH = "wavtokenizer_model.ckpt"
+
+# Download the model files at startup
+os.system(f"wget -O {WAV_TOKENIZER_CONFIG_PATH} https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml")
+os.system(f"wget -O {WAV_TOKENIZER_MODEL_PATH} https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt")
+os.system("git clone https://github.com/saheedniyi02/yarngpt.git")

-# Initialize
+# Initialize the model and tokenizer
+def initialize_model():
+    audio_tokenizer = AudioTokenizerV2(
+        MODEL_PATH,
+        WAV_TOKENIZER_MODEL_PATH,
+        WAV_TOKENIZER_CONFIG_PATH
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        torch_dtype="auto"
+    ).to(audio_tokenizer.device)
+
+    return model, audio_tokenizer

-# Initialize the
-audio_tokenizer = AudioTokenizerV2(
-    tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path
-)
-print("Audio tokenizer initialized")
+# Initialize the model and tokenizer
+model, audio_tokenizer = initialize_model()

-#
-model = AutoModelForCausalLM.from_pretrained(
-    tokenizer_path, torch_dtype="auto"
-).to(audio_tokenizer.device)
-print("Model loaded successfully")
+# Available voices and languages
+VOICES = ["idera", "jude", "kemi", "tunde", "funmi"]
+LANGUAGES = ["english", "yoruba", "igbo", "hausa", "pidgin"]

 # Function to generate speech
-def generate_speech(text, language, speaker_name, temperature, repetition_penalty):
+def generate_speech(text, language, voice, temperature=0.1, rep_penalty=1.1):
+    if not text:
+        return None, "Please enter some text to convert to speech."

-    # Create prompt
-    prompt = audio_tokenizer.create_prompt(text, lang=language, speaker_name=speaker_name)
-    print("Prompt created")
-
-    # Tokenize prompt
-    input_ids = audio_tokenizer.tokenize_prompt(prompt)
-    print("Prompt tokenized")
-
-    # Generate output
-    output = model.generate(
-        input_ids=input_ids,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        max_length=4000,
-    )
-    print("Model generation complete")
-
-    # Get audio codes and convert to audio
-    codes = audio_tokenizer.get_codes(output)
-    print("Audio codes extracted")
-
-    audio = audio_tokenizer.get_audio(codes)
-    print("Audio generated")
-
-    # Save audio to file
-    output_path = "output.wav"
-    torchaudio.save(output_path, audio, sample_rate=24000)
-    print(f"Audio saved to {output_path}")
-
-    return output_path
-
-# Create Gradio interface
-def tts_interface(text, language, speaker_name, temperature, repetition_penalty):
     try:
+        # Create prompt
+        prompt = audio_tokenizer.create_prompt(text, lang=language, speaker_name=voice)
+
+        # Tokenize prompt
+        input_ids = audio_tokenizer.tokenize_prompt(prompt)
+
+        # Generate output
+        output = model.generate(
+            input_ids=input_ids,
+            temperature=temperature,
+            repetition_penalty=rep_penalty,
+            max_length=4000,
        )
+
+        # Convert to audio
+        codes = audio_tokenizer.get_codes(output)
+        audio = audio_tokenizer.get_audio(codes)
+
+        # Save audio to file
+        temp_audio_path = "output.wav"
+        torchaudio.save(temp_audio_path, audio, sample_rate=24000)
+
+        return temp_audio_path, f"Successfully generated speech for: {text[:50]}..."
+
     except Exception as e:
-        error_details = traceback.format_exc()
-        print(f"Error in tts_interface: {str(e)}\n{error_details}")
-        return f"Error: {str(e)}"
-
-# Define available languages and speakers
-languages = ["english", "igbo", "yoruba", "hausa", "pidgin"]
-speakers = ["idera", "enitan", "abeo", "eniola", "kachi", "aisha", "amara", "bello", "chidi"]
+        return None, f"Error generating speech: {str(e)}"

 # Create the Gradio interface
-gr.
+with gr.Blocks(title="YarnGPT - Nigerian Accented Text-to-Speech") as demo:
+    gr.Markdown("# YarnGPT - Nigerian Accented Text-to-Speech")
+    gr.Markdown("Generate speech with Nigerian accents using the YarnGPT model.")
+
+    with gr.Tab("Basic TTS"):
+        with gr.Row():
+            with gr.Column():
+                text_input = gr.Textbox(
+                    label="Text to convert to speech",
+                    placeholder="Enter text here...",
+                    lines=5
+                )
+                language = gr.Dropdown(
+                    label="Language",
+                    choices=LANGUAGES,
+                    value="english"
+                )
+                voice = gr.Dropdown(
+                    label="Voice",
+                    choices=VOICES,
+                    value="idera"
+                )
+                temperature = gr.Slider(
+                    label="Temperature",
+                    minimum=0.1,
+                    maximum=1.0,
+                    value=0.1,
+                    step=0.1
+                )
+                rep_penalty = gr.Slider(
+                    label="Repetition Penalty",
+                    minimum=1.0,
+                    maximum=2.0,
+                    value=1.1,
+                    step=0.1
+                )
+                generate_btn = gr.Button("Generate Speech")
+
+            with gr.Column():
+                audio_output = gr.Audio(label="Generated Speech")
+                status_output = gr.Textbox(label="Status")
+
+    generate_btn.click(
+        generate_speech,
+        inputs=[text_input, language, voice, temperature, rep_penalty],
+        outputs=[audio_output, status_output]
+    )
+
+    gr.Markdown("""
+    ## About YarnGPT
+    YarnGPT is a text-to-speech model with Nigerian accents. It supports multiple languages and voices.
+
+    ### Credits
+    - Model by [saheedniyi](https://huggingface.co/saheedniyi/YarnGPT2b)
+    - [Original Repository](https://github.com/saheedniyi02/yarngpt)
+    """)

 # Launch the app
-
-print("Starting Gradio interface...")
-demo.launch()
+demo.launch()
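Because generate_speech now returns a (path, status) pair instead of printing at each step, it can be smoke-tested without the Gradio UI. A minimal sketch, assuming the module has been loaded up to (but not including) demo.launch() so that model and audio_tokenizer exist; the sample text here is arbitrary and not part of the commit:

    # Hypothetical smoke test for generate_speech; not part of the commit.
    audio_path, status = generate_speech(
        "Good morning, this is a test of YarnGPT.",
        language="english",
        voice="idera",  # default voice from VOICES
    )
    print(status)      # success message or "Error generating speech: ..."
    print(audio_path)  # "output.wav" on success, None on failure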
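The commit pins no dependencies; the imports imply roughly the following requirements.txt, given here as an assumed sketch rather than a file in the repository (yarngpt itself is cloned from GitHub at startup rather than installed from PyPI):

    # Assumed requirements sketch inferred from the imports in app.py.
    torch
    torchaudio
    transformers
    gradio
    numpy
    uroman
    outetts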