Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,62 +1,42 @@
|
|
1 |
import gradio as gr
|
2 |
-
from transformers import pipeline
|
3 |
-
import scipy.io.wavfile
|
4 |
import os
|
|
|
|
|
5 |
import datetime
|
6 |
import shutil
|
7 |
-
|
8 |
-
import
|
9 |
-
import numpy as np # Add numpy to handle audio data
|
10 |
|
11 |
-
|
12 |
-
|
13 |
|
14 |
-
#
|
15 |
-
|
16 |
-
|
17 |
|
18 |
-
#
|
19 |
-
def prepare_sentences(text):
|
20 |
-
|
21 |
|
22 |
-
#
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
# Create a unique directory for storing audio chunks
|
28 |
-
current_datetime = datetime.datetime.now()
|
29 |
-
timestamp = current_datetime.strftime("%Y%m%d%H%M%S%f")
|
30 |
-
user_dir = f"u_{timestamp}"
|
31 |
-
os.makedirs(user_dir, exist_ok=True)
|
32 |
-
|
33 |
-
audio_files = []
|
34 |
-
|
35 |
-
for i, sentence in enumerate(sentences):
|
36 |
-
# Perform TTS inference for each sentence
|
37 |
-
print(f"Processing sentence {i+1}: {sentence}")
|
38 |
-
speech = synthesiser(sentence)
|
39 |
-
|
40 |
-
# Extract the audio data and sampling rate from the pipeline output
|
41 |
-
audio_data = np.array(speech["audio"]) # Ensure the data is a NumPy array
|
42 |
-
sample_rate = speech["sampling_rate"]
|
43 |
-
|
44 |
-
# Save each sentence as a separate audio file
|
45 |
-
wav_path = f"{user_dir}/s_{str(i).zfill(10)}.wav"
|
46 |
-
print(f"Saving audio to {wav_path}")
|
47 |
-
scipy.io.wavfile.write(wav_path, rate=sample_rate, data=audio_data.astype(np.int16)) # Ensure 16-bit format for WAV
|
48 |
-
audio_files.append(wav_path)
|
49 |
-
|
50 |
-
# Combine all audio files into one file
|
51 |
-
combined_file_path = combine_wav(user_dir, timestamp)
|
52 |
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
# Function to combine all WAV files into
|
56 |
def combine_wav(source_dir, stamp):
|
57 |
# Get a list of all WAV files in the folder
|
58 |
wav_files = [file for file in os.listdir(source_dir) if file.endswith(".wav")]
|
59 |
-
|
60 |
# Sort the files alphabetically to ensure the correct order of combination
|
61 |
wav_files.sort()
|
62 |
|
@@ -65,28 +45,47 @@ def combine_wav(source_dir, stamp):
|
|
65 |
sr = None
|
66 |
for file in wav_files:
|
67 |
file_path = os.path.join(source_dir, file)
|
68 |
-
print(f"Combining {file_path}")
|
69 |
data, sample_rate = sf.read(file_path)
|
70 |
if sr is None:
|
71 |
sr = sample_rate # Set the sample rate based on the first file
|
72 |
combined_data.extend(data)
|
73 |
-
|
74 |
# Save the combined audio to a new WAV file
|
75 |
combined_file_path = f"{stamp}_combined.wav"
|
76 |
-
sf.write(combined_file_path,
|
77 |
-
|
78 |
# Clean up temporary files
|
79 |
shutil.rmtree(source_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
return combined_file_path
|
82 |
|
83 |
# Create the Gradio interface
|
84 |
iface = gr.Interface(
|
85 |
-
fn=
|
86 |
inputs="text",
|
87 |
outputs="audio", # Output should be the combined audio file
|
88 |
title="Tibetan TTS Model",
|
89 |
-
description=
|
90 |
)
|
91 |
|
92 |
# Launch the Gradio interface
|
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
import os
|
3 |
+
import soundfile as sf
|
4 |
+
import uuid
|
5 |
import datetime
|
6 |
import shutil
|
7 |
+
from ttsmms import download
|
8 |
+
from ttsmms import TTS
|
|
|
9 |
|
10 |
+
# Description for the Gradio interface
|
11 |
+
this_description = """Text To Speech for Tibetan - using MMS TTS."""
|
12 |
|
13 |
+
# Download and load the Tibetan TTS model
|
14 |
+
tts_model_path = download("bod", "./data")
|
15 |
+
tts = TTS(tts_model_path)
|
16 |
|
17 |
+
# Function to prepare sentences (here you can use sentence splitting if needed)
|
18 |
+
def prepare_sentences(text, lang="bod"):
    """Split *text* into a flat list of non-empty sentences.

    Fixes a NameError in the original: it called ``nltk_sent_tokenize``,
    which is never imported or defined in this file. A regex splitter is
    used instead, covering Western terminators (., !, ?) and the Tibetan
    shad (U+0F0D), so no extra dependency is needed.

    Parameters
    ----------
    text : str
        Raw user input; may contain multiple newline-separated paragraphs.
    lang : str
        Kept for backward compatibility with existing callers; currently
        unused (the model loaded above is already Tibetan-specific).

    Returns
    -------
    list[str]
        Stripped, non-empty sentences in document order.
    """
    import re  # local import keeps this function self-contained

    # Original author's note: lowercasing reportedly fixes unclear
    # pronunciation of the first word — preserved as-is.
    text = text.lower()

    # Drop blank paragraphs so empty lines never become "sentences".
    paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]

    # Split after a sentence terminator followed by whitespace; the
    # terminator stays attached to the sentence it ends.
    splitter = re.compile(r"(?<=[.!?\u0F0D])\s+")

    return [
        sentence.strip()
        for paragraph in paragraphs
        for sentence in splitter.split(paragraph)
        if sentence.strip()
    ]
|
33 |
+
|
34 |
|
35 |
+
# Function to combine all generated WAV files into a single file
|
36 |
def combine_wav(source_dir, stamp):
|
37 |
# Get a list of all WAV files in the folder
|
38 |
wav_files = [file for file in os.listdir(source_dir) if file.endswith(".wav")]
|
39 |
+
|
40 |
# Sort the files alphabetically to ensure the correct order of combination
|
41 |
wav_files.sort()
|
42 |
|
|
|
45 |
sr = None
|
46 |
for file in wav_files:
|
47 |
file_path = os.path.join(source_dir, file)
|
|
|
48 |
data, sample_rate = sf.read(file_path)
|
49 |
if sr is None:
|
50 |
sr = sample_rate # Set the sample rate based on the first file
|
51 |
combined_data.extend(data)
|
52 |
+
|
53 |
# Save the combined audio to a new WAV file
|
54 |
combined_file_path = f"{stamp}_combined.wav"
|
55 |
+
sf.write(combined_file_path, combined_data, sr)
|
56 |
+
|
57 |
# Clean up temporary files
|
58 |
shutil.rmtree(source_dir)
|
59 |
+
|
60 |
+
return combined_file_path
|
61 |
+
|
62 |
+
# Main function to process Tibetan text and generate audio
|
63 |
+
def tts_tibetan(input_text):
    """Synthesize Tibetan speech for *input_text*.

    Splits the input into sentences, synthesizes each one to its own WAV
    chunk in a per-request scratch directory, then returns the path of the
    single combined WAV file produced by ``combine_wav``.
    """
    # Each sentence is synthesized independently, then stitched together.
    sentences = prepare_sentences(input_text)

    # A microsecond-resolution timestamp names the scratch directory, so
    # concurrent requests do not collide.
    stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    work_dir = f"u_{stamp}"
    os.makedirs(work_dir, exist_ok=True)

    # Zero-padded chunk names guarantee alphabetical order matches
    # synthesis order when combine_wav sorts them.
    for index, sentence in enumerate(sentences):
        chunk_path = f"{work_dir}/s_{str(index).zfill(10)}.wav"
        tts.synthesis(sentence, wav_path=chunk_path)

    # combine_wav also removes the scratch directory when it is done.
    return combine_wav(work_dir, stamp)
|
81 |
|
82 |
# Create the Gradio interface
|
83 |
# Wire the pipeline into a Gradio UI: one text box in, one audio player out.
iface = gr.Interface(
    fn=tts_tibetan,  # entry point: Tibetan text -> path of combined WAV
    inputs="text",
    outputs="audio",  # Output should be the combined audio file
    title="Tibetan TTS Model",
    description=this_description  # module-level description string defined above
)
|
90 |
|
91 |
# Launch the Gradio interface
|