Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,35 +2,89 @@ import gradio as gr
|
|
2 |
from transformers import pipeline
|
3 |
import scipy.io.wavfile
|
4 |
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
# Load the TTS pipeline with the specified
|
7 |
model_id = "ganga4364/mms-tts-multi-speakers"
|
8 |
synthesiser = pipeline("text-to-speech", model=model_id)
|
9 |
|
10 |
-
#
|
|
|
|
|
|
|
|
|
11 |
def generate_audio(input_text):
|
12 |
-
#
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
audio_data = speech["audio"][0]
|
17 |
-
sample_rate = speech["sampling_rate"]
|
18 |
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
#
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
# Create the Gradio interface
|
27 |
iface = gr.Interface(
|
28 |
fn=generate_audio,
|
29 |
inputs="text",
|
30 |
-
outputs="audio", # Output should be the audio file
|
31 |
title="Tibetan TTS Model",
|
32 |
-
description="Enter text to generate speech using a fine-tuned Tibetan voice model and
|
33 |
)
|
34 |
|
35 |
-
# Launch the Gradio
|
36 |
iface.launch()
|
|
|
2 |
from transformers import pipeline
|
3 |
import scipy.io.wavfile
|
4 |
from io import BytesIO
|
5 |
+
import os
|
6 |
+
import datetime
|
7 |
+
import uuid
|
8 |
+
import shutil
|
9 |
+
import soundfile as sf
|
10 |
+
import nltk
|
11 |
+
nltk.download('punkt') # Ensure that 'punkt' tokenizer is downloaded
|
12 |
+
from nltk import sent_tokenize
|
13 |
|
14 |
# Load the Hugging Face text-to-speech pipeline for the fine-tuned
# multi-speaker Tibetan MMS model. Done once at module import so the
# model weights are loaded a single time and reused across requests.
model_id = "ganga4364/mms-tts-multi-speakers"
synthesiser = pipeline("text-to-speech", model=model_id)
17 |
|
18 |
def prepare_sentences(text):
    """Split *text* into a list of sentences via NLTK's Punkt tokenizer."""
    sentences = sent_tokenize(text)
    return sentences
21 |
+
|
22 |
# Function to generate audio for each sentence and combine them
def generate_audio(input_text):
    """Synthesize speech for *input_text* and return the path to one WAV file.

    The text is split into sentences, each sentence is synthesized
    separately, and the per-sentence WAV chunks are concatenated by
    combine_wav into a single output file.

    Args:
        input_text: Raw text entered by the user.

    Returns:
        Path of the combined WAV file.

    Raises:
        ValueError: if *input_text* contains no sentences.
    """
    sentences = prepare_sentences(input_text)
    if not sentences:
        # Guard: an empty sentence list would leave combine_wav with an
        # empty directory and a None sample rate.
        raise ValueError("Input text contains no sentences to synthesize.")

    # Unique working directory per request so concurrent users don't clash.
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    user_dir = f"u_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)

    try:
        for i, sentence in enumerate(sentences):
            # Per-sentence TTS inference.
            speech = synthesiser(sentence)

            # Pipeline output: waveform array (first row) and its sample rate.
            audio_data = speech["audio"][0]
            sample_rate = speech["sampling_rate"]

            # Zero-padded names keep lexicographic order == sentence order,
            # which combine_wav relies on when it sorts the chunk files.
            wav_path = f"{user_dir}/s_{str(i).zfill(10)}.wav"
            scipy.io.wavfile.write(wav_path, rate=sample_rate, data=audio_data)

        # combine_wav also removes user_dir once the chunks are merged.
        return combine_wav(user_dir, timestamp)
    except Exception:
        # Don't leak the temp directory when synthesis fails part-way.
        shutil.rmtree(user_dir, ignore_errors=True)
        raise
|
52 |
+
|
53 |
# Function to combine all WAV files into one
def combine_wav(source_dir, stamp):
    """Concatenate every WAV file in *source_dir* into a single file.

    Chunks are joined in sorted (lexicographic) filename order, which
    matches the zero-padded names written by generate_audio. The source
    directory is deleted once the combined file has been written.

    Args:
        source_dir: Directory holding the per-sentence WAV chunks.
        stamp: Timestamp string used to name the combined output file.

    Returns:
        Path of the combined WAV file.

    Raises:
        ValueError: if *source_dir* contains no WAV files.
    """
    # Sorted to guarantee the sentences are concatenated in order.
    wav_files = sorted(
        file for file in os.listdir(source_dir) if file.endswith(".wav")
    )
    if not wav_files:
        # Without this guard sf.write below would receive samplerate=None
        # and fail with an opaque error; clean up before raising.
        shutil.rmtree(source_dir)
        raise ValueError(f"No WAV files found in {source_dir!r}.")

    combined_data = []
    sr = None
    for file in wav_files:
        data, sample_rate = sf.read(os.path.join(source_dir, file))
        if sr is None:
            # All chunks come from the same TTS model, so the first file's
            # rate is assumed to hold for the rest.
            sr = sample_rate
        combined_data.extend(data)

    combined_file_path = f"{stamp}_combined.wav"
    sf.write(combined_file_path, combined_data, sr)

    # Clean up the per-sentence chunk files now that they're merged.
    shutil.rmtree(source_dir)

    return combined_file_path
|
79 |
|
80 |
# Create the Gradio interface
|
81 |
iface = gr.Interface(
|
82 |
fn=generate_audio,
|
83 |
inputs="text",
|
84 |
+
outputs="audio", # Output should be the combined audio file
|
85 |
title="Tibetan TTS Model",
|
86 |
+
description="Enter text to generate speech using a fine-tuned Tibetan voice model. The text will be split into sentences, and the generated audio will be combined and returned."
|
87 |
)
|
88 |
|
89 |
+
# Launch the Gradio interface
|
90 |
iface.launch()
|