ganga4364 committed on
Commit
bfb8ce2
·
verified ·
1 Parent(s): f1e65b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -8
app.py CHANGED
@@ -4,15 +4,19 @@ import soundfile as sf
4
  import uuid
5
  import datetime
6
  import shutil
7
- from ttsmms import download
8
- from ttsmms import TTS
 
9
 
10
  # Description for the Gradio interface
11
- this_description = """Text To Speech for Tibetan - using MMS TTS."""
12
 
13
- # Download and load the Tibetan TTS model
14
- tts_model_path = download("bod", "./data")
15
- tts = TTS(tts_model_path)
 
 
 
16
 
17
  # Custom function to split Tibetan text into sentences
18
  def prepare_sentences(text, lang="bod"):
@@ -62,9 +66,18 @@ def tts_tibetan(input_text):
62
  user_dir = f"u_{timestamp}"
63
  os.makedirs(user_dir, exist_ok=True)
64
 
65
- # Generate audio for each sentence
66
  for i, sentence in enumerate(sentences):
67
- tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(i).zfill(10)}.wav")
 
 
 
 
 
 
 
 
 
68
 
69
  # Combine the generated audio into one file
70
  combined_file_path = combine_wav(user_dir, timestamp)
 
4
  import uuid
5
  import datetime
6
  import shutil
7
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
8
+ import scipy.io.wavfile
9
+ import numpy as np
10
 
11
  # Description for the Gradio interface
12
+ this_description = """Text To Speech for Tibetan - using your fine-tuned TTS model."""
13
 
14
+ # Load your custom TTS model and processor for inference
15
+ model_id = "ganga4364/mms-tts-bod-female" # Replace with your fine-tuned model's ID
16
+
17
+
18
+ # Use the text-to-speech pipeline with the custom model
19
+ synthesiser = pipeline("text-to-speech", model_id) # NOTE(review): runs on CPU by default — pass device=0 (or device="cuda") to use a GPU
20
 
21
  # Custom function to split Tibetan text into sentences
22
  def prepare_sentences(text, lang="bod"):
 
66
  user_dir = f"u_{timestamp}"
67
  os.makedirs(user_dir, exist_ok=True)
68
 
69
+ # Generate audio for each sentence using your custom TTS model
70
  for i, sentence in enumerate(sentences):
71
+ # Perform TTS inference for each sentence
72
+ speech = synthesiser(sentence)
73
+
74
+ # Extract the audio data and sampling rate from the pipeline output
75
+ audio_data = np.array(speech["audio"])
76
+ sample_rate = speech["sampling_rate"]
77
+
78
+ # Save each sentence as a separate WAV file
79
+ wav_path = f"{user_dir}/s_{str(i).zfill(10)}.wav"
80
+ scipy.io.wavfile.write(wav_path, rate=sample_rate, data=audio_data.astype(np.int16)) # NOTE(review): pipeline audio is float in [-1, 1]; astype(np.int16) without scaling (e.g. * 32767) truncates to near-silence — verify
81
 
82
  # Combine the generated audio into one file
83
  combined_file_path = combine_wav(user_dir, timestamp)