ganga4364 committed on
Commit
bfb8ce2
·
verified ·
1 Parent(s): f1e65b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -8
app.py CHANGED
@@ -4,15 +4,19 @@ import soundfile as sf
4
  import uuid
5
  import datetime
6
  import shutil
7
- from ttsmms import download
8
- from ttsmms import TTS
 
9
 
10
  # Description for the Gradio interface
11
- this_description = """Text To Speech for Tibetan - using MMS TTS."""
12
 
13
- # Download and load the Tibetan TTS model
14
- tts_model_path = download("bod", "./data")
15
- tts = TTS(tts_model_path)
 
 
 
16
 
17
  # Custom function to split Tibetan text into sentences
18
  def prepare_sentences(text, lang="bod"):
@@ -62,9 +66,18 @@ def tts_tibetan(input_text):
62
  user_dir = f"u_{timestamp}"
63
  os.makedirs(user_dir, exist_ok=True)
64
 
65
- # Generate audio for each sentence
66
  for i, sentence in enumerate(sentences):
67
- tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(i).zfill(10)}.wav")
 
 
 
 
 
 
 
 
 
68
 
69
  # Combine the generated audio into one file
70
  combined_file_path = combine_wav(user_dir, timestamp)
 
4
  import uuid
5
  import datetime
6
  import shutil
7
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
8
+ import scipy.io.wavfile
9
+ import numpy as np
10
 
11
  # Description for the Gradio interface
12
+ this_description = """Text To Speech for Tibetan - using your fine-tuned TTS model."""
13
 
14
+ # Load your custom TTS model and processor for inference
15
+ model_id = "ganga4364/mms-tts-bod-female" # Replace with your fine-tuned model's ID
16
+
17
+
18
+ # Use the text-to-speech pipeline with the custom model
19
+ synthesiser = pipeline("text-to-speech", model_id) # NOTE(review): runs on CPU by default — pass device=0 (or device="cuda") to use a GPU
20
 
21
  # Custom function to split Tibetan text into sentences
22
  def prepare_sentences(text, lang="bod"):
 
66
  user_dir = f"u_{timestamp}"
67
  os.makedirs(user_dir, exist_ok=True)
68
 
69
+ # Generate audio for each sentence using your custom TTS model
70
  for i, sentence in enumerate(sentences):
71
+ # Perform TTS inference for each sentence
72
+ speech = synthesiser(sentence)
73
+
74
+ # Extract the audio data and sampling rate from the pipeline output
75
+ audio_data = np.array(speech["audio"])
76
+ sample_rate = speech["sampling_rate"]
77
+
78
+ # Save each sentence as a separate WAV file
79
+ wav_path = f"{user_dir}/s_{str(i).zfill(10)}.wav"
80
+ scipy.io.wavfile.write(wav_path, rate=sample_rate, data=audio_data.astype(np.int16)) # NOTE(review): pipeline audio is float in [-1, 1]; astype(np.int16) without scaling (e.g. * 32767) truncates to near-silence — verify
81
 
82
  # Combine the generated audio into one file
83
  combined_file_path = combine_wav(user_dir, timestamp)