ganga4364 committed on
Commit
d6262cc
·
verified ·
1 Parent(s): 9490b6a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -15
app.py CHANGED
@@ -2,35 +2,89 @@ import gradio as gr
2
  from transformers import pipeline
3
  import scipy.io.wavfile
4
  from io import BytesIO
 
 
 
 
 
 
 
 
5
 
6
- # Load the TTS pipeline with the specified VITS model
7
  model_id = "ganga4364/mms-tts-multi-speakers"
8
  synthesiser = pipeline("text-to-speech", model=model_id)
9
 
10
- # Function to generate audio from input text and save it to a file
 
 
 
 
11
  def generate_audio(input_text):
12
- # Perform TTS inference
13
- speech = synthesiser(input_text)
 
 
 
 
 
 
14
 
15
- # Extract the audio data and sampling rate from the pipeline output
16
- audio_data = speech["audio"][0]
17
- sample_rate = speech["sampling_rate"]
18
 
19
- # Save the audio to a file (e.g., 'output.wav')
20
- file_path = "output.wav"
21
- scipy.io.wavfile.write(file_path, rate=sample_rate, data=audio_data)
 
 
 
 
 
 
 
 
 
22
 
23
- # Return the file path so Gradio can return the audio file
24
- return file_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  # Create the Gradio interface
27
  iface = gr.Interface(
28
  fn=generate_audio,
29
  inputs="text",
30
- outputs="audio", # Output should be the audio file
31
  title="Tibetan TTS Model",
32
- description="Enter text to generate speech using a fine-tuned Tibetan voice model and return the audio."
33
  )
34
 
35
- # Launch the Gradio app
36
  iface.launch()
 
2
  from transformers import pipeline
3
  import scipy.io.wavfile
4
  from io import BytesIO
5
+ import os
6
+ import datetime
7
+ import uuid
8
+ import shutil
9
+ import soundfile as sf
10
+ import nltk
11
+ nltk.download('punkt') # Ensure that 'punkt' tokenizer is downloaded
12
+ from nltk import sent_tokenize
13
 
14
+ # Load the TTS pipeline with the specified model
15
  model_id = "ganga4364/mms-tts-multi-speakers"
16
  synthesiser = pipeline("text-to-speech", model=model_id)
17
 
18
# Sentence segmentation helper, backed by NLTK's Punkt tokenizer.
def prepare_sentences(text):
    """Split *text* into a list of sentences using NLTK's Punkt tokenizer."""
    sentences = sent_tokenize(text)
    return sentences
21
+
22
# Function to generate audio for each sentence and combine them
def generate_audio(input_text):
    """Synthesise speech for *input_text* and return the path to one WAV file.

    The text is split into sentences, each sentence is synthesised
    separately, the per-sentence WAVs are written into a unique scratch
    directory, and finally merged into a single file by ``combine_wav``.

    Raises:
        gr.Error: if the input contains no sentences (empty or
            whitespace-only text) — otherwise ``combine_wav`` would be
            handed an empty directory and crash with a ``None`` sample rate.
    """
    # Prepare sentences from the input text
    sentences = prepare_sentences(input_text)

    # Guard: empty input would leave combine_wav with nothing to merge.
    if not sentences:
        raise gr.Error("Please enter some text to synthesise.")

    # Unique scratch directory per request so concurrent users don't clash.
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    user_dir = f"u_{timestamp}"
    os.makedirs(user_dir, exist_ok=True)

    for i, sentence in enumerate(sentences):
        # Perform TTS inference for each sentence
        speech = synthesiser(sentence)

        # Extract the audio data and sampling rate from the pipeline output
        audio_data = speech["audio"][0]
        sample_rate = speech["sampling_rate"]

        # Zero-padded index keeps lexicographic order == synthesis order,
        # which combine_wav relies on when it sorts the chunk file names.
        wav_path = f"{user_dir}/s_{str(i).zfill(10)}.wav"
        scipy.io.wavfile.write(wav_path, rate=sample_rate, data=audio_data)

    # Merge the per-sentence chunks; combine_wav also removes user_dir.
    return combine_wav(user_dir, timestamp)
52
+
53
# Function to combine all WAV files into one
def combine_wav(source_dir, stamp):
    """Concatenate every WAV file in *source_dir* and return the output path.

    Files are merged in sorted filename order (chunk names are zero-padded
    sentence indices, so lexicographic order matches synthesis order). The
    sample rate of the first file is used for the combined output; all
    chunks come from the same TTS model, so the rates are presumed equal.
    *source_dir* is deleted afterwards.

    Raises:
        ValueError: if *source_dir* contains no WAV files — writing with a
            ``None`` sample rate would otherwise crash inside soundfile.
    """
    # Sorted order matters: it restores the original sentence sequence.
    wav_files = sorted(
        file for file in os.listdir(source_dir) if file.endswith(".wav")
    )

    # Fail fast on an empty directory instead of crashing in sf.write below.
    if not wav_files:
        shutil.rmtree(source_dir)  # still clean up the empty scratch dir
        raise ValueError(f"No WAV files found in {source_dir!r}")

    # Combine the WAV files
    combined_data = []
    sr = None
    for file in wav_files:
        data, sample_rate = sf.read(os.path.join(source_dir, file))
        if sr is None:
            sr = sample_rate  # Set the sample rate based on the first file
        combined_data.extend(data)

    # Save the combined audio to a new WAV file
    combined_file_path = f"{stamp}_combined.wav"
    sf.write(combined_file_path, combined_data, sr)

    # Clean up temporary per-sentence files
    shutil.rmtree(source_dir)

    return combined_file_path
79
 
80
# Build the Gradio web UI: one text box in, one (combined) audio clip out.
iface = gr.Interface(
    fn=generate_audio,
    inputs="text",
    outputs="audio",  # the combined audio file produced by generate_audio
    title="Tibetan TTS Model",
    description="Enter text to generate speech using a fine-tuned Tibetan voice model. The text will be split into sentences, and the generated audio will be combined and returned.",
)

# Start serving the app.
iface.launch()