ganga4364 committed · verified
Commit aa1eb45 · 1 Parent(s): 72ea965

Update app.py

Files changed (1):
  1. app.py +51 -52
app.py CHANGED
@@ -1,62 +1,42 @@
 import gradio as gr
-from transformers import pipeline
-import scipy.io.wavfile
 import os
+import soundfile as sf
+import uuid
 import datetime
 import shutil
-import soundfile as sf
-import nltk
-import numpy as np  # Add numpy to handle audio data
+from ttsmms import download
+from ttsmms import TTS
+import nltk
+from nltk.tokenize import sent_tokenize as nltk_sent_tokenize  # needed by prepare_sentences

-nltk.download('punkt')  # Ensure that the 'punkt' tokenizer is downloaded
-from nltk import sent_tokenize
+nltk.download('punkt')  # fetch the 'punkt' sentence tokenizer data at startup
+
+# Description for the Gradio interface
+this_description = """Text To Speech for Tibetan - using MMS TTS."""

-# Load the TTS pipeline with the specified model
-model_id = "ganga4364/mms-tts-multi-speakers"
-synthesiser = pipeline("text-to-speech", model=model_id)
+# Download and load the Tibetan TTS model
+tts_model_path = download("bod", "./data")
+tts = TTS(tts_model_path)

-# Prepare sentences using NLTK for splitting into multiple sentences
-def prepare_sentences(text):
-    return sent_tokenize(text)
+# Split the input text into paragraphs, then into sentences
+def prepare_sentences(text, lang="bod"):
+    sentences = []

-# Function to generate audio for each sentence and combine them
-def generate_audio(input_text):
-    # Prepare sentences from the input text
-    sentences = prepare_sentences(input_text)
-
-    # Create a unique directory for storing audio chunks
-    current_datetime = datetime.datetime.now()
-    timestamp = current_datetime.strftime("%Y%m%d%H%M%S%f")
-    user_dir = f"u_{timestamp}"
-    os.makedirs(user_dir, exist_ok=True)
-
-    audio_files = []
-
-    for i, sentence in enumerate(sentences):
-        # Perform TTS inference for each sentence
-        print(f"Processing sentence {i+1}: {sentence}")
-        speech = synthesiser(sentence)
-
-        # Extract the audio data and sampling rate from the pipeline output
-        audio_data = np.array(speech["audio"])  # Ensure the data is a NumPy array
-        sample_rate = speech["sampling_rate"]
-
-        # Save each sentence as a separate audio file
-        wav_path = f"{user_dir}/s_{str(i).zfill(10)}.wav"
-        print(f"Saving audio to {wav_path}")
-        scipy.io.wavfile.write(wav_path, rate=sample_rate, data=audio_data.astype(np.int16))  # Ensure 16-bit format for WAV
-        audio_files.append(wav_path)
-
-    # Combine all audio files into one file
-    combined_file_path = combine_wav(user_dir, timestamp)
+    # Not sure why, but lowercasing fixes unclear pronunciation of the first word
+    text = text.lower()
+
+    paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]

-    return combined_file_path
+    sentences = [
+        sentence
+        for paragraph in paragraphs
+        for sentence in nltk_sent_tokenize(paragraph)
+        if sentence.strip()
+    ]
+    return sentences
+

-# Function to combine all WAV files into one
+# Function to combine all generated WAV files into a single file
 def combine_wav(source_dir, stamp):
     # Get a list of all WAV files in the folder
     wav_files = [file for file in os.listdir(source_dir) if file.endswith(".wav")]
-
+
     # Sort the files alphabetically to ensure the correct order of combination
     wav_files.sort()

@@ -65,28 +45,47 @@ def combine_wav(source_dir, stamp):
     sr = None
     for file in wav_files:
         file_path = os.path.join(source_dir, file)
-        print(f"Combining {file_path}")
         data, sample_rate = sf.read(file_path)
         if sr is None:
             sr = sample_rate  # Set the sample rate based on the first file
         combined_data.extend(data)
-
+
     # Save the combined audio to a new WAV file
     combined_file_path = f"{stamp}_combined.wav"
-    sf.write(combined_file_path, np.array(combined_data), sr)
-
+    sf.write(combined_file_path, combined_data, sr)
+
     # Clean up temporary files
     shutil.rmtree(source_dir)
+
+    return combined_file_path
+
+# Main function to process Tibetan text and generate audio
+def tts_tibetan(input_text):
+    # Prepare sentences from the input text
+    sentences = prepare_sentences(input_text)
+
+    # Create a unique directory for storing audio chunks
+    current_datetime = datetime.datetime.now()
+    timestamp = current_datetime.strftime("%Y%m%d%H%M%S%f")
+    user_dir = f"u_{timestamp}"
+    os.makedirs(user_dir, exist_ok=True)
+
+    # Generate audio for each sentence
+    for i, sentence in enumerate(sentences):
+        tts.synthesis(sentence, wav_path=f"{user_dir}/s_{str(i).zfill(10)}.wav")
+
+    # Combine the generated audio into one file
+    combined_file_path = combine_wav(user_dir, timestamp)

     return combined_file_path

 # Create the Gradio interface
 iface = gr.Interface(
-    fn=generate_audio,
+    fn=tts_tibetan,
     inputs="text",
     outputs="audio",  # Output should be the combined audio file
     title="Tibetan TTS Model",
-    description="Enter text to generate speech using a fine-tuned Tibetan voice model. The text will be split into sentences, and the generated audio will be combined and returned."
+    description=this_description
 )

 # Launch the Gradio interface
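
Context on the removed path: the old writer cast the transformers pipeline's floating-point waveform (samples in [-1.0, 1.0]) directly to int16, which truncates nearly every sample to -1, 0, or 1 and can yield silent or heavily distorted WAV files. A conversion like that needs a scale factor. A minimal sketch, independent of this commit; write_int16_wav is a hypothetical helper:

import numpy as np
import scipy.io.wavfile

def write_int16_wav(path, audio, sample_rate):
    # Scale float samples in [-1.0, 1.0] to the int16 range before casting;
    # a bare astype(np.int16) would truncate them all to -1, 0, or 1.
    pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    scipy.io.wavfile.write(path, rate=sample_rate, data=pcm)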
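The new path can also be exercised outside Gradio. A minimal sketch using only calls that appear in the diff (download, TTS, tts.synthesis), assuming ttsmms and soundfile are installed; the Tibetan sample string is just a placeholder:

import soundfile as sf
from ttsmms import download, TTS

# Fetch and load the MMS Tibetan ("bod") checkpoint, as app.py does at startup
model_path = download("bod", "./data")
tts = TTS(model_path)

# Synthesize one sentence straight to disk, mirroring the per-chunk call in tts_tibetan
tts.synthesis("བཀྲ་ཤིས་བདེ་ལེགས།", wav_path="chunk_000.wav")

data, sr = sf.read("chunk_000.wav")
print(f"Wrote {len(data) / sr:.2f}s of audio at {sr} Hz")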
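Note that combine_wav trusts every chunk to share the first file's sample rate, which holds here because all chunks come from one model. If chunks could ever mix sources, an explicit check is cheap. A sketch only, not part of the commit; combine_wav_checked is a hypothetical variant:

import os
import soundfile as sf

def combine_wav_checked(source_dir, stamp):
    # Same contract as combine_wav, but fail fast on mismatched sample rates
    # instead of silently concatenating audio recorded at different rates.
    wav_files = sorted(f for f in os.listdir(source_dir) if f.endswith(".wav"))
    combined_data, sr = [], None
    for name in wav_files:
        data, sample_rate = sf.read(os.path.join(source_dir, name))
        if sr is None:
            sr = sample_rate
        elif sample_rate != sr:
            raise ValueError(f"{name}: expected {sr} Hz, got {sample_rate} Hz")
        combined_data.extend(data)
    combined_file_path = f"{stamp}_combined.wav"
    sf.write(combined_file_path, combined_data, sr)
    return combined_file_path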