Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,62 +1,42 @@
|
|
1 |
import gradio as gr
|
2 |
-
from transformers import pipeline
|
3 |
-
import scipy.io.wavfile
|
4 |
import os
|
|
|
|
|
5 |
import datetime
|
6 |
import shutil
|
7 |
-
|
8 |
-
import
|
9 |
-
import numpy as np # Add numpy to handle audio data
|
10 |
|
11 |
-
|
12 |
-
|
13 |
|
14 |
-
#
|
15 |
-
|
16 |
-
|
17 |
|
18 |
-
#
|
19 |
-
def prepare_sentences(text):
|
20 |
-
|
21 |
|
22 |
-
#
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
# Create a unique directory for storing audio chunks
|
28 |
-
current_datetime = datetime.datetime.now()
|
29 |
-
timestamp = current_datetime.strftime("%Y%m%d%H%M%S%f")
|
30 |
-
user_dir = f"u_{timestamp}"
|
31 |
-
os.makedirs(user_dir, exist_ok=True)
|
32 |
-
|
33 |
-
audio_files = []
|
34 |
-
|
35 |
-
for i, sentence in enumerate(sentences):
|
36 |
-
# Perform TTS inference for each sentence
|
37 |
-
print(f"Processing sentence {i+1}: {sentence}")
|
38 |
-
speech = synthesiser(sentence)
|
39 |
-
|
40 |
-
# Extract the audio data and sampling rate from the pipeline output
|
41 |
-
audio_data = np.array(speech["audio"]) # Ensure the data is a NumPy array
|
42 |
-
sample_rate = speech["sampling_rate"]
|
43 |
-
|
44 |
-
# Save each sentence as a separate audio file
|
45 |
-
wav_path = f"{user_dir}/s_{str(i).zfill(10)}.wav"
|
46 |
-
print(f"Saving audio to {wav_path}")
|
47 |
-
scipy.io.wavfile.write(wav_path, rate=sample_rate, data=audio_data.astype(np.int16)) # Ensure 16-bit format for WAV
|
48 |
-
audio_files.append(wav_path)
|
49 |
-
|
50 |
-
# Combine all audio files into one file
|
51 |
-
combined_file_path = combine_wav(user_dir, timestamp)
|
52 |
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
-
# Function to combine all WAV files into
|
56 |
def combine_wav(source_dir, stamp):
|
57 |
# Get a list of all WAV files in the folder
|
58 |
wav_files = [file for file in os.listdir(source_dir) if file.endswith(".wav")]
|
59 |
-
|
60 |
# Sort the files alphabetically to ensure the correct order of combination
|
61 |
wav_files.sort()
|
62 |
|
@@ -65,28 +45,47 @@ def combine_wav(source_dir, stamp):
|
|
65 |
sr = None
|
66 |
for file in wav_files:
|
67 |
file_path = os.path.join(source_dir, file)
|
68 |
-
print(f"Combining {file_path}")
|
69 |
data, sample_rate = sf.read(file_path)
|
70 |
if sr is None:
|
71 |
sr = sample_rate # Set the sample rate based on the first file
|
72 |
combined_data.extend(data)
|
73 |
-
|
74 |
# Save the combined audio to a new WAV file
|
75 |
combined_file_path = f"{stamp}_combined.wav"
|
76 |
-
sf.write(combined_file_path,
|
77 |
-
|
78 |
# Clean up temporary files
|
79 |
shutil.rmtree(source_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
return combined_file_path
|
82 |
|
83 |
# Create the Gradio interface
|
84 |
iface = gr.Interface(
|
85 |
-
fn=
|
86 |
inputs="text",
|
87 |
outputs="audio", # Output should be the combined audio file
|
88 |
title="Tibetan TTS Model",
|
89 |
-
description=
|
90 |
)
|
91 |
|
92 |
# Launch the Gradio interface
|
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
import os
|
3 |
+
import soundfile as sf
|
4 |
+
import uuid
|
5 |
import datetime
|
6 |
import shutil
|
7 |
+
from ttsmms import download
|
8 |
+
from ttsmms import TTS
|
|
|
9 |
|
10 |
+
# Description for the Gradio interface
|
11 |
+
this_description = """Text To Speech for Tibetan - using MMS TTS."""
|
12 |
|
13 |
+
# Download and load the Tibetan TTS model
|
14 |
+
tts_model_path = download("bod", "./data")
|
15 |
+
tts = TTS(tts_model_path)
|
16 |
|
17 |
+
# Function to prepare sentences (here you can use sentence splitting if needed)
|
18 |
+
def prepare_sentences(text, lang="bod"):
    """Split *text* into a flat list of non-empty sentences.

    Fixes a NameError in the original: it called ``nltk_sent_tokenize``,
    which is never imported or defined in this file. A regex splitter is
    used instead, covering Western terminators (., !, ?) and the Tibetan
    shad (U+0F0D), so no extra dependency is needed.

    Parameters
    ----------
    text : str
        Raw user input; may contain multiple newline-separated paragraphs.
    lang : str
        Kept for backward compatibility with existing callers; currently
        unused (the model loaded above is already Tibetan-specific).

    Returns
    -------
    list[str]
        Stripped, non-empty sentences in document order.
    """
    import re  # local import keeps this function self-contained

    # Original author's note: lowercasing reportedly fixes unclear
    # pronunciation of the first word — preserved as-is.
    text = text.lower()

    # Drop blank paragraphs so empty lines never become "sentences".
    paragraphs = [paragraph for paragraph in text.split("\n") if paragraph.strip()]

    # Split after a sentence terminator followed by whitespace; the
    # terminator stays attached to the sentence it ends.
    splitter = re.compile(r"(?<=[.!?\u0F0D])\s+")

    return [
        sentence.strip()
        for paragraph in paragraphs
        for sentence in splitter.split(paragraph)
        if sentence.strip()
    ]
|
33 |
+
|
34 |
|
35 |
+
# Function to combine all generated WAV files into a single file
|
36 |
def combine_wav(source_dir, stamp):
|
37 |
# Get a list of all WAV files in the folder
|
38 |
wav_files = [file for file in os.listdir(source_dir) if file.endswith(".wav")]
|
39 |
+
|
40 |
# Sort the files alphabetically to ensure the correct order of combination
|
41 |
wav_files.sort()
|
42 |
|
|
|
45 |
sr = None
|
46 |
for file in wav_files:
|
47 |
file_path = os.path.join(source_dir, file)
|
|
|
48 |
data, sample_rate = sf.read(file_path)
|
49 |
if sr is None:
|
50 |
sr = sample_rate # Set the sample rate based on the first file
|
51 |
combined_data.extend(data)
|
52 |
+
|
53 |
# Save the combined audio to a new WAV file
|
54 |
combined_file_path = f"{stamp}_combined.wav"
|
55 |
+
sf.write(combined_file_path, combined_data, sr)
|
56 |
+
|
57 |
# Clean up temporary files
|
58 |
shutil.rmtree(source_dir)
|
59 |
+
|
60 |
+
return combined_file_path
|
61 |
+
|
62 |
+
# Main function to process Tibetan text and generate audio
|
63 |
+
def tts_tibetan(input_text):
    """Synthesize Tibetan speech for *input_text*.

    Splits the input into sentences, synthesizes each one to its own WAV
    chunk in a per-request scratch directory, then returns the path of the
    single combined WAV file produced by ``combine_wav``.
    """
    # Each sentence is synthesized independently, then stitched together.
    sentences = prepare_sentences(input_text)

    # A microsecond-resolution timestamp names the scratch directory, so
    # concurrent requests do not collide.
    stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
    work_dir = f"u_{stamp}"
    os.makedirs(work_dir, exist_ok=True)

    # Zero-padded chunk names guarantee alphabetical order matches
    # synthesis order when combine_wav sorts them.
    for index, sentence in enumerate(sentences):
        chunk_path = f"{work_dir}/s_{str(index).zfill(10)}.wav"
        tts.synthesis(sentence, wav_path=chunk_path)

    # combine_wav also removes the scratch directory when it is done.
    return combine_wav(work_dir, stamp)
|
81 |
|
82 |
# Create the Gradio interface
|
83 |
# Wire the pipeline into a Gradio UI: one text box in, one audio player out.
iface = gr.Interface(
    fn=tts_tibetan,  # entry point: Tibetan text -> path of combined WAV
    inputs="text",
    outputs="audio",  # Output should be the combined audio file
    title="Tibetan TTS Model",
    description=this_description  # module-level description string defined above
)
|
90 |
|
91 |
# Launch the Gradio interface
|