Spaces:

Vinay15
/

Text-to-Speech_Model_for_English_Technical_Speech

Sleeping

App Files Files Community

Vinay15 committed on Oct 24, 2024

Commit

dedcf07

verified ·

1 Parent(s): 64adb3b

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -22

app.py CHANGED Viewed

@@ -1,7 +1,3 @@
-# Step 1: Install Gradio
-#pip install gradio
-# Step 2: Import necessary libraries
 import gradio as gr
 import json
 import torch
@@ -10,44 +6,38 @@ from datasets import load_dataset
 import soundfile as sf
 import io
-# Step 3: Load the models and the pronunciation dictionary
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-# Load pronunciation dictionary from JSON file
 with open("pronunciation_dict.json", "r") as f:
     pronunciation_dict = json.load(f)
-# Function to preprocess the input text
 def preprocess_text(text):
     for term, phonetic in pronunciation_dict.items():
         text = text.replace(term, phonetic)
     return text
-# Step 4: Define the TTS function
 def text_to_speech(input_text):
-    # Preprocess the text
     processed_text = preprocess_text(input_text)
-    # Convert the processed text to model inputs
     inputs = processor(text=processed_text, return_tensors="pt")
-    # Load xvector embeddings from dataset for speaker voice characteristics
     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
     speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-    # Generate speech using the model and vocoder
     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
-    # Convert generated speech to an in-memory buffer
     audio_buffer = io.BytesIO()
     sf.write(audio_buffer, speech.numpy(), samplerate=16000, format='WAV')
     audio_buffer.seek(0)
     return audio_buffer
-# Step 5: Create Gradio interface with examples
 examples = [
     "We are using API for authentication.",
     "CUDA and TensorFlow work together for deep learning models.",
@@ -56,14 +46,16 @@ examples = [
     "Natural language processing techniques are widely adopted."
 ]
 iface = gr.Interface(
     fn=text_to_speech,
     inputs="text",
     outputs="audio",
     title="Fine-tuning TTS for English with a Focus on Technical Vocabulary Using SpeechT5",
     description="Enter text with technical jargon for TTS conversion.",
-    examples=examples  # Adding preset examples for users
 )
-# Step 6: Launch the app
-iface.launch(share=True)

 import gradio as gr
 import json
 import torch
 import soundfile as sf
 import io
+# Load models and pronunciation dictionary
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 # Maps a technical term to the phonetic spelling substituted for it before synthesis.
 # The encoding is pinned to UTF-8 so non-ASCII phonetic text decodes the same way
 # regardless of the host's locale default.
 with open("pronunciation_dict.json", "r", encoding="utf-8") as f:
     pronunciation_dict = json.load(f)
+# Preprocess the text
 def preprocess_text(text):
     """Return *text* with every pronunciation-dictionary term replaced by its phonetic spelling.
     Replacements are plain substring substitutions applied one after another in
     dictionary order, so a later entry may also match text inserted by an earlier one.
     """
     rewritten = text
     for term in pronunciation_dict:
         rewritten = rewritten.replace(term, pronunciation_dict[term])
     return rewritten
+# Text-to-Speech function
 _SPEAKER_XVECTOR_INDEX = 7306  # dataset row whose x-vector is used as the speaker voice
 _SAMPLE_RATE = 16000  # same rate the waveform was previously written at
 _speaker_embeddings = None  # filled on the first request, then reused
 def _get_speaker_embeddings():
     """Return the speaker x-vector tensor (with a leading batch dimension), loading it once."""
     global _speaker_embeddings
     if _speaker_embeddings is None:
         # Loading the dataset is slow, so do it on first use instead of on every request.
         embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
         _speaker_embeddings = torch.tensor(
             embeddings_dataset[_SPEAKER_XVECTOR_INDEX]["xvector"]
         ).unsqueeze(0)
     return _speaker_embeddings
 def text_to_speech(input_text):
     """Synthesize speech for *input_text* and return it for a Gradio audio output.
     Args:
         input_text: Text typed by the user; dictionary terms are replaced by their
             phonetic spellings before synthesis.
     Returns:
         A ``(sample_rate, waveform)`` tuple where ``waveform`` is a NumPy array, or
         ``None`` when the input is empty or only whitespace.
     """
     # Nothing to synthesize: return None rather than running the model on empty text.
     if not input_text or not input_text.strip():
         return None
     processed_text = preprocess_text(input_text)
     inputs = processor(text=processed_text, return_tensors="pt")
     speech = model.generate_speech(
         inputs["input_ids"], _get_speaker_embeddings(), vocoder=vocoder
     )
     # Gradio's audio output takes (sample_rate, ndarray), a file path or bytes; the
     # in-memory BytesIO object returned before is not one of the accepted forms.
     return _SAMPLE_RATE, speech.numpy()
+# Define examples
 examples = [
     "We are using API for authentication.",
     "CUDA and TensorFlow work together for deep learning models.",
     "Natural language processing techniques are widely adopted."
 ]
+# Create Gradio interface
 iface = gr.Interface(
     fn=text_to_speech,
     inputs="text",
     outputs="audio",
     title="Fine-tuning TTS for English with a Focus on Technical Vocabulary Using SpeechT5",
     description="Enter text with technical jargon for TTS conversion.",
+    examples=examples,
+    cache_examples=False  # Disable caching for now
 )
+# Launch interface
+iface.launch()