File size: 4,190 Bytes
08a2633 faabe22 08a2633 faabe22 08a2633 47c1a8b a7d44b7 08a2633 faabe22 08a2633 faabe22 08a2633 faabe22 08a2633 8fa00e8 faabe22 08a2633 faabe22 08a2633 8fa00e8 faabe22 47c1a8b faabe22 08a2633 faabe22 47c1a8b faabe22 47c1a8b faabe22 47c1a8b faabe22 47c1a8b bba37d1 faabe22 78e27d5 a07614e bba37d1 a07614e bba37d1 a07614e bba37d1 78e27d5 7b03b11 bba37d1 78e27d5 c406abb 78e27d5 bba37d1 7b03b11 faabe22 47c1a8b faabe22 bba37d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import gradio as gr
import json
import torch
import numpy as np
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf
# Step 1: Load the models and the pronunciation dictionary
# `processor` turns text into model inputs, `model` generates speech from them,
# and `vocoder` is handed to `model.generate_speech` in `text_to_speech`.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Load pronunciation dictionary from JSON file.
# The encoding is pinned to UTF-8 so that non-ASCII phonetic spellings decode
# the same way on every platform instead of depending on the locale default.
with open("pronunciation_dict.json", "r", encoding="utf-8") as f:
    pronunciation_dict = json.load(f)
# Function to preprocess and apply pronunciation dictionary
def preprocess_text(text, dictionary=None):
    """Upper-case *text* and replace dictionary terms with phonetic spellings.

    Args:
        text: The raw input string.
        dictionary: Optional mapping of term -> phonetic spelling. When
            omitted, the module-level ``pronunciation_dict`` is used.

    Returns:
        The upper-cased text in which every whole-word occurrence of a term
        (optionally followed by a plural "S") is replaced by its phonetic
        spelling. Phonetic spellings are inserted exactly as stored.
    """
    if dictionary is None:
        dictionary = pronunciation_dict

    # Convert text to uppercase for uniformity in matching
    text = text.upper()

    # Upper-case the keys once. Empty keys are skipped because replacing ""
    # would inject the phonetic spelling between every character. When two
    # keys collide after upper-casing, the first one wins.
    replacements = {}
    for term, phonetic in dictionary.items():
        key = term.upper()
        if key:
            replacements.setdefault(key, phonetic)
    if not replacements:
        return text

    # Longest terms first so "GPU CLUSTER" is preferred over "GPU".
    alternation = "|".join(
        re.escape(key) for key in sorted(replacements, key=len, reverse=True)
    )
    # A term must not be glued to other ASCII letters (so "API" inside
    # "CAPITAL" is left alone), but a trailing plural "S" is tolerated so
    # "APIS" still gets its stem replaced.
    pattern = re.compile(r"(?<![A-Z])(?:" + alternation + r")(?=S?(?![A-Z]))")

    # One pass over the text: already-inserted phonetic spellings are never
    # rescanned, so one term's output cannot be rewritten by another term.
    # A function replacement avoids backslash interpretation in spellings.
    return pattern.sub(lambda match: replacements[match.group(0)], text)
# Explicitly replace the acronym "API" with the spelling "ay p eei" to improve
# pronunciation. The match is case-sensitive and must not be glued to other
# ASCII letters, so words such as "CAPITAL" or "RAPID" are left untouched; a
# trailing plural "S" ("APIS") is still accepted.
_API_ACRONYM_RE = re.compile(r"(?<![A-Za-z])API(?=S?(?![A-Za-z]))")


def custom_acronym_pronunciation(text):
    """Return *text* with each standalone "API" replaced by "ay p eei".

    Args:
        text: The string to process.

    Returns:
        The string with standalone upper-case "API" occurrences (optionally
        followed by a plural "S", which is kept) respelled phonetically.
    """
    return _API_ACRONYM_RE.sub("ay p eei", text)
# Step 2: Define the TTS function with sentence segmentation
# Filled on first use so the x-vector dataset is loaded once instead of on
# every request.
_speaker_embeddings_cache = None


def _get_speaker_embeddings():
    """Return the fixed speaker x-vector tensor, loading it on first use."""
    global _speaker_embeddings_cache
    if _speaker_embeddings_cache is None:
        embeddings_dataset = load_dataset(
            "Matthijs/cmu-arctic-xvectors", split="validation"
        )
        # unsqueeze(0) adds a leading dimension to the stored x-vector.
        _speaker_embeddings_cache = torch.tensor(
            embeddings_dataset[7306]["xvector"]
        ).unsqueeze(0)
    return _speaker_embeddings_cache


def text_to_speech(input_text):
    """Synthesize speech for *input_text* and write it to a WAV file.

    The text is run through ``preprocess_text`` and
    ``custom_acronym_pronunciation``, split into sentences, synthesized one
    sentence at a time with the same speaker embedding, and the audio is
    concatenated into one 16 kHz file.

    Args:
        input_text: The text to speak.

    Returns:
        The path of the written WAV file ("speech_output.wav"), or ``None``
        when there is nothing to synthesize (empty or whitespace-only input).
    """
    # Nothing to say: bail out instead of failing later in np.concatenate.
    if not input_text or not input_text.strip():
        return None

    # Preprocess and segment text
    processed_text = preprocess_text(input_text)
    # Apply custom acronym handling
    processed_text = custom_acronym_pronunciation(processed_text)
    # Split after sentence-ending punctuation to form shorter segments
    segments = re.split(r'(?<=[.!?]) +', processed_text)

    # Same speaker embedding for every segment keeps the voice consistent
    speaker_embeddings = _get_speaker_embeddings()

    # Generate speech for each non-empty text segment
    audio_outputs = []
    for segment in segments:
        if not segment.strip():
            continue
        inputs = processor(text=segment, return_tensors="pt")
        speech = model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )
        audio_outputs.append(speech.numpy())

    # np.concatenate raises on an empty list, so guard against the case where
    # preprocessing left no speakable segment.
    if not audio_outputs:
        return None

    # Concatenate audio from all segments
    complete_speech = np.concatenate(audio_outputs)
    # Save the concatenated speech as a .wav file
    output_file = "speech_output.wav"
    sf.write(output_file, complete_speech, samplerate=16000)
    return output_file
# Step 3: Create the Gradio interface: one text input, one audio output, and
# a set of clickable sample sentences.
iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Fine-tuning TTS for Technical Vocabulary",
    description="""
Enter text containing technical terms or abbreviations for text-to-speech conversion. The model has been fine-tuned with a dataset specifically prepared to handle technical vocabulary and acronyms. This includes a pronunciation dictionary for terms such as API, CUDA, and OAuth. Sentence segmentation and custom pronunciation handling further optimize the output for natural, intelligible speech.
Note: Processing time may vary based on sentence length. Longer sentences may take additional time to generate speech. Additionally, the model’s performance improves as more technical terms are added to the pronunciation dictionary, enhancing accuracy for specialized vocabulary.
GitHub Repository: [Text-to-Speech Model for English Technical Speech](https://github.com/Vinay152003/Text-to-Speech_Model_for_English_Technical_Speech-Using-SpeechT5)
Report: [Project Report](https://drive.google.com/file/d/1CfnpeUi18R7De1uhilYuhMYLS_xXjh2Q/view)
""",
    examples=[
        ["What is GPU?"],
        ["What are continuous integration systems, and what is their role in the automated-build process?"],
        ["Using CUDA for deep learning optimizes the model training on GPU."],
        ["In TTS models, the vocoder is essential for natural-sounding speech."],
        ["TensorFlow provides comprehensive tools for deep learning."],
        ["The API allows integration with OAuth and REST for scalable web services."]
    ]
)
# Step 4: Launch the app (share=True requests a public share link)
iface.launch(share=True)