import gradio as gr
import json
import re

import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Step 1: Load the models, the speaker embeddings, and the pronunciation dictionary
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load the speaker embeddings once at startup for a consistent voice,
# rather than re-downloading the dataset on every request
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Load the pronunciation dictionary from its JSON file
with open("pronunciation_dict.json", "r") as f:
    pronunciation_dict = json.load(f)
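# The dictionary is assumed to map technical terms to phonetic spellings.
# A minimal illustrative pronunciation_dict.json (these entries are
# hypothetical examples, not the project's actual file):
#
#   {
#       "CUDA": "koo duh",
#       "OAuth": "oh auth",
#       "GPU": "G P U"
#   }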

# Apply the pronunciation dictionary to the input text
def preprocess_text(text):
    for term, phonetic in pronunciation_dict.items():
        # Match whole terms case-insensitively so that substrings of
        # ordinary words (e.g. "API" inside "rapid") are left untouched
        pattern = r"\b" + re.escape(term) + r"\b"
        text = re.sub(pattern, phonetic, text, flags=re.IGNORECASE)
    return text


# Explicitly spell out "API" to improve its pronunciation
def custom_acronym_pronunciation(text):
    return re.sub(r"\bAPI\b", "ay p eei", text)


# Step 2: Define the TTS function with sentence segmentation
def text_to_speech(input_text):
    # Preprocess the text with the pronunciation dictionary, then apply
    # the custom acronym handling
    processed_text = preprocess_text(input_text)
    processed_text = custom_acronym_pronunciation(processed_text)

    # Split the processed text at sentence-ending punctuation to form
    # shorter segments, which SpeechT5 handles more reliably
    segments = re.split(r"(?<=[.!?]) +", processed_text)

    # Generate speech for each non-empty segment
    audio_outputs = []
    for segment in segments:
        if segment.strip():
            inputs = processor(text=segment, return_tensors="pt")
            speech = model.generate_speech(
                inputs["input_ids"], speaker_embeddings, vocoder=vocoder
            )
            audio_outputs.append(speech.numpy())

    # Guard against empty input before concatenating
    if not audio_outputs:
        return None

    # Concatenate the audio from all segments and save it as a 16 kHz .wav file
    complete_speech = np.concatenate(audio_outputs)
    output_file = "speech_output.wav"
    sf.write(output_file, complete_speech, samplerate=16000)
    return output_file


# Step 3: Create the Gradio interface with sample examples
iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Fine-tuning TTS for Technical Vocabulary",
    description="""
Enter text containing technical terms or abbreviations for text-to-speech conversion. The model has been fine-tuned on a dataset specifically prepared for technical vocabulary and acronyms, including a pronunciation dictionary for terms such as API, CUDA, and OAuth. Sentence segmentation and custom pronunciation handling further optimize the output for natural, intelligible speech.

Note: Processing time varies with input length; longer sentences take more time to generate. The model’s performance also improves as more technical terms are added to the pronunciation dictionary, enhancing accuracy for specialized vocabulary.

GitHub Repository: [Text-to-Speech Model for English Technical Speech](https://github.com/Vinay152003/Text-to-Speech_Model_for_English_Technical_Speech-Using-SpeechT5)

Report: [Project Report](https://drive.google.com/file/d/1CfnpeUi18R7De1uhilYuhMYLS_xXjh2Q/view)
""",
    examples=[
        ["What is GPU?"],
        ["What are continuous integration systems, and what is their role in the automated-build process?"],
        ["Using CUDA for deep learning optimizes the model training on GPU."],
        ["In TTS models, the vocoder is essential for natural-sounding speech."],
        ["TensorFlow provides comprehensive tools for deep learning."],
        ["The API allows integration with OAuth and REST for scalable web services."],
    ],
)

# Step 4: Launch the app with a public share link
iface.launch(share=True)
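# Quick smoke test of the pipeline without the UI (a sketch; if used, call
# it before iface.launch(), which blocks until the server stops):
#
#   print(text_to_speech("The API allows integration with OAuth."))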