"""Gradio demo: SpeechT5 text-to-speech with a pronunciation dictionary.

Technical terms found in ``pronunciation_dict.json`` are replaced by their
phonetic spelling before the text is synthesized with SpeechT5 and the
HiFi-GAN vocoder.
"""

# Step 1: Install Gradio
# pip install gradio

# Step 2: Import necessary libraries
import io  # noqa: F401  (kept: other parts of the project may rely on it)
import json
import re

import gradio as gr
import soundfile as sf  # noqa: F401  (kept: other parts of the project may rely on it)
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Sample rate used for the generated audio (same value the script has always
# used when exporting the waveform).
SAMPLE_RATE = 16000

# Row of the x-vector dataset used as the speaker voice.
SPEAKER_INDEX = 7306

# Step 3: Load the models and the pronunciation dictionary
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load the speaker embedding once at start-up instead of on every request;
# it never changes between calls.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(
    embeddings_dataset[SPEAKER_INDEX]["xvector"]
).unsqueeze(0)

# Load pronunciation dictionary from JSON file. The encoding is explicit so
# that non-ASCII phonetic spellings load identically on every platform.
with open("pronunciation_dict.json", "r", encoding="utf-8") as f:
    pronunciation_dict = json.load(f)


def _build_term_pattern(terms):
    """Compile one regex matching any non-empty term, longest terms first.

    Longest-first ordering makes "NoSQL" win over "SQL" when both are present.
    Returns ``None`` when there is nothing to match.
    """
    ordered = sorted((term for term in terms if term), key=len, reverse=True)
    if not ordered:
        return None
    return re.compile("|".join(re.escape(term) for term in ordered))


_TERM_PATTERN = _build_term_pattern(pronunciation_dict)


def preprocess_text(text):
    """Replace every dictionary term in *text* with its phonetic spelling.

    All terms are substituted in a single pass, so the result does not depend
    on dictionary order and an inserted phonetic spelling is never rewritten
    by another entry. Matching is case-sensitive and also applies inside
    longer words, as before.

    Args:
        text: Raw input text.

    Returns:
        The text with all known terms replaced.
    """
    if _TERM_PATTERN is None:
        return text
    return _TERM_PATTERN.sub(lambda match: pronunciation_dict[match.group(0)], text)


# Step 4: Define the TTS function
def text_to_speech(input_text):
    """Synthesize speech for *input_text*.

    Args:
        input_text: Text entered by the user.

    Returns:
        A ``(sample_rate, samples)`` tuple, where ``samples`` is a 1-D int16
        NumPy array, which is a format the Gradio audio output accepts.
        ``None`` is returned for empty or whitespace-only input, so the
        audio component is simply left empty.
    """
    if not input_text or not input_text.strip():
        return None

    # Preprocess the text
    processed_text = preprocess_text(input_text)

    # Convert the processed text to model inputs
    inputs = processor(text=processed_text, return_tensors="pt")

    # Generate speech using the model and vocoder
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )

    # Gradio cannot consume a BytesIO object; hand it 16-bit PCM samples.
    # NOTE(review): assumes the vocoder emits floats in roughly [-1, 1];
    # the clamp guards against overflow if it occasionally exceeds that.
    pcm = (speech * 32767).clamp(-32768, 32767).to(torch.int16)
    return SAMPLE_RATE, pcm.cpu().numpy()


# Step 5: Create Gradio interface with examples
examples = [
    "We are using API for authentication.",
    "CUDA and TensorFlow work together for deep learning models.",
    "The database uses NoSQL and supports JSON for data storage.",
    "Machine learning and artificial intelligence are advancing fast.",
    "Natural language processing techniques are widely adopted.",
]

iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Fine-tuning TTS for English with a Focus on Technical Vocabulary Using SpeechT5",
    description="Enter text with technical jargon for TTS conversion.",
    examples=examples,  # Adding preset examples for users
)

# Step 6: Launch the app
iface.launch(share=True)