"""Gradio demo: SpeechT5 text-to-speech with a pronunciation dictionary.

Technical terms found in the input are replaced by the phonetic spellings
listed in ``pronunciation_dict.json`` before the text is synthesized.
"""

import functools
import io
import json
import re

import gradio as gr
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Sample rate (Hz) reported to Gradio for the generated waveform.
SAMPLE_RATE = 16000
# Row of the x-vector dataset used as the speaker embedding.
SPEAKER_INDEX = 7306

# Load models and pronunciation dictionary
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# JSON is UTF-8 by specification; do not depend on the platform default.
with open("pronunciation_dict.json", "r", encoding="utf-8") as f:
    pronunciation_dict = json.load(f)


# Preprocess the text
def preprocess_text(text):
    """Replace every dictionary term in *text* with its phonetic spelling.

    All terms are substituted in a single pass, longest term first, so that
    a short term cannot break up a longer one that contains it and text
    produced by one substitution is never substituted again.

    Args:
        text: Raw input text.

    Returns:
        The text with each occurrence of a ``pronunciation_dict`` key
        replaced by the corresponding value.
    """
    # Empty keys are skipped: they would match at every position.
    terms = sorted((term for term in pronunciation_dict if term), key=len, reverse=True)
    if not terms:
        return text
    pattern = re.compile("|".join(re.escape(term) for term in terms))
    return pattern.sub(lambda match: pronunciation_dict[match.group(0)], text)


@functools.lru_cache(maxsize=1)
def _load_speaker_embeddings():
    """Return the speaker x-vector as a tensor with a leading batch axis.

    Cached so the embeddings dataset is loaded once, on first use, instead
    of on every synthesis request.
    """
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    return torch.tensor(embeddings_dataset[SPEAKER_INDEX]["xvector"]).unsqueeze(0)


# Text-to-Speech function
def text_to_speech(input_text):
    """Synthesize speech for *input_text*.

    Args:
        input_text: Text entered in the Gradio textbox.

    Returns:
        A ``(sample_rate, waveform)`` tuple for the Gradio audio output, or
        ``None`` (no audio) when the input is empty or only whitespace.
    """
    if not input_text or not input_text.strip():
        return None

    processed_text = preprocess_text(input_text)
    inputs = processor(text=processed_text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"],
        _load_speaker_embeddings(),
        vocoder=vocoder,
    )

    # The Gradio audio output takes (sample_rate, ndarray); an io.BytesIO
    # object is not one of the value types it accepts.
    return SAMPLE_RATE, speech.numpy()


# Define examples
examples = [
    "We are using API for authentication.",
    "CUDA and TensorFlow work together for deep learning models.",
    "The database uses NoSQL and supports JSON for data storage.",
    "Machine learning and artificial intelligence are advancing fast.",
    "Natural language processing techniques are widely adopted.",
]

# Create Gradio interface
iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Fine-tuning TTS for English with a Focus on Technical Vocabulary Using SpeechT5",
    description="Enter text with technical jargon for TTS conversion.",
    examples=examples,
    cache_examples=False,  # Disable caching for now
)

# Launch interface
iface.launch()