import gradio as gr
import json
import torch
import numpy as np
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf

# Load the SpeechT5 processor, acoustic model, and HiFi-GAN vocoder
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load the custom pronunciation dictionary (maps technical terms to phonetic spellings)
with open("pronunciation_dict.json", "r") as f:
    pronunciation_dict = json.load(f)
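
# Illustrative example of the expected pronunciation_dict.json contents
# (assumed format; the actual entries live in the repository's JSON file):
# {
#     "CUDA": "koo duh",
#     "OAuth": "oh auth"
# }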


def preprocess_text(text):
    """Replace known technical terms with their phonetic spellings from the dictionary."""
    # Upper-case the input so dictionary matching is case-insensitive
    text = text.upper()
    for term, phonetic in pronunciation_dict.items():
        text = text.replace(term.upper(), phonetic)
    return text


def custom_acronym_pronunciation(text):
    # Spell out "API" so the model pronounces it letter by letter
    text = text.replace("API", "ay p eei")
    return text


def text_to_speech(input_text):
    # Apply the pronunciation dictionary and acronym handling
    processed_text = preprocess_text(input_text)
    processed_text = custom_acronym_pronunciation(processed_text)

    # Split the text into sentences so each segment stays a manageable length
    segments = re.split(r'(?<=[.!?]) +', processed_text)

    # Load a speaker x-vector to condition the voice of the generated speech
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    # Generate audio for each sentence and collect the waveforms
    audio_outputs = []
    for segment in segments:
        if segment.strip():
            inputs = processor(text=segment, return_tensors="pt")
            speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
            audio_outputs.append(speech.numpy())

    # Concatenate the per-sentence audio and write it out as a 16 kHz WAV file
    complete_speech = np.concatenate(audio_outputs)
    output_file = "speech_output.wav"
    sf.write(output_file, complete_speech, samplerate=16000)

    return output_file


# Build the Gradio interface
iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Fine-tuning TTS for Technical Vocabulary",
    description="""
Enter text containing technical terms or abbreviations for text-to-speech conversion. The model has been fine-tuned with a dataset specifically prepared to handle technical vocabulary and acronyms. This includes a pronunciation dictionary for terms such as API, CUDA, and OAuth. Sentence segmentation and custom pronunciation handling further optimize the output for natural, intelligible speech.

Note: Processing time may vary based on sentence length. Longer sentences may take additional time to generate speech. Additionally, the model’s performance improves as more technical terms are added to the pronunciation dictionary, enhancing accuracy for specialized vocabulary.

GitHub Repository: [Text-to-Speech Model for English Technical Speech](https://github.com/Vinay152003/Text-to-Speech_Model_for_English_Technical_Speech-Using-SpeechT5)

Report: [Project Report](https://drive.google.com/file/d/1CfnpeUi18R7De1uhilYuhMYLS_xXjh2Q/view)
""",
    examples=[
        ["What is GPU?"],
        ["What are continuous integration systems, and what is their role in the automated-build process?"],
        ["Using CUDA for deep learning optimizes the model training on GPU."],
        ["In TTS models, the vocoder is essential for natural-sounding speech."],
        ["TensorFlow provides comprehensive tools for deep learning."],
        ["The API allows integration with OAuth and REST for scalable web services."]
    ]
)

# Launch the app with a public share link
iface.launch(share=True)