# Step 1: Install the dependencies
# pip install gradio torch transformers datasets soundfile sentencepiece

# Step 2: Import necessary libraries
import gradio as gr
import json
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf

# Step 3: Load the models, the speaker embeddings, and the pronunciation dictionary
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load xvector embeddings for speaker voice characteristics once at startup,
# so the dataset is not reloaded on every request
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Load pronunciation dictionary from JSON file
with open("pronunciation_dict.json", "r") as f:
    pronunciation_dict = json.load(f)
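
# pronunciation_dict.json is expected to map technical terms to phonetic spellings.
# An illustrative example (the actual file bundled with the app may differ):
# {
#     "API": "A P I",
#     "CUDA": "koo duh",
#     "NoSQL": "no sequel",
#     "JSON": "jay son"
# }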

# Function to preprocess the input text
def preprocess_text(text):
    for term, phonetic in pronunciation_dict.items():
        text = text.replace(term, phonetic)
    return text
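
# Example (illustrative, assuming the sample dictionary shown above):
#   preprocess_text("CUDA and JSON") -> "koo duh and jay son"
# Note that str.replace performs plain, case-sensitive substring substitution.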

# Step 4: Define the TTS function
def text_to_speech(input_text):
    # Preprocess the text
    processed_text = preprocess_text(input_text)

    # Convert the processed text to model inputs
    inputs = processor(text=processed_text, return_tensors="pt")

    # Generate speech with the model and vocoder, using the preloaded speaker embeddings
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Save the generated speech as a .wav file
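    # SpeechT5 with the HiFi-GAN vocoder produces 16 kHz audio, hence samplerate=16000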
    output_file = "speech_output.wav"
    sf.write(output_file, speech.numpy(), samplerate=16000)

    return output_file

# Step 5: Create Gradio interface with examples
examples = [
    "We are using API for authentication.",
    "CUDA and TensorFlow work together for deep learning models.",
    "The database uses NoSQL and supports JSON for data storage.",
    "Machine learning and artificial intelligence are advancing fast.",
    "Natural language processing techniques are widely adopted."
]

iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Fine-tuning TTS for English with a Focus on Technical Vocabulary Using SpeechT5",
    description="Enter text with technical jargon for TTS conversion.",
    examples=examples  # Adding preset examples for users
)

# Step 6: Launch the app
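# share=True additionally requests a temporary public gradio.live link; remove it to serve locally only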
iface.launch(share=True)