File size: 3,097 Bytes
08a2633
 
 
faabe22
 
08a2633
 
 
 
faabe22
08a2633
 
 
 
47c1a8b
a7d44b7
08a2633
 
faabe22
08a2633
faabe22
 
08a2633
faabe22
 
08a2633
 
8fa00e8
 
 
 
 
faabe22
08a2633
faabe22
08a2633
8fa00e8
 
faabe22
 
47c1a8b
faabe22
08a2633
 
faabe22
 
47c1a8b
faabe22
 
 
 
 
 
47c1a8b
faabe22
 
 
 
47c1a8b
faabe22
47c1a8b
 
 
7b03b11
faabe22
 
 
 
 
7b03b11
 
8fa00e8
 
 
7b03b11
faabe22
47c1a8b
faabe22
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gradio as gr
import json
import torch
import numpy as np
import re
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf

# Step 1: Load the models and the pronunciation dictionary
# Pre-trained SpeechT5 pipeline from the Hugging Face Hub (downloaded on first
# run): the processor prepares text inputs, the model generates speech
# features, and the HiFi-GAN vocoder converts them to an audio waveform.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load pronunciation dictionary from JSON file
# NOTE(review): assumed to be a flat {term: phonetic_spelling} mapping of
# strings, expected in the working directory — confirm against the JSON file.
with open("pronunciation_dict.json", "r") as f:
    pronunciation_dict = json.load(f)

# Function to preprocess and apply pronunciation dictionary
def preprocess_text(text, mapping=None):
    """Uppercase *text* and replace known technical terms with phonetic spellings.

    Parameters:
        text: The raw input string.
        mapping: Optional dict of ``term -> phonetic spelling``. Defaults to
            the module-level ``pronunciation_dict`` loaded from JSON, so
            existing callers are unaffected; passing it explicitly makes the
            function reusable and testable in isolation.

    Returns:
        The uppercased string with every dictionary term substituted by its
        phonetic equivalent.
    """
    if mapping is None:
        mapping = pronunciation_dict
    # Uppercase the whole text so dictionary terms match case-insensitively.
    text = text.upper()
    for term, phonetic in mapping.items():
        # Terms are uppercased too, since the text is now all-uppercase.
        text = text.replace(term.upper(), phonetic)
    return text

# Explicitly replace "API" with "A P I" to improve pronunciation
def custom_acronym_pronunciation(text):
    """Spell out the acronym "API" phonetically so TTS reads it letter by letter."""
    return text.replace("API", "ay p eei")

# Step 2: Define the TTS function with sentence segmentation

# Lazily-initialized cache for the speaker x-vector, so the embeddings
# dataset is downloaded/loaded once instead of on every request.
_speaker_embeddings = None


def _get_speaker_embeddings():
    """Return the cached speaker x-vector tensor, loading the dataset on first use."""
    global _speaker_embeddings
    if _speaker_embeddings is None:
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Index 7306 selects a fixed speaker for a consistent voice across calls.
        _speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    return _speaker_embeddings


def text_to_speech(input_text):
    """Synthesize speech for *input_text* and return the path to a 16 kHz WAV file.

    The text is run through the pronunciation dictionary and acronym handler,
    split into sentence-sized segments (SpeechT5 handles short inputs better),
    synthesized segment by segment, and concatenated into one waveform.

    Raises:
        ValueError: if the text yields no non-empty segments (previously this
            surfaced as a cryptic ``np.concatenate`` error on an empty list).
    """
    # Preprocess: dictionary substitutions, then explicit acronym handling.
    processed_text = preprocess_text(input_text)
    processed_text = custom_acronym_pronunciation(processed_text)
    # Split on sentence-ending punctuation followed by spaces.
    segments = re.split(r'(?<=[.!?]) +', processed_text)

    # Cached speaker embeddings keep the voice consistent and avoid
    # re-downloading the dataset on every call.
    speaker_embeddings = _get_speaker_embeddings()

    audio_outputs = []

    # Generate speech for each non-empty text segment
    for segment in segments:
        if segment.strip():
            inputs = processor(text=segment, return_tensors="pt")
            speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
            audio_outputs.append(speech.numpy())

    if not audio_outputs:
        raise ValueError("Input text contained no speakable segments.")

    # Concatenate audio from all segments into one waveform
    complete_speech = np.concatenate(audio_outputs)

    # Save the concatenated speech as a .wav file (SpeechT5 outputs 16 kHz audio)
    output_file = "speech_output.wav"
    sf.write(output_file, complete_speech, samplerate=16000)

    return output_file

# Step 3: Create Gradio interface with examples
# Sample prompts shown under the input box, each featuring technical jargon.
example_inputs = [
    ["The API allows integration with OAuth and REST for scalable web services."],
    ["Using CUDA for deep learning optimizes the model training on GPUs."],
    ["In TTS models, the vocoder is essential for natural-sounding speech."],
]

iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Fine-tuning TTS for Technical Vocabulary",
    description="Enter text with technical jargon for TTS conversion. The model will handle abbreviations and technical terms for better pronunciation.",
    examples=example_inputs,
)

# Step 4: Launch the app
# share=True requests a temporary public URL in addition to the local server.
iface.launch(share=True)