saidivyesh committed on
Commit
0ee4416
·
verified ·
1 Parent(s): 8389aed

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -0
app.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import soundfile as sf
4
+ import spaces
5
+ import os
6
+ import numpy as np
7
+ import re
8
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
+ from speechbrain.pretrained import EncoderClassifier
10
+ from datasets import load_dataset
11
+
12
# Pick the torch device once at import time: CUDA when torch reports it as
# available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
13
+
14
def load_models_and_data(language="en"):
    """Load the TTS model stack and one dataset example for a language.

    Args:
        language: ``"en"`` selects the English checkpoint and dataset; any
            other value selects the regional checkpoint and dataset.

    Returns:
        A tuple ``(model, processor, vocoder, speaker_model, example)`` where
        ``example`` is the first item of the selected dataset's train split.
    """
    is_english = language == "en"

    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

    # Replace with English technical TTS model or regional language-specific model
    # NOTE(review): both checkpoint names look like placeholders — confirm
    # they resolve to real models before deploying.
    tts_checkpoint = (
        "my_finetuned_english_tech_tts"
        if is_english
        else "my_finetuned_regional_language_tts"
    )
    model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint).to(device)

    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

    # The speaker encoder is cached under /tmp, keyed by its source name.
    xvector_source = "speechbrain/spkrec-xvect-voxceleb"
    speaker_model = EncoderClassifier.from_hparams(
        source=xvector_source,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", xvector_source),
    )

    # Only the first training example is kept; it supplies the default voice.
    dataset_name = "lj_speech" if is_english else "regional_language_dataset"
    first_example = load_dataset(dataset_name, split="train")[0]

    return model, processor, vocoder, speaker_model, first_example
42
+
43
# Everything is loaded once at import time. The language is fixed to English
# here; change the argument to start the app with the regional stack instead.
(
    model,
    processor,
    vocoder,
    speaker_model,
    default_example,
) = load_models_and_data(language="en")
45
+
46
def create_speaker_embedding(waveform):
    """Encode a waveform into a normalized speaker embedding.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``.

    Returns:
        The output of ``speaker_model.encode_batch`` for the waveform,
        normalized along dimension 2 and with singleton dimensions squeezed
        out.
    """
    # A leading dimension is added before encoding, then moved to the device.
    batch = torch.tensor(waveform).unsqueeze(0).to(device)

    with torch.no_grad():
        embedding = speaker_model.encode_batch(batch)
        embedding = torch.nn.functional.normalize(embedding, dim=2)

    return embedding.squeeze()
52
+
53
def prepare_default_embedding(example):
    """Build a speaker embedding from a dataset example.

    Args:
        example: Mapping with an ``"audio"`` entry whose ``"array"`` field
            holds the samples to encode.

    Returns:
        Whatever ``create_speaker_embedding`` returns for those samples.
    """
    samples = example["audio"]["array"]
    return create_speaker_embedding(samples)


# Computed once at import time; text_to_speech uses it as the voice for
# every request.
default_embedding = prepare_default_embedding(default_example)
58
+
59
# Text normalization updates for English technical speech.
# Each pair is (term as written, spelled-out form fed to the TTS model).
# Keys are matched case-sensitively as whole tokens on the ORIGINAL text.
technical_replacements = [
    # Common technical replacements (examples)
    ("HTTP", "H T T P"),
    ("AI", "A I"),
    # Add more technical abbreviations as needed
]


def normalize_text(text, language="en"):
    """Normalize raw input text before it is tokenized for synthesis.

    Steps, in order:
      1. For ``language == "en"``, expand the entries of
         ``technical_replacements`` (e.g. ``"HTTP"`` -> ``"H T T P"``).
      2. Lowercase the text.
      3. For any other language, apply the regional character mappings
         (currently an empty table).
      4. Drop every character that is neither a word character nor
         whitespace.

    Args:
        text: The text to normalize. ``None`` or an empty string yields ``""``.
        language: ``"en"`` for English technical text; any other value is
            treated as a regional language.

    Returns:
        The normalized, lowercased text without punctuation.
    """
    if not text:
        return ""

    if language == "en":
        # Expand acronyms BEFORE lowercasing: the table keys are uppercase, so
        # searching the lowercased text (as was done previously) could never
        # match. Lookarounds restrict matches to whole tokens so that "AI"
        # inside "MAIL" or "HTTP" inside "HTTPS" is left alone.
        for old, new in technical_replacements:
            pattern = r"(?<!\w)" + re.escape(old) + r"(?!\w)"
            # A callable replacement keeps backslashes in `new` literal.
            text = re.sub(pattern, lambda _match, new=new: new, text)

    text = text.lower()

    # For regional language, include character replacements like the Turkish example
    if language != "en":
        replacements = [
            # Character mappings for regional languages (like the Turkish example)
            # Add region/language-specific character normalization here
        ]
        for old, new in replacements:
            text = text.replace(old, new)

    # Remove punctuation or handle them contextually for technical speech
    text = re.sub(r'[^\w\s]', '', text)

    return text
89
+
90
@spaces.GPU(duration=60)
def text_to_speech(text, audio_file=None, language="en"):
    """Synthesize speech for ``text`` with the module-level model and voice.

    Args:
        text: Raw input text; it is passed through ``normalize_text`` before
            being tokenized.
        audio_file: Accepted but not used by this function.
        language: Forwarded to ``normalize_text``.

    Returns:
        A ``(16000, waveform)`` tuple, where ``waveform`` is the generated
        speech as a NumPy array and 16000 is the rate handed to the audio
        output.
    """
    # Normalize, tokenize, and move the encoded input to the target device.
    encoded = processor(
        text=normalize_text(text, language=language),
        return_tensors="pt",
    ).to(device)

    # The default speaker embedding is always used; a leading dimension is
    # added before it is handed to the model.
    voice = default_embedding.unsqueeze(0)

    with torch.no_grad():
        waveform = model.generate_speech(encoded["input_ids"], voice, vocoder=vocoder)

    return (16000, waveform.cpu().numpy())
108
+
109
# Dropdown labels shown in the UI, mapped to the language codes that
# text_to_speech / normalize_text understand ("en" vs. anything else).
_LANGUAGE_CODES = {"English Technical": "en", "Regional": "regional"}


def _synthesize_from_ui(text, language_label):
    """Adapt the two UI inputs to ``text_to_speech``'s signature.

    Gradio passes input values positionally. Wiring ``text_to_speech``
    directly would put the dropdown label into its second parameter
    (``audio_file``) and leave ``language`` at its default, so the language
    is translated here and passed by keyword instead.

    Args:
        text: Contents of the text box.
        language_label: Selected dropdown label; unknown labels fall back to
            English.

    Returns:
        The ``(sample_rate, waveform)`` tuple produced by ``text_to_speech``.
    """
    language = _LANGUAGE_CODES.get(language_label, "en")
    return text_to_speech(text, language=language)


iface = gr.Interface(
    fn=_synthesize_from_ui,
    inputs=[
        gr.Textbox(label="Enter text to convert to speech"),
        gr.Dropdown(
            label="Language",
            choices=["English Technical", "Regional"],
            value="English Technical",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title="Fine-Tuned TTS for Technical English and Regional Languages",
    description="Enter text, choose the language, and listen to the generated speech.",
)

iface.launch(share=True)