Solo448 committed on
Commit
ad8f297
·
verified ·
1 Parent(s): 56b7ca6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -0
app.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import python_multipart
4
+ import os
5
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
+ from datasets import load_dataset, Audio
7
+ import numpy as np
8
+ from speechbrain.inference import EncoderClassifier
9
+
10
# Hub repositories used by the app: the stock SpeechT5 processor and HiFi-GAN
# vocoder, the fine-tuned acoustic model, and the speaker encoder.
_PROCESSOR_REPO = "microsoft/speecht5_tts"
_TTS_MODEL_REPO = "Solo448/SpeechT5-tuned-bn"
_VOCODER_REPO = "microsoft/speecht5_hifigan"
_SPEAKER_ENCODER_REPO = "speechbrain/spkrec-xvect-voxceleb"

# Text front-end, text-to-spectrogram model and spectrogram-to-waveform vocoder.
processor = SpeechT5Processor.from_pretrained(_PROCESSOR_REPO)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_MODEL_REPO)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_REPO)

# The speaker encoder runs on the GPU when torch reports one, else on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Downloaded encoder files are stored under /tmp.
speaker_model = EncoderClassifier.from_hparams(
    source=_SPEAKER_ENCODER_REPO,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", _SPEAKER_ENCODER_REPO),
)
22
+
23
def create_speaker_embedding(waveform):
    """Compute a normalised speaker embedding for one utterance.

    This function is defined before the dataset-loading code below because
    that code calls it while the module is still being executed.

    Args:
        waveform: Audio samples in any form accepted by ``torch.tensor``
            (the caller below passes the decoded ``audio['array']`` of a
            dataset row resampled to 16 kHz).

    Returns:
        A NumPy array with the output of ``speaker_model.encode_batch``,
        L2-normalised along dim 2 and with singleton dimensions squeezed out.
    """
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


# Load a sample from the dataset for speaker embedding
try:
    # NOTE(review): trust_remote_code=True runs code shipped with the dataset
    # repository; keep it only while that repository is trusted.
    dataset = load_dataset("ucalyptus/train-bn", split="train", trust_remote_code=True)
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    sample = dataset[0]
    # create_speaker_embedding returns a squeezed NumPy array; turn it back
    # into a torch tensor with a leading batch dimension so it has the same
    # type and layout as the (1, 512) fallback below.
    speaker_embedding = torch.tensor(
        create_speaker_embedding(sample['audio']['array'])
    ).unsqueeze(0)
except Exception as e:
    # Deliberate best-effort: the app still starts when the dataset or the
    # speaker encoder is unavailable.
    print(f"Error loading dataset: {e}")
    # Use a random speaker embedding as fallback
    speaker_embedding = torch.randn(1, 512)
40
+
41
# Ordered (Bengali, Latin) substitution pairs. They are applied strictly from
# top to bottom, and some sources appear more than once, so neither the order
# nor the entries should be changed without re-checking the model's output.
_BENGALI_TO_LATIN = (
    ("অ", "a"),
    ("আ", "aa"),
    ("ই", "i"),
    ("ঈ", "ee"),
    ("উ", "u"),
    ("ঊ", "oo"),
    ("ঋ", "ri"),
    ("এ", "e"),
    ("ঐ", "oi"),
    ("ও", "o"),
    ("ঔ", "ou"),
    ("ক", "k"),
    ("খ", "kh"),
    ("গ", "g"),
    ("ঘ", "gh"),
    ("ঙ", "ng"),
    ("চ", "ch"),
    ("ছ", "chh"),
    ("জ", "j"),
    ("ঝ", "jh"),
    ("ঞ", "nj"),
    ("ট", "t"),
    ("ঠ", "th"),
    ("ড", "d"),
    ("ঢ", "dh"),
    ("ণ", "nr"),
    ("ত", "t"),
    ("থ", "th"),
    ("দ", "d"),
    ("ধ", "dh"),
    ("ন", "n"),
    ("প", "p"),
    ("ফ", "ph"),
    ("ব", "b"),
    ("ভ", "bh"),
    ("ম", "m"),
    ("য", "ya"),
    ("র", "r"),
    ("ল", "l"),
    ("শ", "sha"),
    ("ষ", "sh"),
    ("স", "s"),
    ("হ", "ha"),
    ("ড়", "rh"),
    ("ঢ়", "rh"),
    ("য়", "y"),
    ("ৎ", "t"),
    ("ঃ", "h"),
    ("ঁ", "n"),
    ("়", ""),
    ("া", "a"),
    ("ি", "i"),
    ("ী", "ii"),
    ("ু", "u"),
    ("ূ", "uu"),
    ("ৃ", "r"),
    ("ে", "e"),
    ("ৈ", "oi"),
    ("ো", "o"),
    ("ৌ", "ou"),
    ("্", ""),
    ("ৎ", "t"),
    ("ৗ", "ou"),
    ("ড়", "r"),
    ("ঢ়", "r"),
    ("য়", "y"),
    ("ৰ", "r"),
    ("৵", "lee"),
    ("ং", "ng"),
    ("১", "1"),
    ("২", "2"),
    ("৩", "3"),
    ("৪", "4"),
    ("৫", "5"),
    ("৬", "6"),
    ("৭", "7"),
    ("৮", "8"),
    ("৯", "9"),
    ("০", "0"),
)


def _romanize(text):
    """Apply every pair of ``_BENGALI_TO_LATIN`` to *text*, in table order."""
    for bengali, latin in _BENGALI_TO_LATIN:
        text = text.replace(bengali, latin)
    return text


def text_to_speech(text):
    """Synthesise speech for *text* and return it for the audio output.

    The text is first rewritten with the Bengali-to-Latin substitution table,
    then tokenised by ``processor`` and passed to ``model.generate_speech``
    together with the module-level ``speaker_embedding`` and ``vocoder``.

    Args:
        text: The string typed into the interface's text input.

    Returns:
        A ``(16000, samples)`` tuple, where ``samples`` is the generated
        waveform converted to a NumPy array.
    """
    latin_text = _romanize(text)
    encoded = processor(text=latin_text, return_tensors="pt")
    waveform = model.generate_speech(
        encoded["input_ids"],
        speaker_embedding,
        vocoder=vocoder,
    )
    return (16000, waveform.numpy())
130
+
131
# Web UI: a single text input wired to text_to_speech, with an audio output.
_interface_options = {
    "fn": text_to_speech,
    "inputs": "text",
    "outputs": "audio",
    "title": "Bengali Text-to-Speech",
    "description": "Enter bengali text to convert to speech",
}
iface = gr.Interface(**_interface_options)

# Start serving; share=True additionally requests a public share link.
iface.launch(share=True)