Update app.py
Browse files
app.py
CHANGED
@@ -2,13 +2,13 @@ import gradio as gr
|
|
2 |
import torch
|
3 |
import os
|
4 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
5 |
-
from datasets import load_dataset,Audio
|
6 |
import numpy as np
|
7 |
from speechbrain.inference import EncoderClassifier
|
8 |
|
9 |
# Load models and processor
|
10 |
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
11 |
-
model = SpeechT5ForTextToSpeech.from_pretrained("
|
12 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
13 |
|
14 |
# Load speaker encoder
|
@@ -20,9 +20,15 @@ speaker_model = EncoderClassifier.from_hparams(
|
|
20 |
)
|
21 |
|
22 |
# Load a sample from the dataset for speaker embedding
|
23 |
-
|
24 |
-
dataset =
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
def create_speaker_embedding(waveform):
|
28 |
with torch.no_grad():
|
@@ -31,9 +37,6 @@ def create_speaker_embedding(waveform):
|
|
31 |
speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
|
32 |
return speaker_embeddings
|
33 |
|
34 |
-
# Create a speaker embedding from the sample
|
35 |
-
speaker_embedding = create_speaker_embedding(sample['audio']['array'])
|
36 |
-
|
37 |
def text_to_speech(text):
|
38 |
# Clean up text
|
39 |
replacements = [
|
|
|
2 |
import torch
|
3 |
import os
|
4 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
5 |
+
from datasets import load_dataset, Audio
|
6 |
import numpy as np
|
7 |
from speechbrain.inference import EncoderClassifier
|
8 |
|
9 |
# Load models and processor
|
10 |
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
11 |
+
model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_FINE_TUNED_MODEL_PATH")
|
12 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
13 |
|
14 |
# Load speaker encoder
|
|
|
20 |
)
|
21 |
|
22 |
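# Load a sample from the dataset for speaker embedding.
# NOTE(review): create_speaker_embedding() is only defined further down (new line 33), after
# this try-block has already run at import time. The call inside the try therefore appears to
# raise NameError, which the broad `except Exception` swallows, so the random fallback is
# always used. This block should run after the function definition.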
|
23 |
+
try:
|
24 |
+
dataset = load_dataset("Yassmen/TTS_English_Technical_data", split="train")
|
25 |
+
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
|
26 |
+
sample = dataset[0]
|
27 |
+
speaker_embedding = create_speaker_embedding(sample['audio']['array'])
|
28 |
+
except Exception as e:
|
29 |
+
print(f"Error loading dataset: {e}")
|
30 |
+
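# Use a random speaker embedding as fallback.
# NOTE(review): the fallback is a torch tensor of shape (1, 512), whereas
# create_speaker_embedding() returns a squeezed NumPy array - confirm that text_to_speech
# accepts both forms.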
|
31 |
+
speaker_embedding = torch.randn(1, 512)
|
32 |
|
33 |
def create_speaker_embedding(waveform):
|
34 |
with torch.no_grad():
|
|
|
37 |
speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
|
38 |
return speaker_embeddings
|
39 |
|
|
|
|
|
|
|
40 |
def text_to_speech(text):
|
41 |
# Clean up text
|
42 |
replacements = [
|