|
import gradio as gr |
|
import torch |
|
import os |
|
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan |
|
from datasets import load_dataset, Audio |
|
import numpy as np |
|
from speechbrain.inference import EncoderClassifier |
|
|
|
|
|
# Checkpoints for the three SpeechT5 components used by this app.
_PROCESSOR_CHECKPOINT = "microsoft/speecht5_tts"
# Placeholder: must be replaced with the location of the fine-tuned model.
_TTS_MODEL_CHECKPOINT = "YOUR_FINE_TUNED_MODEL_PATH"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

# Text front-end: turns the input string into the ``input_ids`` fed to the model.
processor = SpeechT5Processor.from_pretrained(_PROCESSOR_CHECKPOINT)

# Text-to-speech model whose ``generate_speech`` is called for every request.
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_MODEL_CHECKPOINT)

# Vocoder handed to ``generate_speech`` so it returns a waveform.
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
|
|
|
|
|
# Run the speaker encoder on the GPU when torch can see one, otherwise on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The same identifier names the model source and the cache sub-directory.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Speaker encoder used to derive the reference voice embedding; its files are
# stored under /tmp.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)
|
|
|
|
|
def create_speaker_embedding(waveform):
    """Compute a speaker embedding for one waveform using ``speaker_model``.

    Args:
        waveform: Audio samples in any form accepted by ``torch.tensor``
            (the module-level code below passes the decoded ``audio["array"]``
            of a dataset row).

    Returns:
        numpy.ndarray: The embedding, L2-normalised along dimension 2,
        with singleton dimensions squeezed away and moved to CPU.
    """
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


# Reference voice shared by every synthesis request.
#
# This must run *after* ``create_speaker_embedding`` is defined: calling it
# earlier raises NameError, which the broad ``except`` below would swallow,
# silently leaving the app on the random fallback voice.
try:
    dataset = load_dataset("Yassmen/TTS_English_Technical_data", split="train")
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    sample = dataset[0]
    reference_embedding = create_speaker_embedding(sample["audio"]["array"])
    # The helper returns a squeezed NumPy vector; convert it to a batched
    # torch tensor so it has the same type and layout as the fallback below.
    speaker_embedding = torch.tensor(reference_embedding).unsqueeze(0)
except Exception as e:
    # Deliberate best-effort: keep the demo usable without the dataset.
    print(f"Error loading dataset: {e}")
    speaker_embedding = torch.randn(1, 512)
|
|
|
# Single-character substitutions applied to the text before tokenisation.
# Spoken words are padded with spaces so that neighbouring symbols remain
# separate words ("42" -> "four two" rather than "fourtwo"); the surplus
# whitespace is collapsed again in ``_normalize_text``.
_SPOKEN_CHARACTERS = str.maketrans({
    "$": " dollar ",
    "%": " percent ",
    "&": " and ",
    "*": " asterisk ",
    "+": " plus ",
    "@": " at ",
    "0": " zero ",
    "1": " one ",
    "2": " two ",
    "3": " three ",
    "4": " four ",
    "5": " five ",
    "6": " six ",
    "7": " seven ",
    "8": " eight ",
    "9": " nine ",
    "\n": " ",
    "\xa0": " ",
    ",": " ",
    # Typographic double quotes are folded to the ASCII quote.
    # NOTE(review): the previous table held two identity '"' -> '"' pairs,
    # presumably curly quotes lost in an encoding round-trip -- confirm.
    "\u201c": '"',
    "\u201d": '"',
})


def _normalize_text(text):
    """Spell out digits and symbols in *text* and collapse runs of whitespace."""
    return " ".join(text.translate(_SPOKEN_CHARACTERS).split())


def text_to_speech(text):
    """Synthesise speech for *text* with the module-level model and voice.

    The text is normalised first (digits and a few symbols are spelled out,
    newlines, non-breaking spaces and commas become spaces), then tokenised
    with ``processor`` and rendered by ``model.generate_speech`` using the
    global ``speaker_embedding`` and ``vocoder``.

    Args:
        text: The string to read aloud.

    Returns:
        tuple: ``(16000, samples)`` where ``samples`` is the generated
        waveform as a NumPy array.
    """
    inputs = processor(text=_normalize_text(text), return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embedding, vocoder=vocoder
    )
    return (16000, speech.numpy())
|
|
|
# Texts shown at the top of the web page.
app_title = "Technical Text-to-Speech"
app_description = "Enter technical text to convert to speech. The model has been fine-tuned on technical data."

# One text box in, one audio player out, wired to ``text_to_speech``.
iface = gr.Interface(
    text_to_speech,
    "text",
    "audio",
    title=app_title,
    description=app_description,
)

# Start serving the interface as soon as the script is executed.
iface.launch()