Spaces:

saidivyesh
/

tts

Running

App Files Files Community

saidivyesh committed on Oct 19, 2024

Commit

0ee4416

verified ·

1 Parent(s): 8389aed

Create app.py

Browse files

Files changed (1) hide show

app.py +122 -0

app.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import gradio as gr
+import torch
+import soundfile as sf
+import spaces
+import os
+import numpy as np
+import re
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from speechbrain.pretrained import EncoderClassifier
+from datasets import load_dataset
+device = "cuda" if torch.cuda.is_available() else "cpu"  # Use the GPU when CUDA is available, otherwise fall back to the CPU.
+def load_models_and_data(language="en"):
+    model_name = "microsoft/speecht5_tts"
+    processor = SpeechT5Processor.from_pretrained(model_name)
+    # Replace with English technical TTS model or regional language-specific model
+    if language == "en":
+        model = SpeechT5ForTextToSpeech.from_pretrained("my_finetuned_english_tech_tts").to(device)
+    else:
+        model = SpeechT5ForTextToSpeech.from_pretrained("my_finetuned_regional_language_tts").to(device)
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+    spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+    speaker_model = EncoderClassifier.from_hparams(
+        source=spk_model_name,
+        run_opts={"device": device},
+        savedir=os.path.join("/tmp", spk_model_name),
+    )
+    # Load a sample from a dataset for default embedding
+    if language == "en":
+        dataset = load_dataset("lj_speech", split="train")
+    else:
+        dataset = load_dataset("regional_language_dataset", split="train")
+    example = dataset[0]
+    return model, processor, vocoder, speaker_model, example
+# Load all models and the reference sample once at import time. The language is fixed to English here; pass another value to load the regional-language checkpoint and dataset instead.
+model, processor, vocoder, speaker_model, default_example = load_models_and_data(language="en")
+def create_speaker_embedding(waveform):
+    with torch.no_grad():
+        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
+        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+        speaker_embeddings = speaker_embeddings.squeeze()
+    return speaker_embeddings
+def prepare_default_embedding(example):
+    audio = example["audio"]
+    return create_speaker_embedding(audio["array"])
+default_embedding = prepare_default_embedding(default_example)  # Computed once at import; text_to_speech reuses it for every request.
+# Spoken-form expansions that normalize_text applies to English technical text
+technical_replacements = [
+    # Each entry is (abbreviation, form spelled out letter by letter)
+    ("HTTP", "H T T P"),
+    ("AI", "A I"),
+    # Extend the list with further technical abbreviations as needed
+]
+def normalize_text(text, language="en"):
+    text = text.lower()
+    # Handle language-specific normalization
+    if language == "en":
+        # Replace technical terms or symbols
+        for old, new in technical_replacements:
+            text = text.replace(old, new)
+    # For regional language, include character replacements like the Turkish example
+    if language != "en":
+        replacements = [
+            # Character mappings for regional languages (like the Turkish example)
+            # Add region/language-specific character normalization here
+        ]
+        for old, new in replacements:
+            text = text.replace(old, new)
+    # Remove punctuation or handle them contextually for technical speech
+    text = re.sub(r'[^\w\s]', '', text)
+    return text
+@spaces.GPU(duration=60)
+def text_to_speech(text, audio_file=None, language="en"):
+    # Normalize the input text
+    normalized_text = normalize_text(text, language=language)
+    # Prepare the input for the model
+    inputs = processor(text=normalized_text, return_tensors="pt").to(device)
+    # Use the default speaker embedding
+    speaker_embeddings = default_embedding
+    # Generate speech
+    with torch.no_grad():
+        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings.unsqueeze(0), vocoder=vocoder)
+    speech_np = speech.cpu().numpy()
+    return (16000, speech_np)
+iface = gr.Interface(
+    fn=text_to_speech,
+    inputs=[
+        gr.Textbox(label="Enter text to convert to speech"),
+        gr.Dropdown(label="Language", choices=["English Technical", "Regional"], value="English Technical")
+    ],
+    outputs=[
+        gr.Audio(label="Generated Speech", type="numpy")
+    ],
+    title="Fine-Tuned TTS for Technical English and Regional Languages",
+    description="Enter text, choose the language, and listen to the generated speech."
+)
+iface.launch(share=True)