# Hugging Face Spaces page header captured with this file -- status: Runtime error
import gradio as gr | |
import torch | |
from datasets import load_dataset | |
from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech | |
# Checkpoint id of the fine-tuned Italian SpeechT5 model. The same id is reused
# further down to load the matching processor.
model_id = "Vinay15/speecht5_finetuned_voxpopuli_it"
model = SpeechT5ForTextToSpeech.from_pretrained(model_id)

# The vocoder is loaded from the stock Microsoft HiFi-GAN checkpoint, not from
# the fine-tuned repository.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker identity: one fixed x-vector (row 7440 of the validation split) is
# used for every request. unsqueeze(0) adds a leading dimension of size 1.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)

# Processor (text front end) belonging to the fine-tuned checkpoint.
processor = SpeechT5Processor.from_pretrained(model_id)
# Accent folding table: (accented character, plain ASCII replacement) pairs.
# Each pair is applied to the input text via str.replace before tokenization.
# Only lowercase forms are listed here.
replacements = [
    ("à", "a"),
    ("è", "e"),
    ("é", "e"),
    ("ì", "i"),
    ("ò", "o"),
    ("ù", "u"),
]
# Text-to-speech synthesis function
def synthesize_speech(text):
    """Synthesize speech for ``text`` with the module-level model and vocoder.

    Args:
        text: Input string from the UI textbox.

    Returns:
        A ``(sample_rate, audio)`` tuple: the fixed rate ``16000`` and the
        generated waveform converted to a NumPy array on the CPU.
    """
    # Fold accented vowels to plain letters. The table lists lowercase forms
    # only, so the uppercase counterpart of every pair is folded as well
    # (e.g. a sentence-initial "È" becomes "E" instead of passing through).
    for src, dst in replacements:
        text = text.replace(src, dst).replace(src.upper(), dst.upper())

    # Tokenize the cleaned text into model input ids.
    inputs = processor(text=text, return_tensors="pt")

    # Clip to the model's configured maximum text length so that over-long
    # input is truncated rather than handed to the model unchecked.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    # Passing the vocoder makes generate_speech return a waveform directly.
    speech = model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)

    # 16000 is hard-coded; presumably the vocoder's output rate -- TODO confirm.
    return (16000, speech.cpu().numpy())
# Title and description shown at the top of the Gradio interface.
title = "Fine-tuning TTS for Italian as a Regional Language Using SpeechT5"

# Markdown body for the interface. A plain (non-f) string is used because the
# text contains no placeholders.
description = """
This Space generates speech in Italian, a regional language, using a fine-tuned SpeechT5 model from Hugging Face.
Italian is considered a regional language because it is primarily spoken within Italy and a few Italian-speaking regions in
other countries, such as Switzerland, San Marino, Vatican City, and areas in Croatia and Slovenia.
With about 85 million speakers worldwide, Italian's regional usage contrasts with the global reach of languages like English or Spanish.
**Fine-Tuned Model Preparation:** This model has been fine-tuned using the VoxPopuli Italian dataset to optimize SpeechT5 for
Italian pronunciation, intonation, and fluency. The fine-tuning process involved preprocessing the text data to ensure accurate
Italian accents and phonetics, resulting in high-quality Italian speech synthesis.
The fine-tuned model is available [here](https://huggingface.co/Vinay15/speecht5_finetuned_voxpopuli_it).
**Note:** Processing time may vary based on sentence length. Longer sentences may take more time to process and generate audio.
For more details, visit the [GitHub repository](https://github.com/Vinay152003/Fine-tuning-TTS-for-a-Italian-it-Language) and review the project [report](https://drive.google.com/file/d/1cvNPkuFlTZAu1iDaagCwVRGXFd6r6vqi/view?usp=sharing).
"""
# Build the Gradio UI: one text box in, one audio player out, wired to
# synthesize_speech. Each example is a one-element list because the interface
# has a single input component.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(label="Input Text", placeholder="Enter Italian text here..."),
    outputs=gr.Audio(label="Generated Speech"),
    title=title,
    description=description,
    examples=[
        ["Questa è una dimostrazione di sintesi vocale in italiano."],
        ["Benvenuti alla nostra piattaforma di sintesi vocale!"],
        ["Il modello è stato addestrato per parlare l'italiano in modo naturale e fluido."],
        ["Oggi il tempo è bello e il sole splende."],
        ["La città di Roma è una delle destinazioni turistiche più popolari al mondo."],
    ],
)

# Launch the interface. This is a module-level call, so it runs whenever the
# file is executed.
interface.launch()