"""Gradio demo that synthesizes Italian speech with a fine-tuned SpeechT5 model."""

import unicodedata

import gradio as gr
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# Load the fine-tuned Italian model and the HiFi-GAN vocoder.
model_id = "Vinay15/speecht5_finetuned_voxpopuli_it"
model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load the speaker-embedding dataset and pick one fixed x-vector; the
# unsqueeze adds the leading batch dimension passed to generate_speech.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)

# Processor (tokenizer + feature extractor) that ships with the Italian model.
processor = SpeechT5Processor.from_pretrained(model_id)

# Accented Italian characters and their plain replacements.
# Every source entry must be a single character (required by str.maketrans).
replacements = [
    ("à", "a"),
    ("è", "e"),
    ("é", "e"),
    ("ì", "i"),
    ("ò", "o"),
    ("ù", "u"),
]

# One-pass translation table built from `replacements`. Upper-case variants
# are derived automatically so letters such as "È" get the same treatment.
_ACCENT_TABLE = str.maketrans(
    {
        **{src.upper(): dst.upper() for src, dst in replacements},
        **dict(replacements),
    }
)

# Sample rate reported to Gradio alongside the generated waveform.
SAMPLE_RATE = 16000


def synthesize_speech(text):
    """Convert Italian text to speech.

    Args:
        text: The text to synthesize.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is the NumPy
        array produced by the model and vocoder.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some Italian text to synthesize.")

    # Normalize to NFC first: an accent typed as "base letter + combining
    # mark" becomes a single code point that the translation table can match.
    text = unicodedata.normalize("NFC", text).translate(_ACCENT_TABLE)

    inputs = processor(text=text, return_tensors="pt")

    # Clip over-long inputs to the model's configured text position limit
    # instead of letting generation fail on them.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    speech = model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)

    return (SAMPLE_RATE, speech.cpu().numpy())


# Title and description for the Gradio interface.
title = "Fine-tuning TTS for Italian as a Regional Language Using SpeechT5"
description = """
This Space generates speech in Italian, a regional language, using a fine-tuned SpeechT5 model from Hugging Face. Italian is considered a regional language because it is primarily spoken within Italy and a few Italian-speaking regions in other countries, such as Switzerland, San Marino, Vatican City, and areas in Croatia and Slovenia. With about 85 million speakers worldwide, Italian's regional usage contrasts with the global reach of languages like English or Spanish.

**Fine-Tuned Model Preparation:** This model has been fine-tuned using the VoxPopuli Italian dataset to optimize SpeechT5 for Italian pronunciation, intonation, and fluency. The fine-tuning process involved preprocessing the text data to ensure accurate Italian accents and phonetics, resulting in high-quality Italian speech synthesis. The fine-tuned model is available [here](https://huggingface.co/Vinay15/speecht5_finetuned_voxpopuli_it). **Note:** Processing time may vary based on sentence length. Longer sentences may take more time to process and generate audio. For more details, visit the [GitHub repository](https://github.com/Vinay152003/Fine-tuning-TTS-for-a-Italian-it-Language) and review the project [report](https://drive.google.com/file/d/1cvNPkuFlTZAu1iDaagCwVRGXFd6r6vqi/view?usp=sharing).
"""

# Create the Gradio interface with several example sentences.
interface = gr.Interface(
    fn=synthesize_speech,
    inputs=gr.Textbox(label="Input Text", placeholder="Enter Italian text here..."),
    outputs=gr.Audio(label="Generated Speech"),
    title=title,
    description=description,
    examples=[
        ["Questa è una dimostrazione di sintesi vocale in italiano."],
        ["Benvenuti alla nostra piattaforma di sintesi vocale!"],
        ["Il modello è stato addestrato per parlare l'italiano in modo naturale e fluido."],
        ["Oggi il tempo è bello e il sole splende."],
        ["La città di Roma è una delle destinazioni turistiche più popolari al mondo."],
    ],
)

# Launch the interface.
interface.launch()