Spaces:

Vinay15
/

Fine-tuning_TTS_for_a_Regional_Language

Runtime error

App Files Files Community

Fine-tuning_TTS_for_a_Regional_Language / app.py

Vinay15

Update app.py

72e2358 verified 9 months ago

raw

history blame

3.58 kB

	import gradio as gr
	import torch
	from datasets import load_dataset
	from transformers import SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech

	# Fine-tuned Italian SpeechT5 checkpoint; the text processor is read from the same repo
	model_id = "Vinay15/speecht5_finetuned_voxpopuli_it"
	model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
	processor = SpeechT5Processor.from_pretrained(model_id)

	# HiFi-GAN vocoder handed to generate_speech together with the model
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

	# Speaker x-vector: entry 7440 of the validation split, with a leading axis of size 1 added
	embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	_xvector = embeddings_dataset[7440]["xvector"]
	speaker_embeddings = torch.tensor(_xvector).unsqueeze(0)

	# Accent-folding table applied to the input text before tokenization.
	# Each accented vowel in the first string is paired, position by position,
	# with its unaccented counterpart in the second: (accented, plain).
	replacements = list(zip("àèéìòù", "aeeiou"))

	# Text-to-speech synthesis function
	def synthesize_speech(text):
	    """Convert Italian text into audio for the Gradio audio output.

	    Args:
	        text: The sentence to speak. ``None``, an empty string, or a
	            whitespace-only string produces an empty clip.

	    Returns:
	        A ``(sample_rate, audio_array)`` tuple where ``sample_rate`` is
	        16000 and ``audio_array`` is a NumPy array holding the waveform.
	    """
	    # Blank input: skip the model entirely and hand back an empty clip.
	    # int16 follows the official SpeechT5 demo's empty-input return;
	    # presumably safer for Gradio's audio conversion than an empty float
	    # array -- TODO confirm against the installed Gradio version.
	    if not text or not text.strip():
	        return (16000, torch.zeros(0, dtype=torch.int16).numpy())

	    # Fold accented vowels to their plain form using the module-level table
	    for src, dst in replacements:
	        text = text.replace(src, dst)

	    # Tokenize the cleaned text into model input ids
	    inputs = processor(text=text, return_tensors="pt")

	    # Clip over-long inputs to the number of text positions the model
	    # config declares, so a long paragraph is shortened instead of
	    # exceeding the encoder's supported sequence length.
	    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

	    # Generate the waveform with the fixed speaker embedding and the vocoder
	    speech = model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)

	    # Return the generated speech as (sample_rate, audio_array)
	    return (16000, speech.cpu().numpy())

	# Heading and Markdown blurb shown at the top of the Gradio page.
	# The blurb is a plain triple-quoted string: it has no placeholders, so no f-prefix is needed.
	title = "Fine-tuning TTS for Italian as a Regional Language Using SpeechT5"
	description = """
	This Space generates speech in Italian, a regional language, using a fine-tuned SpeechT5 model from Hugging Face.
	Italian is considered a regional language because it is primarily spoken within Italy and a few Italian-speaking regions in
	other countries, such as Switzerland, San Marino, Vatican City, and areas in Croatia and Slovenia.
	With about 85 million speakers worldwide, Italian's regional usage contrasts with the global reach of languages like English or Spanish.

	Fine-Tuned Model Preparation: This model has been fine-tuned using the VoxPopuli Italian dataset to optimize SpeechT5 for
	Italian pronunciation, intonation, and fluency. The fine-tuning process involved preprocessing the text data to ensure accurate
	Italian accents and phonetics, resulting in high-quality Italian speech synthesis.

	The fine-tuned model is available [here](https://huggingface.co/Vinay15/speecht5_finetuned_voxpopuli_it).

	Note: Processing time may vary based on sentence length. Longer sentences may take more time to process and generate audio.

	For more details, visit the [GitHub repository](https://github.com/Vinay152003/Fine-tuning-TTS-for-a-Italian-it-Language) and review the project [report](https://drive.google.com/file/d/1cvNPkuFlTZAu1iDaagCwVRGXFd6r6vqi/view?usp=sharing).
	"""

	# Sample Italian sentences offered as one-click examples in the UI
	example_sentences = [
	    "Questa è una dimostrazione di sintesi vocale in italiano.",
	    "Benvenuti alla nostra piattaforma di sintesi vocale!",
	    "Il modello è stato addestrato per parlare l'italiano in modo naturale e fluido.",
	    "Oggi il tempo è bello e il sole splende.",
	    "La città di Roma è una delle destinazioni turistiche più popolari al mondo.",
	]

	# Wire the synthesis function to a text box input and an audio output.
	# Each example is wrapped in its own single-item list, one entry per input component.
	interface = gr.Interface(
	    fn=synthesize_speech,
	    inputs=gr.Textbox(label="Input Text", placeholder="Enter Italian text here..."),
	    outputs=gr.Audio(label="Generated Speech"),
	    title=title,
	    description=description,
	    examples=[[sentence] for sentence in example_sentences],
	)

	# Start serving the app
	interface.launch()