Spaces:

Vinay15
/

Text-to-Speech_Model_for_English_Technical_Speech

Sleeping

App Files Files Community

Text-to-Speech_Model_for_English_Technical_Speech / app.py

Vinay15

Update app.py

dedcf07 verified 9 months ago

raw

history blame

2.1 kB

	import gradio as gr
	import json
	import torch
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	from datasets import load_dataset
	import soundfile as sf
	import io

	# Load the SpeechT5 processor, text-to-speech model and HiFi-GAN vocoder once,
	# at module level, so that every call to text_to_speech() reuses the same objects.
	# The pronunciation dictionary is loaded separately, right after these models.
	# NOTE(review): from_pretrained presumably downloads the checkpoints on first
	# run and reads them from the local cache afterwards - confirm for this deployment.
	processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
	model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

	# Mapping of technical term -> phonetic spelling, consumed by preprocess_text().
	# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
	# relying on the platform default (which is not UTF-8 on every system).
	with open("pronunciation_dict.json", "r", encoding="utf-8") as f:
	    pronunciation_dict = json.load(f)

	# Preprocess the text
	def preprocess_text(text):
	    """Replace every known technical term in *text* with its phonetic spelling.

	    Terms and their replacements come from the module-level
	    ``pronunciation_dict`` (term -> phonetic spelling). Matching is
	    case-sensitive and purely textual, as before.

	    All terms are substituted in a single left-to-right pass, preferring the
	    longest term at each position. Compared with chaining ``str.replace``
	    calls this guarantees that:

	    * a short term cannot corrupt a longer one that contains it
	      (e.g. "SQL" inside "NoSQL" when both are in the dictionary);
	    * text inserted by one replacement is never rewritten by another;
	    * the result does not depend on the dictionary's key order.

	    Args:
	        text: The raw input sentence.

	    Returns:
	        The sentence with every dictionary term replaced. The input is
	        returned unchanged when the dictionary has no usable terms.
	    """
	    import re  # local import keeps this block self-contained

	    # Empty keys are skipped: they would match at every position.
	    terms = sorted(
	        (term for term in pronunciation_dict if term),
	        key=len,
	        reverse=True,  # longest first so the alternation prefers "NoSQL" over "SQL"
	    )
	    if not terms:
	        return text

	    pattern = re.compile("|".join(re.escape(term) for term in terms))
	    # A callable replacement avoids backslash/group-reference interpretation
	    # of the phonetic spelling.
	    return pattern.sub(lambda match: pronunciation_dict[match.group(0)], text)

	# Speaker x-vector used for every request; populated on first use.
	_speaker_embeddings = None


	def _get_speaker_embeddings():
	    """Return the fixed speaker embedding, loading the x-vector dataset only once."""
	    global _speaker_embeddings
	    if _speaker_embeddings is None:
	        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	        # Entry 7306 is the single voice this app uses; unsqueeze adds the batch axis.
	        _speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
	    return _speaker_embeddings


	# Text-to-Speech function
	def text_to_speech(input_text):
	    """Synthesize speech for *input_text* and return it for a Gradio audio output.

	    The text is first passed through ``preprocess_text`` so that technical
	    terms are replaced by their phonetic spellings, then tokenized with the
	    module-level ``processor`` and rendered with ``model`` and ``vocoder``.

	    Args:
	        input_text: The sentence typed by the user.

	    Returns:
	        A ``(sample_rate, samples)`` tuple, where ``samples`` is the NumPy
	        array produced by the model, or ``None`` when the input is empty
	        or only whitespace. The tuple form is one of the value types the
	        Gradio audio component accepts; the previous ``io.BytesIO`` return
	        value is not.
	    """
	    sample_rate = 16000  # same rate the original WAV was written with

	    # Nothing to synthesize: let Gradio show an empty audio output.
	    if not input_text or not input_text.strip():
	        return None

	    processed_text = preprocess_text(input_text)
	    inputs = processor(text=processed_text, return_tensors="pt")

	    speech = model.generate_speech(
	        inputs["input_ids"],
	        _get_speaker_embeddings(),
	        vocoder=vocoder,
	    )

	    return sample_rate, speech.numpy()

	# Define examples
	# Sample sentences passed to gr.Interface(examples=...) so users can try the
	# app with one click. They mix acronyms/technical terms (API, CUDA, NoSQL, JSON)
	# with plain English sentences.
	examples = [
	"We are using API for authentication.",
	"CUDA and TensorFlow work together for deep learning models.",
	"The database uses NoSQL and supports JSON for data storage.",
	"Machine learning and artificial intelligence are advancing fast.",
	"Natural language processing techniques are widely adopted."
	]

	# Create Gradio interface: one text box in, one audio player out.
	iface = gr.Interface(
	    fn=text_to_speech,
	    inputs="text",
	    outputs="audio",
	    title="Fine-tuning TTS for English with a Focus on Technical Vocabulary Using SpeechT5",
	    description="Enter text with technical jargon for TTS conversion.",
	    examples=examples,
	    cache_examples=False,  # Disable caching for now
	)

	# Launch interface only when run as a script (e.g. `python app.py`), so that
	# importing this module does not start a web server.
	if __name__ == "__main__":
	    iface.launch()