Spaces:

Babyloncoder
/

Text-to-speech-with-pitch-controls

Running

Create app.py

c4ff521 verified over 1 year ago

1.64 kB

	import gradio as gr
	from transformers import VitsModel, AutoTokenizer
	import torch
	import scipy.io.wavfile
	import numpy as np
	import librosa
	import soundfile as sf
	import tempfile

	# Checkpoint shared by the synthesis model and its tokenizer; keeping it in
	# one place guarantees both are loaded from the same repository.
	_CHECKPOINT = "facebook/mms-tts-eng"

	# Loaded once at import time so every request reuses the same instances.
	model = VitsModel.from_pretrained(_CHECKPOINT)
	tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)


	def pitch_shift_np(audio_np, sampling_rate, pitch_shift):
	    """Return ``audio_np`` pitch-shifted via ``librosa.effects.pitch_shift``.

	    Args:
	        audio_np: Waveform samples handed to librosa unchanged.
	        sampling_rate: Sampling rate of ``audio_np``; forwarded as ``sr``.
	        pitch_shift: Shift amount; forwarded as ``n_steps`` (the UI slider
	            feeding this value labels it in semitones).

	    Returns:
	        Whatever ``librosa.effects.pitch_shift`` returns for these arguments.
	    """
	    # librosa takes the rate and step count as keyword arguments, so they are
	    # passed by name rather than positionally.
	    shift_options = {"sr": sampling_rate, "n_steps": pitch_shift}
	    return librosa.effects.pitch_shift(audio_np, **shift_options)


	def synthesize_speech(text, pitch_shift):
	    """Synthesize ``text`` to speech, pitch-shift it, and save it as a WAV file.

	    Args:
	        text: Text to speak. Must contain at least one non-whitespace
	            character.
	        pitch_shift: Shift amount forwarded to ``pitch_shift_np`` (the UI
	            slider labels it in semitones).

	    Returns:
	        Path of a temporary ``.wav`` file containing the shifted audio. The
	        file is created with ``delete=False``, so it is not removed
	        automatically.

	    Raises:
	        gr.Error: If ``text`` is empty or contains only whitespace.
	    """
	    # Reject blank input up front so the user sees a readable message in the
	    # UI instead of the model being run on nothing.
	    if not text or not text.strip():
	        raise gr.Error("Please enter some text to synthesize.")

	    sampling_rate = model.config.sampling_rate

	    # Tokenize the input text
	    inputs = tokenizer(text, return_tensors="pt")

	    # Generate the waveform; gradients are not needed for inference.
	    with torch.no_grad():
	        output = model(**inputs).waveform.squeeze().numpy()

	    # Pitch shift
	    shifted_audio = pitch_shift_np(output, sampling_rate, pitch_shift)

	    # Reserve a unique path, then close our handle before soundfile opens the
	    # same path itself, so the file is never open twice at once.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as fp:
	        temp_file_path = fp.name
	    sf.write(temp_file_path, shifted_audio, sampling_rate)

	    return temp_file_path


	# Create the Gradio interface: a text box and a pitch slider feed
	# synthesize_speech, whose returned file path is played back as audio.
	interface = gr.Interface(
	    fn=synthesize_speech,
	    inputs=[
	        gr.components.Textbox(lines=2, placeholder="Type your text here..."),
	        # value=0 starts the slider at "no shift"; without an explicit value
	        # Gradio falls back to the minimum (-2), so a default run would be
	        # pitched down two semitones.
	        gr.components.Slider(
	            minimum=-2,
	            maximum=2,
	            step=0.1,
	            value=0,
	            label="Pitch Shift (Semitones)",
	        ),
	    ],
	    outputs=gr.components.Audio(type="filepath", label="Generated Speech"),
	    title="Text to Speech Synthesis",
	    description="Type text and convert it to speech using a TTS model. Use the slider to adjust the pitch.",
	)

	# Launch the application
	interface.launch()